In [None]:
import requests
from bs4 import BeautifulSoup
import uuid
import time
import pandas as pd
from urllib.parse import urljoin

def scrape_delfi_lt(categories, max_articles=5000, max_per_category=2000,
                    max_pages=100, request_timeout=15):
    """Scrape articles from the delfi.lt ``/en/<category>`` listing pages.

    For every category the listing pages are walked (``?page=N`` for N > 1),
    every anchor whose href contains ``/en/<category>/`` is treated as an
    article link, and each not-yet-seen link is fetched and parsed.

    Args:
        categories: Iterable of category slugs (e.g. ``"politics"``).
        max_articles: Global cap on the number of collected articles.
        max_per_category: Cap on collected articles per category.
        max_pages: Maximum number of listing pages requested per category.
        request_timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``id`` (random UUID4 string), ``title``,
        ``content`` (newline-joined paragraph texts), ``url`` and ``category``.
        Pages or articles that fail to download or parse are logged and skipped.
    """
    print("Starting web scraping...")
    articles = []
    base_url = "https://www.delfi.lt"
    headers = {'User-Agent': 'Mozilla/5.0'}
    seen_urls = set()

    for category in categories:
        page = 1
        category_count = 0
        print(f"\nStarting category: {category}")

        while (category_count < max_per_category
               and len(articles) < max_articles
               and page <= max_pages):
            url = f"{base_url}/en/{category}"
            if page > 1:
                url += f"?page={page}"
            print(f"Scraping page {page}: {url}")

            try:
                # A timeout keeps one stalled connection from hanging the whole run;
                # raise_for_status stops us from parsing an HTTP error page.
                response = requests.get(url, headers=headers, timeout=request_timeout)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
            except Exception as e:
                print(f"Failed to fetch {url}: {str(e)}")
                break

            # Loosen article filtering to just matching category.
            # dict.fromkeys drops repeated links while keeping page order.
            article_links = list(dict.fromkeys(
                urljoin(base_url, link['href'])
                for link in soup.find_all('a', href=True)
                if f'/en/{category}/' in link['href']
            ))

            print(f"Found {len(article_links)} links on page {page}")

            if not article_links:
                print(f"No more article links on page {page}. Moving to next category.")
                break

            # A page made only of known URLs means pagination has stopped yielding
            # new material; stop instead of requesting the remaining pages.
            new_links = [link for link in article_links if link not in seen_urls]
            if not new_links:
                print(f"No new article links on page {page}. Moving to next category.")
                break

            for article_url in new_links:
                if category_count >= max_per_category or len(articles) >= max_articles:
                    break
                seen_urls.add(article_url)

                try:
                    article_response = requests.get(
                        article_url, headers=headers, timeout=request_timeout
                    )
                    article_response.raise_for_status()
                    article_soup = BeautifulSoup(article_response.content, 'html.parser')

                    title_tag = article_soup.find('h1') or article_soup.find('h2')

                    # Try multiple containers for article content
                    content_container = (
                        article_soup.find('div', class_='delfi-article-body') or
                        article_soup.find('div', class_='article__body') or
                        article_soup.find('article') or
                        article_soup.find('div', {'itemprop': 'articleBody'})
                    )

                    if not (title_tag and content_container):
                        continue

                    paragraph_texts = (
                        p.get_text(strip=True) for p in content_container.find_all('p')
                    )
                    content = '\n'.join(text for text in paragraph_texts if text)
                    if not content.strip():
                        continue

                    article_data = {
                        'id': str(uuid.uuid4()),
                        'title': title_tag.get_text(strip=True),
                        'content': content,
                        'url': article_url,
                        'category': category
                    }
                    articles.append(article_data)
                    category_count += 1
                    print(f" Collected: {article_data['title']}")

                except Exception as e:
                    # Best effort: one broken article must not abort the crawl.
                    print(f" Failed to scrape article {article_url}: {str(e)}")
                    continue

            page += 1
            time.sleep(1)  # be polite between listing-page requests

    print(f"\n Scraping complete. Total articles collected: {len(articles)}")
    return articles



In [None]:
# Crawl the five English-language sections and report how many articles came back.
scraped_articles = scrape_delfi_lt(
    [
        "politics",
        "business",
        "sports",
        "culture",
        "lifestyle",
    ]
)
print(f"Collected: {len(scraped_articles)} articles total")

# Persist the result as CSV; an empty crawl writes nothing.
if not scraped_articles:
    print("No articles found ")
else:
    df = pd.DataFrame(scraped_articles)
    df.to_csv("delfi_articles.csv", index=False)
    print("Saved to delfi_articles.csv ")

In [None]:
# Archive the scraped CSV (a single file, not a directory) as /content/delfi_articles.zip.
!zip -r /content/delfi_articles.zip /content/delfi_articles.csv

In [None]:
# Extract delfi_articles.zip quietly into a directory that is itself named "delfi_articles.csv".
# NOTE(review): if a regular file called delfi_articles.csv already exists in the working
# directory, this target name presumably collides with it and extraction fails — confirm.
!unzip -q delfi_articles.zip -d delfi_articles.csv


In [None]:
import pandas as pd

# Load the scraped articles into `df`.
# The nested path only exists when the archive was unzipped into a directory named
# "delfi_articles.csv". When the CSV written by the scraper is still present as a
# regular file, that path cannot exist, so fall back to reading the CSV directly.
try:
    df = pd.read_csv("/content/delfi_articles.csv/content/delfi_articles.csv")
except (FileNotFoundError, NotADirectoryError):
    df = pd.read_csv("/content/delfi_articles.csv")


In [None]:
!pip install -U sentence-transformers
!pip install -U transformers
!pip install -U faiss-cpu

In [None]:
from sentence_transformers import SentenceTransformer


# Sentence encoder used both for the corpus here and for queries in later cells.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# After the CSV round-trip, NA-like cells come back as float NaN, which the encoder
# cannot tokenize. Replace them with empty strings (rather than dropping rows) so the
# position of every embedding still matches the row position in `df`.
corpus = df["content"].fillna("").astype(str).tolist()
embeddings = model.encode(corpus, show_progress_bar=True)


In [None]:
import faiss
import numpy as np


# FAISS expects a C-contiguous float32 matrix; enforce that explicitly instead of
# relying on the encoder's default output dtype.
vectors = np.ascontiguousarray(embeddings, dtype="float32")
embedding_dim = vectors.shape[1]

# Exact (brute-force) L2 index; row i of the index corresponds to row i of `df`.
index = faiss.IndexFlatL2(embedding_dim)
index.add(vectors)

faiss.write_index(index, "delfi_articles.index")

print(" FAISS index created and saved as 'delfi_articles.index'")


In [None]:
# Archive the saved FAISS index file (a single file, not a directory) as /content/delfi_articles_index.zip.
!zip -r /content/delfi_articles_index.zip /content/delfi_articles.index

In [None]:
# Extract delfi_articles_index.zip quietly into a directory named "delfi_articles_index.index".
!unzip -q delfi_articles_index.zip -d delfi_articles_index.index


In [None]:
# Reload the FAISS index from the extracted archive location; this rebinds the global `index`
# that search_articles and retrieve_context query.
index = faiss.read_index("/content/delfi_articles_index.index/content/delfi_articles.index")
def search_articles(query, top_k=5):
    """Print the titles and URLs of the articles closest to ``query``.

    The query is embedded with the global ``model``, looked up in the global
    FAISS ``index``, and the matching rows of the global ``df`` are printed.

    Args:
        query: Free-text search string.
        top_k: Number of neighbours requested from the index.

    Returns:
        None. Results are written to stdout.
    """
    query_embedding = np.asarray(model.encode([query]), dtype="float32")
    _, indices = index.search(query_embedding, top_k)

    print(f"\nTop {top_k} results for query: '{query}'")
    # FAISS pads the result with -1 when fewer than top_k vectors are available;
    # df.iloc[-1] would silently print the last article for each padded slot.
    hits = [int(idx) for idx in indices[0] if idx >= 0]
    for rank, idx in enumerate(hits, start=1):
        row = df.iloc[idx]
        print(f"{rank}. {row['title']}")
        print(f"   {row['url']}\n")

# Example search: prints the five (default top_k) nearest articles for a defence-related query.
search_articles("Lithuania defense NATO")


In [None]:
def retrieve_context(query, top_k=5):
    """Return the text of the articles closest to ``query`` as one string.

    The query is embedded with the global ``model`` and looked up in the global
    FAISS ``index``; the ``content`` of the matching rows of the global ``df``
    is joined with blank lines.

    Args:
        query: Free-text question or search string.
        top_k: Number of neighbours requested from the index.

    Returns:
        The retrieved article bodies separated by ``"\\n\\n"``; an empty string
        when nothing usable was retrieved.
    """
    query_embedding = np.asarray(model.encode([query]), dtype="float32")
    _, indices = index.search(query_embedding, top_k)

    passages = []
    for idx in indices[0]:
        # FAISS pads with -1 when it has fewer than top_k vectors; iloc[-1]
        # would wrongly pull in the last article for each padded slot.
        if idx < 0:
            continue
        text = df.iloc[int(idx)]["content"]
        # Skip missing content (NaN after a CSV round-trip) instead of crashing join().
        if isinstance(text, str) and text:
            passages.append(text)

    return "\n\n".join(passages)


In [None]:
from transformers import pipeline


# Text-to-text generation pipeline loaded with the google/flan-t5-base checkpoint;
# generate_answer calls this global to produce answers.
generator = pipeline("text2text-generation", model="google/flan-t5-base")

def generate_answer(query, context, max_input_tokens=512):
    """Generate an answer to ``query`` grounded in ``context``.

    The prompt has the form ``Context: ...\\n\\nQuestion: ...\\nAnswer:``. Because
    the pipeline truncates over-long inputs from the end, an oversized context
    would cut off the question itself. The context is therefore trimmed to a
    token budget first so the question and the answer cue always survive.

    Args:
        query: The user question.
        context: Retrieved passages to ground the answer in.
        max_input_tokens: Input length the prompt must fit into.

    Returns:
        The generated text of the first pipeline result.
    """
    tokenizer = generator.tokenizer

    # Tokens consumed by everything except the context (includes special tokens).
    scaffold_len = len(tokenizer(f"Context: \n\nQuestion: {query}\nAnswer:")["input_ids"])
    # Small slack because decoding and re-encoding can shift token boundaries slightly.
    safety_margin = 8
    budget = max_input_tokens - scaffold_len - safety_margin

    if budget <= 0:
        # The question alone fills the input; drop the context rather than the question.
        context = ""
    else:
        # Ask for one token more than the budget: this reveals whether trimming is
        # needed without tokenizing an arbitrarily long context in full.
        context_ids = tokenizer(
            context,
            add_special_tokens=False,
            truncation=True,
            max_length=budget + 1,
        )["input_ids"]
        if len(context_ids) > budget:
            context = tokenizer.decode(context_ids[:budget], skip_special_tokens=True)

    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    result = generator(
        prompt,
        max_length=512,
        min_length=100,
        do_sample=True,
        temperature=0.8,
        truncation=True  # backstop only; the prompt should already fit
    )
    return result[0]['generated_text']


In [None]:
# Defence question: pull the three nearest articles as context, then generate and show the answer.
query = "What is Lithuania's defense position in NATO?"
context = retrieve_context(query=query, top_k=3)
answer = generate_answer(query=query, context=context)

print("Generated Answer:\n", answer, sep="\n")


In [None]:
# Business question: pull the three nearest articles as context, then generate and show the answer.
query = "What is latest news in Business?"
context = retrieve_context(query=query, top_k=3)
answer = generate_answer(query=query, context=context)

print("Generated Answer:\n", answer, sep="\n")
