Belkacem Sadi, LSP number: 2431549

In [3]:
import requests
from bs4 import BeautifulSoup
import uuid
import time
import pandas as pd
from urllib.parse import urljoin

def scrape_delfi_lt(categories, max_articles=5000, max_per_category=2000,
                    max_pages=100, request_timeout=15, delay=1):
    """Scrape English-language articles from delfi.lt, category by category.

    For each category the listing pages ``/en/<category>`` and
    ``/en/<category>?page=N`` are fetched in turn; every link whose href
    contains ``/en/<category>/`` is treated as an article candidate and is
    downloaded once (URLs are de-duplicated across the whole run).

    Args:
        categories: Iterable of category slugs, e.g. ``["politics", "business"]``.
        max_articles: Overall cap on collected articles.
        max_per_category: Cap on collected articles per category.
        max_pages: Maximum number of listing pages visited per category.
        request_timeout: Seconds to wait for each HTTP response before the
            request is abandoned (without it a stalled connection would block
            the crawl indefinitely).
        delay: Seconds to sleep after each listing page.

    Returns:
        A list of dicts with the keys ``id`` (random UUID string), ``title``,
        ``content`` (paragraph texts joined by newlines), ``url`` and
        ``category``.

    Failures are reported with ``print`` and never raised: a listing page
    that cannot be fetched ends the current category, an article that cannot
    be fetched or parsed is skipped.
    """
    print("Starting web scraping...")
    articles = []
    base_url = "https://www.delfi.lt"
    headers = {'User-Agent': 'Mozilla/5.0'}
    seen_urls = set()

    for category in categories:
        page = 1
        category_count = 0
        print(f"\nStarting category: {category}")

        while category_count < max_per_category and len(articles) < max_articles and page <= max_pages:
            url = f"{base_url}/en/{category}{f'?page={page}' if page > 1 else ''}"
            print(f"Scraping page {page}: {url}")

            try:
                response = requests.get(url, headers=headers, timeout=request_timeout)
                # Treat 4xx/5xx as a failed fetch instead of parsing the error page.
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
            except Exception as e:
                print(f"Failed to fetch {url}: {str(e)}")
                break

            # Loose filter: any link that lives under this category's path.
            # dict.fromkeys drops repeated links on the page but keeps their order.
            category_path = f'/en/{category}/'
            article_links = list(dict.fromkeys(
                urljoin(base_url, link['href'])
                for link in soup.find_all('a', href=True)
                if category_path in link['href']
            ))

            print(f"Found {len(article_links)} links on page {page}")

            if not article_links:
                print(f"No more article links on page {page}. Moving to next category.")
                break

            for article_url in article_links:
                if category_count >= max_per_category or len(articles) >= max_articles:
                    break
                if article_url in seen_urls:
                    continue
                # Marked as seen before fetching so a failing URL is not retried.
                seen_urls.add(article_url)

                try:
                    article_response = requests.get(article_url, headers=headers, timeout=request_timeout)
                    article_response.raise_for_status()
                    article_soup = BeautifulSoup(article_response.content, 'html.parser')

                    title_tag = article_soup.find('h1') or article_soup.find('h2')

                    # Try several known containers for the article body, most specific first.
                    content_container = (
                        article_soup.find('div', class_='delfi-article-body') or
                        article_soup.find('div', class_='article__body') or
                        article_soup.find('article') or
                        article_soup.find('div', {'itemprop': 'articleBody'})
                    )

                    if title_tag and content_container:
                        paragraph_texts = (
                            p.get_text(strip=True) for p in content_container.find_all('p')
                        )
                        content = '\n'.join(text for text in paragraph_texts if text)

                        # Pages without any paragraph text are not counted as articles.
                        if content.strip():
                            article_data = {
                                'id': str(uuid.uuid4()),
                                'title': title_tag.get_text(strip=True),
                                'content': content,
                                'url': article_url,
                                'category': category
                            }
                            articles.append(article_data)
                            category_count += 1
                            print(f" Collected: {article_data['title']}")

                except Exception as e:
                    # Best effort: one broken article must not stop the crawl.
                    print(f" Failed to scrape article {article_url}: {str(e)}")
                    continue

            page += 1
            time.sleep(delay)

    print(f"\n Scraping complete. Total articles collected: {len(articles)}")
    return articles



In [4]:
# Crawl the five English-language sections and report how many articles came back
scraped_articles = scrape_delfi_lt(
    ["politics", "business", "sports", "culture", "lifestyle"]
)
print(f"Collected: {len(scraped_articles)} articles total")

# Persist the result as CSV, or say so when the crawl produced nothing
if not scraped_articles:
    print("No articles found ")
else:
    df = pd.DataFrame(scraped_articles)
    df.to_csv("delfi_articles.csv", index=False)
    print("Saved to delfi_articles.csv ")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Found 60 links on page 18
 Collected: President seems to have good impression about ministerial nominees – PM-designate
 Collected: Georgia belongs in Europe – Nausėda
 Collected: Conservative party sees 10% drop in poll ratings
 Collected: DHL plane wreckage removed from crash site
 Collected: Lithuania and Rheinmetall sign deals to begin construction of modern artillery ammo plant
 Collected: Seimas speaker supports Georgian people’s aspirations to join EU
 Collected: Paluckas will consult with business on cutting of red tape – media
 Collected: MP Maldeikis joins Homeland Union
 Collected: Chinese ship suspected of dragging anchor to cut cables in Baltic Sea – media
 Collected: Government to sign land lease, ammunition purchase deals with Rheinmetall on Friday
 Collected: All ministerial nominees officially presented to Nausėda
 Collected: Rally held in Vilnius to support Belarusian man extradited from Vietnam to Minsk

In [5]:
# Pack the scraped CSV (a single file, not a directory) into a zip archive.
# The archive stores it under its path relative to /, i.e. content/delfi_articles.csv.
!zip -r /content/delfi_articles.zip /content/delfi_articles.csv

  adding: content/delfi_articles.csv (deflated 64%)


In [2]:
# Extract the archive into a *directory* that is itself named "delfi_articles.csv".
# Because the archive stores the file as content/delfi_articles.csv, the CSV ends up at
# delfi_articles.csv/content/delfi_articles.csv, which is the path read by the loading cell.
# NOTE(review): presumably run in a fresh session after uploading the zip; if a regular file
# called delfi_articles.csv already exists in the working directory this will likely fail.
!unzip -q delfi_articles.zip -d delfi_articles.csv


In [5]:
import pandas as pd

# Read the scraped articles back in; the nested path comes from the archive
# having been extracted into a directory named "delfi_articles.csv"
csv_path = "/content/delfi_articles.csv/content/delfi_articles.csv"
df = pd.read_csv(csv_path)


In [6]:
# Install/upgrade the libraries used for embedding (sentence-transformers),
# generation (transformers) and vector search (faiss-cpu).
# NOTE(review): -U pulls whatever is latest at run time, so results may not be reproducible;
# consider pinning versions and using %pip so the install targets the kernel's environment.
!pip install -U sentence-transformers
!pip install -U transformers
!pip install -U faiss-cpu

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [6]:
from sentence_transformers import SentenceTransformer


# Sentence encoder shared by indexing and by every later query
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Encode every article body; the resulting rows are later matched back to `df` by position
corpus = df["content"].to_list()
embeddings = model.encode(corpus, show_progress_bar=True)


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [7]:
import faiss
import numpy as np


# Materialise the embeddings as an array and size a flat L2 index to their width
vectors = np.array(embeddings)
embedding_dim = vectors.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(vectors)

# Persist the index to disk so it can be reloaded without re-encoding the corpus
faiss.write_index(index, "delfi_articles.index")
print(" FAISS index created and saved as 'delfi_articles.index'")


 FAISS index created and saved as 'delfi_articles.index'


In [18]:
# Pack the saved FAISS index (a single file, not a directory) into a zip archive.
# The archive stores it under its path relative to /, i.e. content/delfi_articles.index.
!zip -r /content/delfi_articles_index.zip /content/delfi_articles.index

  adding: content/delfi_articles.index (deflated 7%)


In [19]:
# Extract the archive into a directory named "delfi_articles_index.index"; since the archive
# stores the file as content/delfi_articles.index, the index file ends up at
# delfi_articles_index.index/content/delfi_articles.index (the path reloaded afterwards).
!unzip -q delfi_articles_index.zip -d delfi_articles_index.index


In [20]:
# Reload the persisted index from the directory the archive was extracted into
index_path = "/content/delfi_articles_index.index/content/delfi_articles.index"
index = faiss.read_index(index_path)
def search_articles(query, top_k=5):
    """Print the titles and URLs of the indexed articles closest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in the
    module-level FAISS ``index``; hits are mapped back to rows of ``df`` by
    position.

    Args:
        query: Free-text search string.
        top_k: Number of nearest neighbours requested from the index.

    Returns:
        None. Results are written to stdout, best match first.
    """
    query_embedding = model.encode([query])
    _, indices = index.search(np.array(query_embedding), top_k)

    print(f"\nTop {top_k} results for query: '{query}'")
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; df.iloc[-1] would silently print the last article instead.
    hits = [idx for idx in indices[0] if idx >= 0]
    for rank, idx in enumerate(hits, start=1):
        row = df.iloc[idx]
        print(f"{rank}. {row['title']}")
        print(f"   {row['url']}\n")

# Smoke-test the retriever with a sample query
search_articles(query="Lithuania defense NATO")



Top 5 results for query: 'Lithuania defense NATO'
1. Baltic States allocating 3% of their GDP to defence are an example for other NATO members – presidents
   https://www.delfi.lt/en/politics/baltic-states-allocating-3-of-their-gdp-to-defence-are-an-example-for-other-nato-members-presidents-120070375

2. Lithuanian Navy ships begin NATO operation in Baltic Sea
   https://www.delfi.lt/en/politics/lithuanian-navy-ships-begin-nato-operation-in-baltic-sea-120079443

3. President Duda vows that Poland would defend Lithuania in case of attack
   https://www.delfi.lt/en/politics/president-duda-vows-that-poland-would-defend-lithuania-in-case-of-attack-96341405

4. Opinion poll finds NATO allies in Lithuania deter aggression
   https://www.delfi.lt/en/politics/opinion-poll-finds-nato-allies-in-lithuania-deter-aggression-96013807

5. BALTOPS24 is clear signal of NATO presence to adversary – chief of defence
   https://www.delfi.lt/en/politics/baltops24-is-clear-signal-of-nato-presence-to-advers

In [9]:
def retrieve_context(query, top_k=5):
    """Return the text of the articles closest to ``query`` as one string.

    The query is embedded with the module-level ``model`` and searched in the
    module-level FAISS ``index``; the ``content`` column of the matching
    ``df`` rows is concatenated, best match first.

    Args:
        query: Free-text question or search string.
        top_k: Number of nearest neighbours requested from the index.

    Returns:
        The article bodies joined by blank lines; an empty string when the
        index returns no valid hit.
    """
    query_embedding = model.encode([query])
    _, indices = index.search(np.array(query_embedding), top_k)

    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; without this filter df.iloc[-1] would append the last article
    # once for every missing neighbour.
    hits = [idx for idx in indices[0] if idx >= 0]
    context = "\n\n".join(df.iloc[idx]["content"] for idx in hits)
    return context


In [14]:
from transformers import pipeline


# Text-to-text generation pipeline used to phrase answers from the retrieved context
generator = pipeline(task="text2text-generation", model="google/flan-t5-base")

def generate_answer(query, context):
    """Generate an answer to ``query`` from ``context`` with the module-level ``generator``.

    The prompt has the form ``Context: ...\\n\\nQuestion: ...\\nAnswer:``. Because
    the question sits at the end and the pipeline truncates over-long inputs
    from the right, a long context used to push the question out of the
    prompt entirely. The context is therefore trimmed to a token budget first
    so that the question and the ``Answer:`` cue always reach the model.
    Contexts that already fit are passed through unchanged.

    Args:
        query: The user's question.
        context: Retrieved passage text to ground the answer in.

    Returns:
        The generated text. Sampling is enabled (``do_sample=True``), so the
        result can differ between calls.
    """
    tokenizer = generator.tokenizer

    # Some tokenizers report a huge sentinel instead of a real limit; fall
    # back to the 512 tokens the original call was written around.
    limit = getattr(tokenizer, "model_max_length", 512)
    if not isinstance(limit, int) or not 0 < limit <= 100_000:
        limit = 512

    # Tokens taken by everything except the context, plus a small margin
    # because decoding and re-encoding text is not guaranteed to be exact.
    frame_ids = tokenizer(f"Context: \n\nQuestion: {query}\nAnswer:")["input_ids"]
    budget = max(limit - len(frame_ids) - 8, 0)

    context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
    if len(context_ids) > budget:
        context = tokenizer.decode(context_ids[:budget], skip_special_tokens=True)

    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    result = generator(
        prompt,
        max_length=512,
        min_length=100,
        do_sample=True,
        temperature=0.8,
        truncation=True  # kept as a safety net should the budget estimate be off
    )
    return result[0]['generated_text']


Device set to use cpu


In [15]:
# Retrieval-augmented answer: fetch the three closest articles, then generate from them
query = "What is Lithuania's defense position in NATO?"
context = retrieve_context(query, top_k=3)
answer = generate_answer(query, context)

print("Generated Answer:\n", answer, sep="\n")


Generated Answer:

NATO Operation Baltic Sentry during the month of February, but they will be deployed as support to other Baltic States.///> The Lithuanian president mentioned that the Baltic states' energy independence and resilience is essential to achieving the energy independence of the Alliance as a whole. He stressed the need for Baltic States' leadership to continue to provide unwavering support to Ukraine and support for Ukraine’s fight for freedom.//> Estonia is also among the countries that are considered to be at risk.


In [16]:
# Same retrieve-then-generate flow for a business-news question
query = "What is latest news in Business?"
context = retrieve_context(query, top_k=3)
answer = generate_answer(query, context)

print("Generated Answer:\n", answer, sep="\n")


Generated Answer:

Lithuanian Minister of Finance Valery Janulevicius, also a member of the Bank of Lithuania’s economy services, told the LRT newspaper that the economy does not need a financial miracle to survive.rdskrdsk rdijanisics, a prominent businessman, said that a temporary suspension is a "morelogical solution than working and eventually going bust". rdijanisicsics explained that a temporary suspension will help to reduce downturns in the labour market, as energy prices continue to mount. His comment was met with mixed response as the government has been attempting to ease the economic crisis.
