Belkacem Sadi LSP2431549

Task 2 — Semantic Search + RAG (Transformer + Vector DB) on LRT News

-1 Data collection: ≥ 5,000 unique articles from LRT EN (sitemaps + listing fallback), saved to CSV with checkpoints and dedup by `url`.

-2 Preprocessing: `title + content` → split into ~900-char chunks with 200-char overlap (improves retrieval).

-3 Feature extractor (Transformer): `intfloat/multilingual-e5-base` (handles LT/EN).  
  Training convention respected: `passage:` prefix for documents, `query:` for questions.  
  Embeddings L2-normalized → inner product == cosine.

-4 Vector database: FAISS `IndexFlatIP`; index persisted to disk and reload-tested.

-5 Semantic search: Encode query → FAISS top-k passages with scores + URLs.

-6 RAG answerer: Retrieve top-k → FLAN-T5 (local) generates concise answers with [1], [2], … citations. Optional OpenAI fallback if `OPENAI_API_KEY` is provided.

-7 Assessment harness: One function (`assess(query, k)`).


In [4]:
!pip -q install requests beautifulsoup4 pandas tqdm sentence-transformers faiss-cpu transformers gradio tiktoken


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import os, re, time, uuid, gzip, io, random, concurrent.futures
from urllib.parse import urljoin
from xml.etree import ElementTree as ET
import requests
import pandas as pd
from bs4 import BeautifulSoup

# CONFIG
DATA_DIR = "articles_task2"
ART_DIR = "artifacts"
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(ART_DIR, exist_ok=True)


TARGET_ARTICLES = 10000
CHECKPOINT_EVERY = 250
MAX_WORKERS = 8
REQUEST_TIMEOUT = 20
REQUEST_DELAY_SEC = 0.10
HEADERS = {"User-Agent": "AcademicProjectBot/1.0 (+mailto:you@example.com)"}

OUTPUT_CSV = os.path.join(DATA_DIR, "lrt_articles.csv")
print("Scraper output CSV ->", OUTPUT_CSV)


Scraper output CSV -> articles_task2/lrt_articles.csv


In [6]:
def _get(url, allow_redirects=True):
    try:
        r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=allow_redirects)
        if r.status_code == 200:
            return r
    except Exception:
        return None
    return None

def _clean(s):
    return re.sub(r"\s+", " ", (s or "")).strip()

#  SITEMAPS
def discover_sitemaps():
    robots = "https://www.lrt.lt/robots.txt"
    r = _get(robots)
    if not r:
        return []
    sm = []
    for line in r.text.splitlines():
        if line.lower().startswith("sitemap:"):
            sm.append(line.split(":", 1)[1].strip())
    return sm

def parse_sitemap_urls(sitemap_url):
    r = _get(sitemap_url)
    if not r:
        return [], []
    data = r.content
    if sitemap_url.endswith(".gz"):
        try:
            data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
        except Exception:
            pass
    try:
        root = ET.fromstring(data)
    except Exception:
        return [], []
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    urls, children = [], []
    if root.tag.endswith("sitemapindex"):
        for sm_el in root.findall(".//sm:sitemap", ns):
            loc = sm_el.findtext("sm:loc", default="", namespaces=ns).strip()
            if loc:
                children.append(loc)
    elif root.tag.endswith("urlset"):
        for u in root.findall(".//sm:url", ns):
            loc = u.findtext("sm:loc", default="", namespaces=ns).strip()
            if loc:
                urls.append(loc)
    return urls, children

def collect_lrt_urls_from_sitemaps(max_urls=150_000):
    roots = discover_sitemaps()
    if not roots:
        return []

    seen_sm, seen_url = set(), set()
    queue = list(roots)
    found = []
    pat = re.compile(r"^https?://www\.lrt\.lt/en/news-in-english/\d+/")

    while queue and len(found) < max_urls:
        sm = queue.pop(0)
        if sm in seen_sm:
            continue
        seen_sm.add(sm)
        url_entries, children = parse_sitemap_urls(sm)
        for ch in children:
            if ch not in seen_sm:
                queue.append(ch)
        for u in url_entries:
            if u in seen_url:
                continue
            if pat.search(u):
                seen_url.add(u)
                found.append(u)
        time.sleep(0.02)
    return found

# LISTING FALLBACK
def discover_lrt_listing_urls(max_pages=8000):
    base = "https://www.lrt.lt"
    listing = "/en/news-in-english"
    pat = re.compile(r"^/en/news-in-english/\d+")
    urls = set()
    empty = 0
    for page in range(1, max_pages + 1):
        list_url = f"{base}{listing}?page={page}"
        r = _get(list_url)
        if not r:
            empty += 1
            if empty >= 3:
                break
            continue
        soup = BeautifulSoup(r.content, "html.parser")
        found = {urljoin(base, a["href"]) for a in soup.find_all("a", href=True) if pat.search(a["href"])}
        if not found:
            empty += 1
            if empty >= 3:
                break
        else:
            empty = 0
            urls.update(found)
        if page % 25 == 0:
            print(f"[listing] page {page}: total URLs so far = {len(urls)}")
        time.sleep(0.03)
    return list(urls)

# ARTICLE EXTRACTION
def extract_lrt_article(html):
    soup = BeautifulSoup(html, "html.parser")
    h1 = soup.find("h1")
    title = _clean(h1.get_text(" ", strip=True)) if h1 else None
    body = (soup.find("div", class_="content")
            or soup.find("article")
            or soup.find("div", {"itemprop": "articleBody"})
            or soup)
    parts = []
    for tag in body.find_all(["p", "li"]):
        txt = _clean(tag.get_text(" ", strip=True))
        if txt:
            parts.append(txt)
    content = "\n".join(parts)
    return title, content

def fetch_and_parse(url):
    r = _get(url)
    if not r:
        return None
    title, content = extract_lrt_article(r.text)
    if not (title and content):
        return None
    return {
        "id": str(uuid.uuid4()),
        "source": "lrt",
        "category": "news-in-english",
        "url": url,
        "title": title,
        "content": content
    }


In [7]:
# Run scraper

print("Collecting URLs from LRT sitemaps…")
urls = collect_lrt_urls_from_sitemaps(max_urls=150_000)
print("Sitemap URLs (English):", len(urls))

if len(urls) < 2000:
    print("Sitemaps thin — discovering listing pages…")
    urls = discover_lrt_listing_urls(max_pages=9000)
    print("Listing URLs:", len(urls))

if not urls:
    raise RuntimeError("No URLs discovered. Check network or try again later.")

print("Sample:", urls[:5])

# Resume support
seen = set()
if os.path.exists(OUTPUT_CSV):
    prev = pd.read_csv(OUTPUT_CSV)
    prev.dropna(subset=["url"], inplace=True)
    prev.drop_duplicates(subset=["url"], inplace=True)
    seen = set(prev["url"].tolist())
    print(f"Resuming from CSV with {len(prev)} rows.")
else:
    prev = pd.DataFrame(columns=["id","source","category","url","title","content"])

def smoke_test(n=5):
    ok = 0
    for u in urls[:n]:
        row = fetch_and_parse(u)
        if row:
            ok += 1
    return ok

ok = smoke_test(5)
print(f"Smoke test parsed {ok}/5.")
if ok == 0:
    raise RuntimeError("Parser failed on sample URLs.")

rows = []
saved = len(prev)
BATCH_SIZE = 32
MAX_WORKERS = 8
random.shuffle(urls)
i = 0
start = time.time()

def process_batch(batch):
    out = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futs = {ex.submit(fetch_and_parse, u): u for u in batch}
        for fut in concurrent.futures.as_completed(futs):
            try:
                row = fut.result()
                if row:
                    out.append(row)
            except Exception:
                pass
    return out

total_urls = len(urls)
while i < total_urls and (saved + len(rows)) < TARGET_ARTICLES:
    batch = []
    while i < total_urls and len(batch) < BATCH_SIZE:
        u = urls[i]; i += 1
        if u in seen:
            continue
        batch.append(u)
    if not batch:
        break
    got = process_batch(batch)
    rows.extend(got)
    print(f"+{len(got)} parsed | progress {saved + len(rows)}/{TARGET_ARTICLES}")
    time.sleep(REQUEST_DELAY_SEC)
    # checkpoint
    if (saved + len(rows)) // CHECKPOINT_EVERY > saved // CHECKPOINT_EVERY:
        out_df = pd.concat([prev, pd.DataFrame(rows)], ignore_index=True)
        out_df.drop_duplicates(subset=["url"], inplace=True)
        out_df.to_csv(OUTPUT_CSV, index=False)
        saved = len(out_df)
        print(f"Checkpoint saved: {saved} -> {OUTPUT_CSV}")

# Final save
out_df = pd.concat([prev, pd.DataFrame(rows)], ignore_index=True)
out_df.drop_duplicates(subset=["url"], inplace=True)
out_df.to_csv(OUTPUT_CSV, index=False)

elapsed = (time.time() - start) / 60.0
print("\n=== DONE SCRAPING ===")
print("Collected (articles):", len(out_df))
print("Saved:", OUTPUT_CSV)
print(f"Elapsed: {elapsed:.1f} min")

# Enforce >= 5000 articles
assert len(out_df) >= 5000, "Fewer than 5,000 articles collected. Re-run or extend listing pages to gather more."
print(" Requirement met: >= 5,000 articles.")


Collecting URLs from LRT sitemaps…
Sitemap URLs (English): 17774
Sample: ['https://www.lrt.lt/en/news-in-english/19/2550836/drawing-inspiration-from-ancestors-amber-catchers-on-lithuania-s-baltic-coast', 'https://www.lrt.lt/en/news-in-english/19/2551794/us-and-europe-s-diverging-views-on-how-to-end-russia-s-war-in-ukraine', 'https://www.lrt.lt/en/news-in-english/19/2551835/lithuania-s-eurovision-hopeful-katarsis-brings-raw-emotion-and-authenticity-to-the-stage', 'https://www.lrt.lt/en/news-in-english/19/2552102/lithuania-isn-t-nato-s-weak-spot-president-tells-german-daily', 'https://www.lrt.lt/en/news-in-english/19/2552075/lithuania-formally-requests-eu-fiscal-rules-exemption-for-military-spending']
Smoke test parsed 5/5.
+32 parsed | progress 32/10000
+32 parsed | progress 64/10000
+32 parsed | progress 96/10000
+32 parsed | progress 128/10000
+32 parsed | progress 160/10000
+32 parsed | progress 192/10000
+32 parsed | progress 224/10000
+32 parsed | progress 256/10000
Checkpoint save

In [1]:
!unzip -o "lrt_articles.zip" -d "content"

Archive:  lrt_articles.zip
  inflating: content/lrt_articles_copy.csv  


In [7]:
OUTPUT_CSV = "content/lrt_articles_copy.csv"

In [8]:
# Load/clean the dataset
df = pd.read_csv(OUTPUT_CSV)
df.dropna(subset=["title","content","url"], inplace=True)
df.drop_duplicates(subset=["url"], inplace=True)
print("Unique articles after clean:", len(df))
assert len(df) >= 5000, "Need >= 5000 unique articles after cleaning."
df.head(3)


Unique articles after clean: 5024


Unnamed: 0,id,source,category,url,title,content
0,ed47e64c-8ce4-467f-a1ad-342500fb7857,lrt,news-in-english,https://www.lrt.lt/en/news-in-english/19/12509...,Lithuania lifts self-isolation requirement for...,"As of Monday, people arriving in Lithuania fro..."
1,bd5616ce-459a-48f5-b978-704c772a5a79,lrt,news-in-english,https://www.lrt.lt/en/news-in-english/19/23292...,Lithuania cooperates with Ukraine on alleged s...,Lithuania is cooperating with Ukrainian office...
2,e41a8a38-8436-4816-85c0-f176d73cdd2a,lrt,news-in-english,https://www.lrt.lt/en/news-in-english/19/11302...,Lithuanian port 'ready' to assist Belarus in M...,Minsk has recently hinted at importing oil via...


In [9]:
# Chunk articles into passages

def chunk_text(t, size=900, overlap=200):
    t = (t or "").strip()
    if len(t) <= size:
        return [t] if t else []
    out, start = [], 0
    while start < len(t):
        end = start + size
        out.append(t[start:end])
        if end >= len(t):
            break
        start = end - overlap
    return out

docs = []
for _, r in df.iterrows():
    full = f"{r['title'].strip()}\n\n{(r['content'] or '').strip()}".strip()
    chunks = chunk_text(full, size=900, overlap=200)
    for i, ch in enumerate(chunks):
        docs.append({
            "doc_id": r["id"],
            "chunk_id": f"{r['id']}::{i}",
            "source": "lrt",
            "url": r["url"],
            "title": r["title"],
            "text": ch
        })

meta_df = pd.DataFrame(docs)
meta_df = meta_df[meta_df["text"].str.len() > 0]
meta_path = os.path.join(ART_DIR, "meta.csv")
meta_df.to_csv(meta_path, index=False)
print("Total entries (chunks):", len(meta_df))
print("Saved meta ->", meta_path)
meta_df.head(2)


Total entries (chunks): 19672
Saved meta -> artifacts/meta.csv


Unnamed: 0,doc_id,chunk_id,source,url,title,text
0,ed47e64c-8ce4-467f-a1ad-342500fb7857,ed47e64c-8ce4-467f-a1ad-342500fb7857::0,lrt,https://www.lrt.lt/en/news-in-english/19/12509...,Lithuania lifts self-isolation requirement for...,Lithuania lifts self-isolation requirement for...
1,ed47e64c-8ce4-467f-a1ad-342500fb7857,ed47e64c-8ce4-467f-a1ad-342500fb7857::1,lrt,https://www.lrt.lt/en/news-in-english/19/12509...,Lithuania lifts self-isolation requirement for...,than Lithuania over the past 14 days will hav...


In [10]:
# Embeddings with intfloat/multilingual-e5-base
!pip -q install sentence-transformers torch --upgrade

from sentence_transformers import SentenceTransformer
import numpy as np, torch
from tqdm import trange

EMBED_MODEL = "intfloat/multilingual-e5-base"
device = "cuda" if torch.cuda.is_available() else "cpu"
embedder = SentenceTransformer(EMBED_MODEL, device=device)

texts = ["passage: " + t for t in meta_df["text"].tolist()]
BATCH = 64

emb_chunks = []
for i in trange(0, len(texts), BATCH):
    X = embedder.encode(texts[i:i+BATCH], normalize_embeddings=True, show_progress_bar=False)
    emb_chunks.append(X.astype("float32"))
emb = np.vstack(emb_chunks)

emb_path = os.path.join(ART_DIR, "embeddings.npy")
np.save(emb_path, emb)
print("Embeddings shape:", emb.shape)
print("Saved ->", emb_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

100%|██████████| 308/308 [04:21<00:00,  1.18it/s]

Embeddings shape: (19672, 768)
Saved -> artifacts/embeddings.npy





In [11]:
# Vector DB (FAISS) with cosine (via normalized vectors + inner product)
!pip -q install faiss-cpu

import faiss, numpy as np

dim = emb.shape[1]
faiss_index = faiss.IndexFlatIP(dim)
faiss_index.add(emb)
faiss_path = os.path.join(ART_DIR, "index.faiss")
faiss.write_index(faiss_index, faiss_path)
print("FAISS ntotal:", faiss_index.ntotal)
print("Saved index ->", faiss_path)

# Reload test
faiss_index = faiss.read_index(faiss_path)
assert faiss_index.ntotal == emb.shape[0]
print("✔ FAISS reload OK.")


FAISS ntotal: 19672
Saved index -> artifacts/index.faiss
✔ FAISS reload OK.


In [12]:
# Semantic search
import pandas as pd, numpy as np, faiss

meta = pd.read_csv(os.path.join(ART_DIR, "meta.csv"))

def semantic_search(query: str, k: int = 5) -> pd.DataFrame:
    qv = embedder.encode(["query: " + query], normalize_embeddings=True).astype("float32")
    D, I = faiss_index.search(qv, k)
    out = meta.iloc[I[0]].copy()
    out["score"] = D[0]
    return out[["score","title","url","text"]]

# Smoke test
display(semantic_search("Energy prices in Lithuania", k=5))


Unnamed: 0,score,title,url,text
19620,0.878528,"Lithuanian government approves electricity, ga...",https://www.lrt.lt/en/news-in-english/19/18445...,"Lithuanian government approves electricity, ga..."
4706,0.874859,Gas shortages in Europe to affect prices in Li...,https://www.lrt.lt/en/news-in-english/19/17417...,"eam 1 pipeline, as its 10-day scheduled mainte..."
17716,0.873602,Lithuania is paying 100 times more for electri...,https://www.lrt.lt/en/news-in-english/19/17551...,"n Sweden, electricity prices are almost 100 ti..."
17715,0.869892,Lithuania is paying 100 times more for electri...,https://www.lrt.lt/en/news-in-english/19/17551...,Lithuania is paying 100 times more for electri...
17813,0.868918,Natural gas prices for households in Lithuania...,https://www.lrt.lt/en/news-in-english/19/17076...,Natural gas prices for households in Lithuania...


In [13]:
# RAG answerer
!pip -q install transformers accelerate

from transformers import pipeline
import os

generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device=0 if torch.cuda.is_available() else -1
)

def rag_answer(query: str, top_k: int = 5, max_new_tokens: int = 256, use_openai: bool = False):
    ctx = semantic_search(query, k=top_k)
    context = "\n\n".join([f"[{i+1}] {row.text}" for i, row in ctx.iterrows()])
    citations = "\n".join([f"[{i+1}] {row.url}" for i, row in ctx.iterrows()])

    answer = None
    if use_openai and os.getenv("OPENAI_API_KEY"):
        try:
            from openai import OpenAI
            client = OpenAI()
            sys = "Answer concisely using ONLY the provided context. Include citations like [1], [2]."
            usr = f"Context:\n{context}\n\nQuestion: {query}"
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role":"system","content":sys},{"role":"user","content":usr}],
                temperature=0.2, max_tokens=max_new_tokens
            )
            answer = resp.choices[0].message.content
        except Exception as e:
            print("OpenAI failed, falling back to FLAN:", e)

    if answer is None:
        prompt = f"Use ONLY the context and add bracketed citations like [1], [2].\n\nContext:\n{context}\n\nQuestion: {query}"
        out = generator(prompt, max_new_tokens=max_new_tokens, do_sample=False)
        answer = out[0]["generated_text"]

    return {"answer": answer, "citations": citations, "contexts": ctx}

# Smoke test
res = rag_answer("What is the latest information for sport in Lithuania ?", top_k=5)
print(res["answer"])
print("\nCITATIONS:\n", res["citations"])


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1063 > 512). Running this sequence through the model will result in indexing errors


[3375] aunas next summer either. The Lithuanian national team is scheduled to play FIFA international matches on June 6â7 and June 9â10 next year, coinciding with Timberlakeâs concert. âWe will celebrate again, we will dance. We already knew last week that the stadium operator was planning this concert. They knew that the FIFA window was scheduled at the same time, but it is what it is. I will not say anything new. Football is in the background again,â LFF President Edgaras Stankeviius told the 15min.lt news website. At the beginning of June next year, the Lithuanian national team will be playing qualifying matches for the 2026 World Cup. Lithuaniaâs opponents will be revealed after the draw ceremony on December 13. âBefore the tournament, each federation will receive a special FIFA form, where we will mark when we can play home matches. Now it will be very simple â we will ma [15884] in an objective and impartial investigation,â Grigas was quoted in a post on the LKFâs Facebook page o

In [16]:
!cp "articles_task2/lrt_articles.csv" "articles_task2/lrt_articles_copy.csv"


In [18]:
!zip -j "lrt_articles.zip" "articles_task2/lrt_articles_copy.csv"
#!unzip -o "lrt_articles.zip" -d "content"


  adding: lrt_articles_copy.csv (deflated 64%)


In [14]:

# Assessment harness
def assess(query: str, k: int = 5, use_openai: bool = False):
    out = rag_answer(query, top_k=k, use_openai=use_openai)
    display(out["contexts"][["score","title","url","text"]])
    print("\nANSWER:\n", out["answer"])
    print("\nCITATIONS:\n", out["citations"])


assess("Summarize LRT coverage on energy prices in Lithuania this year.", k=5, use_openai=False)


Unnamed: 0,score,title,url,text
147,0.884535,LRT English Newsletter: Turn off the lights to...,https://www.lrt.lt/en/news-in-english/19/17709...,LRT English Newsletter: Turn off the lights to...
649,0.878084,LRT English Newsletter: Return of nightlife,https://www.lrt.lt/en/news-in-english/19/14345...,"tion on the contract, LRT Investigation Team r..."
19245,0.869627,LRT English Newsletter: Kaunas modernism goes ...,https://www.lrt.lt/en/news-in-english/19/20824...,e repeated interest rate hikes by the European...
6797,0.868052,Lithuanian government to spend up to â¬1 bill...,https://www.lrt.lt/en/news-in-english/19/17648...,Lithuanian government to spend up to â¬1 bill...
410,0.867273,Lithuanian cabinet okays legislation on power ...,https://www.lrt.lt/en/news-in-english/19/18372...,Lithuanian cabinet okays legislation on power ...



ANSWER:
 Energy prices continue to rival Russiaâs war in Ukraine as the number-one concern debated in Lithuania. Inflation hit 21.1 percent in August, according to provisional estimates, and Vilnius authorities are dimming street lighting to economise on electricity. Saving energy will be a must over the coming winter â and government institutions will lead the way. The pointedly-titled campaign seems to have attracted the attention of cyber warriors â the following day after announcing it, Lithuaniaâs state-owned property manager was subjected to a cyber attack that temp [650] tion on the contract, LRT Investigation Team reports. â Every fifth Lithuanian who visited a healthcare institution said they offered a bribe, one of the biggest shares in the EU, according to a report by Transparency International. â As Lithuaniaâs economy is bouncing back from the pandemic slowdown, so are prices, with the country reporting one of the highest inflation rates in the eurozone.

CITATIONS:
 [148