In [None]:
!pip -q install beautifulsoup4 lxml readability-lxml tldextract pdfplumber python-dateutil rank-bm25
import os, json, pathlib
# Workspace root for the whole pipeline; every later cell resolves its
# input and output paths relative to this directory.
root = pathlib.Path("/content/rag-pgh")

# Create the project layout up front. Existing directories are left alone.
for subdir in ("data/raw", "data/corpus", "data/test", "data/train", "scripts"):
    root.joinpath(subdir).mkdir(parents=True, exist_ok=True)

print("Project root:", root)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m139.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25hProject root: /content/rag-pgh


In [None]:
# Seed URLs for the crawler, one per line. The triple-quoted block is
# stripped so the file starts and ends without blank lines.
seeds = """
https://en.wikipedia.org/wiki/Pittsburgh
https://en.wikipedia.org/wiki/History_of_Pittsburgh
https://www.visitpittsburgh.com/
https://pittsburghpa.gov/
https://pittsburghpa.gov/finance/tax-guides
https://pittsburghpa.gov/omb/operating-budget
https://www.cmu.edu/
https://www.cmu.edu/events/
https://pittsburghsymphony.org/
https://trustarts.org/
https://www.heinzhistorycenter.org/
https://carnegiemuseums.org/
https://www.thefrickpittsburgh.org/
https://downtownpittsburgh.com/events/
https://www.pittsburghcitypaper.com/pittsburgh/Events
https://www.picklesburgh.com/
https://www.pghrestaurantweek.com/
https://www.pghtacofest.com/
https://littleitalydays.com/
https://www.mlb.com/pirates
https://www.steelers.com/
https://www.nhl.com/penguins
""".strip()

# Persist the list as <root>/seeds.txt (UTF-8), then echo the first 500
# characters of the file as a quick sanity check.
(root/"seeds.txt").write_text(seeds, encoding="utf-8")
print((root/"seeds.txt").read_text()[:500])


https://en.wikipedia.org/wiki/Pittsburgh
https://en.wikipedia.org/wiki/History_of_Pittsburgh
https://www.visitpittsburgh.com/
https://pittsburghpa.gov/
https://pittsburghpa.gov/finance/tax-guides
https://pittsburghpa.gov/omb/operating-budget
https://www.cmu.edu/
https://www.cmu.edu/events/
https://pittsburghsymphony.org/
https://trustarts.org/
https://www.heinzhistorycenter.org/
https://carnegiemuseums.org/
https://www.thefrickpittsburgh.org/
https://downtownpittsburgh.com/events/
https://www.pi


In [None]:
import time, re, hashlib, queue, urllib.parse, urllib.robotparser
import requests
import tldextract
from bs4 import BeautifulSoup

# Directory where crawl() stores the raw bytes of every fetched page.
RAW_DIR = root/"data/raw"
# JSON file mapping each saved file name back to the URL it was fetched from.
URLMAP_PATH = root/"data/raw/url_map.json"
# User-Agent header sent by fetch() and passed to the robots.txt permission check.
# NOTE(review): the contact address looks like a placeholder - confirm before crawling.
USER_AGENT = "pgh-rag-bot/0.1 (coursework; contact: you@example.com)"
TIMEOUT = 15  # timeout handed to requests.get in fetch()
SLEEP = 1.0  # seconds slept before each page fetch in crawl()
MAX_PAGES = 800  # crawl() stops once this many files have been saved

def normalize_url(u: str) -> str:
    """Return *u* in the canonical form used for crawl de-duplication.

    The fragment is removed, blank-valued query parameters are dropped, and
    tracking parameters (names starting with ``utm_``, ``fbclid`` or
    ``gclid``, case-insensitively) are filtered out. The remaining query
    pairs are re-encoded in their original order.
    """
    tracking_prefixes = ("utm_", "fbclid", "gclid")

    without_fragment, _fragment = urllib.parse.urldefrag(u)
    parts = urllib.parse.urlparse(without_fragment)

    kept_params = []
    for key, value in urllib.parse.parse_qsl(parts.query, keep_blank_values=False):
        if key.lower().startswith(tracking_prefixes):
            continue
        kept_params.append((key, value))

    cleaned = parts._replace(query=urllib.parse.urlencode(kept_params))
    return urllib.parse.urlunparse(cleaned)

def same_registered_domain(u: str, allowed_domains: list[str]) -> bool:
    """Tell whether URL *u* belongs to one of *allowed_domains*.

    A URL qualifies when the registrable domain that tldextract derives from
    its network location equals an allowed domain, or when the network
    location itself ends with ``"." + domain``.
    """
    host = urllib.parse.urlparse(u).netloc
    reg = tldextract.extract(host).registered_domain
    for domain in allowed_domains:
        if reg == domain:
            return True
        if host.endswith("." + domain):
            return True
    return False

def load_seeds(path):
    """Read the seed file and derive the crawl allowlist.

    Args:
        path: Location of a text file with one seed URL per line; blank
            lines are ignored.

    Returns:
        A ``(seeds, allowed)`` tuple: the normalized seed URLs in file order,
        and the sorted list of distinct registrable domains they belong to.
    """
    # The seed file is written as UTF-8, so read it back the same way instead
    # of relying on the platform default encoding.
    lines = pathlib.Path(path).read_text(encoding="utf-8").splitlines()
    seeds = [normalize_url(line.strip()) for line in lines if line.strip()]

    allowed = set()
    for u in seeds:
        host = urllib.parse.urlparse(u).netloc
        reg = tldextract.extract(host).registered_domain
        # Hosts without a registrable domain (IP addresses, "localhost", ...)
        # yield "". Keeping that entry would make the allowlist match every
        # link whose own registrable domain is also empty.
        if reg:
            allowed.add(reg)
    return seeds, sorted(allowed)

def is_pdf(url: str) -> bool:
    """Return True when the path component of *url* ends with ``.pdf``.

    The check is case-insensitive and looks only at the parsed path, so a
    query string, path parameters, or a ``#fragment`` after the file name do
    not hide the extension.
    """
    path = urllib.parse.urlparse(url).path
    return path.lower().endswith(".pdf")

def safe_name(url: str) -> str:
    """Build the on-disk file name for *url*.

    The stem is the first 16 hex characters of the SHA-1 of the UTF-8 encoded
    URL; the extension is ``.pdf`` when ``is_pdf(url)`` holds, else ``.html``.
    """
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    extension = "pdf" if is_pdf(url) else "html"
    return f"{digest[:16]}.{extension}"

def fetch(url: str):
    """GET *url* with the crawler's User-Agent and return the response.

    The request uses the module-level ``TIMEOUT``. Error status codes are
    turned into exceptions by ``raise_for_status()``.
    """
    response = requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=TIMEOUT,
    )
    response.raise_for_status()
    return response

def extract_links(html: str, base_url: str) -> set[str]:
    """Collect the normalized targets of every ``<a href>`` in *html*.

    Each href is resolved against *base_url*. Resolved links that start with
    ``mailto:`` or ``javascript:`` are discarded; the rest are passed through
    ``normalize_url`` and returned as a set.
    """
    skipped_schemes = ("mailto:", "javascript:")
    anchors = BeautifulSoup(html, "lxml").find_all("a", href=True)
    resolved = (urllib.parse.urljoin(base_url, anchor["href"]) for anchor in anchors)
    return {
        normalize_url(link)
        for link in resolved
        if not link.startswith(skipped_schemes)
    }

def crawl():
    """Breadth-first crawl from the seed URLs, saving raw pages to RAW_DIR.

    Only URLs on the seeds' registrable domains that robots.txt permits are
    fetched. Each saved file is named by ``safe_name`` and recorded in a
    file-name -> URL map, which is written to URLMAP_PATH when the crawl ends,
    including when it is interrupted. The crawl stops when the queue is empty
    or MAX_PAGES files have been saved.
    """
    seeds, allowed = load_seeds(root/"seeds.txt")
    print("Allowed domains:", allowed)

    robot_cache = {}

    def load_robots(base: str):
        """Fetch and parse robots.txt for *base* using the crawler's own User-Agent."""
        parser = urllib.robotparser.RobotFileParser()
        robots_url = urllib.parse.urljoin(base, "/robots.txt")
        parser.set_url(robots_url)
        # RobotFileParser.read() downloads with urllib's default agent, which
        # some sites answer with 403; the parser then treats the whole site as
        # disallowed. Fetch with our USER_AGENT and feed the lines to parse().
        try:
            resp = requests.get(robots_url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT)
        except requests.RequestException as exc:
            # Unreachable robots.txt: stay conservative and skip the host,
            # but say so instead of silently dropping every URL.
            print(f"[robots] {robots_url} unreachable ({exc}); host will be skipped")
            parser.disallow_all = True
            return parser
        status = resp.status_code
        if status in (401, 403) or status >= 500:
            parser.disallow_all = True
        elif status >= 400:
            # No robots.txt published: everything is allowed.
            parser.allow_all = True
        else:
            parser.parse(resp.text.splitlines())
        return parser

    def allowed_by_robots(url: str) -> bool:
        parsed = urllib.parse.urlparse(url)
        base = f"{parsed.scheme}://{parsed.netloc}"
        if base not in robot_cache:
            robot_cache[base] = load_robots(base)
        try:
            return robot_cache[base].can_fetch(USER_AGENT, url)
        except Exception:
            # Deliberately permissive: a URL the parser cannot evaluate is
            # treated as allowed.
            return True

    seen, url_map = set(), {}
    q = queue.Queue()
    for s in seeds:
        if s not in seen:
            seen.add(s)
            q.put(s)

    n_saved = 0
    n_failed = 0
    try:
        while not q.empty() and n_saved < MAX_PAGES:
            url = q.get()
            if not same_registered_domain(url, allowed):
                continue
            if not allowed_by_robots(url):
                continue
            try:
                time.sleep(SLEEP)
                resp = fetch(url)
            except (requests.RequestException, ValueError) as exc:
                # ValueError also covers UnicodeError raised for malformed host names.
                n_failed += 1
                print(f"[skip] {url}: {exc}")
                continue

            ctype = resp.headers.get("Content-Type", "").lower()
            pdf_doc = is_pdf(url) or "application/pdf" in ctype
            text_doc = "html" in ctype or ctype.startswith("text/")
            if ctype and not (pdf_doc or text_doc):
                # Images, archives, etc. would otherwise be stored as ".html"
                # and count against MAX_PAGES.
                continue

            fname = RAW_DIR/safe_name(url)
            if pdf_doc and fname.suffix != ".pdf":
                # A PDF served from a URL without a .pdf suffix must still be
                # stored with the extension the extractor keys on.
                fname = fname.with_suffix(".pdf")
            try:
                with open(fname, "wb") as f:
                    f.write(resp.content)
            except OSError as exc:
                n_failed += 1
                print(f"[skip] could not write {fname.name}: {exc}")
                continue
            url_map[fname.name] = url
            n_saved += 1
            print(f"[{n_saved}] {url} -> {fname.name}")

            if (not pdf_doc) and "text/html" in ctype:
                try:
                    for l in extract_links(resp.text, url):
                        if l not in seen and same_registered_domain(l, allowed):
                            seen.add(l); q.put(l)
                except Exception as exc:
                    # Link extraction is best-effort; a bad page must not stop the crawl.
                    print(f"[warn] link extraction failed for {url}: {exc}")
    finally:
        # Written in `finally` so an interrupted crawl keeps the mapping for
        # the files already saved.
        URLMAP_PATH.write_text(json.dumps(url_map, indent=2), encoding="utf-8")

    print(f"Done. Saved {n_saved} files. URL map at {URLMAP_PATH}")
    if n_failed:
        print(f"{n_failed} URLs failed to download or save.")

# Run the crawl when this cell executes; progress is printed per saved page.
crawl()


  allowed.add(tldextract.extract(host).registered_domain)
  reg = tldextract.extract(host).registered_domain


Allowed domains: ['carnegiemuseums.org', 'cmu.edu', 'downtownpittsburgh.com', 'heinzhistorycenter.org', 'littleitalydays.com', 'mlb.com', 'nhl.com', 'pghrestaurantweek.com', 'pghtacofest.com', 'picklesburgh.com', 'pittsburghcitypaper.com', 'pittsburghpa.gov', 'pittsburghsymphony.org', 'steelers.com', 'thefrickpittsburgh.org', 'trustarts.org', 'visitpittsburgh.com', 'wikipedia.org']
[1] https://www.visitpittsburgh.com/ -> 2b253825728e9152.html
[2] https://pittsburghpa.gov/ -> b463e569cbb6d2f1.html
[3] https://www.cmu.edu/ -> b993bc448dbf10fd.html
[4] https://www.cmu.edu/events/ -> dc8eb38b8517024e.html
[5] https://www.thefrickpittsburgh.org/ -> b63c572668c8a830.html
[6] https://downtownpittsburgh.com/events/ -> b504b92ecbe57939.html
[7] https://www.pghtacofest.com/ -> c4f3c615c4d4aed4.html
[8] https://www.mlb.com/pirates -> f7935da9f95dd5cb.html
[9] https://www.steelers.com/ -> b7e85f22b943fed1.html
[10] https://www.visitpittsburgh.com/things-to-do/family-fun/ -> 38e0e4551bebfe27.html
[

In [None]:
import pdfplumber, re, hashlib
from bs4 import BeautifulSoup
from readability import Document

# Output of extract_corpus(): one JSON document per line.
CORPUS_PATH = root/"data/corpus/documents.jsonl"


def sha1(s):
    """Return the hexadecimal SHA-1 digest of the UTF-8 encoding of *s*."""
    encoded = s.encode("utf-8")
    return hashlib.sha1(encoded).hexdigest()
def norm_text(t):
    """Lower-case *t*, collapse every whitespace run to one space, and trim the ends."""
    lowered = t.lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    return collapsed.strip()

def html_to_text(html: str):
    """Extract ``(title, text)`` from an HTML document.

    readability's ``Document`` is tried first to isolate the main content. If
    it raises, the whole page is parsed with BeautifulSoup instead. Runs of
    three or more newlines in the text are collapsed to a blank line.

    Args:
        html: Raw HTML markup.

    Returns:
        A ``(title, text)`` tuple of strings; the title may be empty.
    """
    try:
        doc = Document(html)
        title = (doc.short_title() or "").strip()
        soup = BeautifulSoup(doc.summary(html_partial=True), "lxml")
        text = soup.get_text("\n", strip=True)
    except Exception:
        # Best-effort fallback: readability can fail on malformed markup.
        soup = BeautifulSoup(html, "lxml")
        # `soup.title.string` is None for an empty <title> or one containing
        # nested tags, which used to raise AttributeError on .strip() and
        # drop the whole document. get_text() always returns a string.
        title = (soup.title.get_text() if soup.title else "").strip()
        text = soup.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return title, text

def pdf_to_text(path):
    """Extract ``(title, text)`` from the PDF at *path* using pdfplumber.

    The text of each page is stripped and the pages are joined with a blank
    line; pages without extractable text contribute an empty string. Runs of
    three or more newlines are collapsed. The title is the file name.
    """
    with pdfplumber.open(path) as pdf:
        page_texts = [(page.extract_text() or "").strip() for page in pdf.pages]
    joined = "\n\n".join(page_texts)
    text = re.sub(r"\n{3,}", "\n\n", joined)
    return pathlib.Path(path).name, text

def extract_corpus():
    """Convert every raw file in RAW_DIR into a row of CORPUS_PATH (JSONL).

    PDFs go through ``pdf_to_text`` and everything else through
    ``html_to_text``. Documents with fewer than 200 characters of text are
    dropped, as are exact duplicates (same SHA-1 of the normalized text).
    Files are processed in sorted name order, so the surviving copy of a
    duplicate and the output order are reproducible. Files that fail to parse
    are reported and skipped.
    """
    map_path = RAW_DIR/"url_map.json"
    url_map = json.loads(map_path.read_text(encoding="utf-8")) if map_path.exists() else {}
    seen_hashes = set()
    n_docs = n_short = n_dup = n_failed = 0

    with open(CORPUS_PATH, "w", encoding="utf-8") as out:
        for p in sorted(RAW_DIR.iterdir()):
            if not p.is_file() or p.name == "url_map.json":
                continue
            try:
                if p.suffix.lower() == ".pdf":
                    title, text = pdf_to_text(p)
                    source_type = "pdf"
                else:
                    html = p.read_text("utf-8", errors="ignore")
                    title, text = html_to_text(html)
                    source_type = "html"
            except Exception as exc:
                # One unreadable file must not abort the run, but it should be visible.
                n_failed += 1
                print(f"[skip] {p.name}: {exc!r}")
                continue

            if not text or len(text) < 200:
                n_short += 1
                continue

            # exact dedup
            h = sha1(norm_text(text))
            if h in seen_hashes:
                n_dup += 1
                continue
            seen_hashes.add(h)

            row = {
                "doc_id": p.stem,
                "url": url_map.get(p.name, ""),
                "title": title,
                "source_type": source_type,
                "char_count": len(text),
                "text": text
            }
            out.write(json.dumps(row, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs} docs -> {CORPUS_PATH}")
    print(f"Skipped: {n_short} too short, {n_dup} duplicates, {n_failed} failed to parse.")

# Build documents.jsonl from the raw crawl output when this cell executes.
extract_corpus()


Wrote 580 docs -> /content/rag-pgh/data/corpus/documents.jsonl


In [None]:
import re, uuid, json, hashlib
from pathlib import Path


# Project root in Colab (re-declared so this cell can run on its own).
root = Path("/content/rag-pgh")

# Input corpus (one document per line) and output chunks (one chunk per line).
CORPUS_PATH = root / "data/corpus/documents.jsonl"
CHUNKS_PATH = root / "data/corpus/chunks.jsonl"

# Tunables (all sizes are in characters)
MAX_CHARS   = 1600      # upper bound for one window; a merged tail chunk can end up longer
TARGET_CHARS= 1400      # preferred window length; cuts are searched for at or before it
MIN_CHARS   = 300       # minimum window length; a shorter final chunk is merged into its predecessor
OVERLAP     = 200       # each window starts this many characters before the previous window's end
MIN_BOUNDARY_HEADROOM = 200  # how far back from the target a paragraph/sentence boundary is searched

# Regexes
SENT_END_RE   = re.compile(r'(?<=[.!?])\s')        # whitespace right after ., ! or ?
PARA_SPLIT_RE = re.compile(r'\n\s*\n')             # blank-line paragraph break (not referenced by the functions in this cell)
LIST_LINE_RE  = re.compile(r'^\s*([-*•]|[0-9]+\.)\s+', re.M)  # start of a bulleted or numbered list line

def _hash_text(s: str) -> str:
    return hashlib.md5(s.encode('utf-8')).hexdigest()

def _find_natural_cut(text: str, start: int, max_chars: int, prefer_window: int) -> int:
    """
    Choose where the window that begins at `start` should end.

    Boundaries are tried in order of preference: a paragraph break, the start
    of a list item, then the end of a sentence. A candidate is accepted only
    if it leaves at least MIN_CHARS in the window and does not lie beyond
    start+TARGET_CHARS (itself clamped to the hard cap start+max_chars).

    Args:
        text: Full document text.
        start: Index at which the window begins.
        max_chars: Hard cap on the window length.
        prefer_window: How far back from the target position a paragraph or
            sentence boundary is searched.

    Returns:
        An absolute end index into `text`. Falls back to the hard cap, or to
        len(text) when fewer than MIN_CHARS remain.
    """
    n = len(text)
    hard_end = min(n, start + max_chars)
    # Clamp the target to the hard cap so a small max_chars is never exceeded.
    target_end = min(n, start + TARGET_CHARS, hard_end)
    min_end = start + MIN_CHARS

    # 1) paragraph boundary: last blank line within prefer_window of the target
    para_back = text.rfind("\n\n", max(start, target_end - prefer_window), target_end)
    if para_back != -1 and para_back >= min_end:
        return para_back

    # 2) list-line boundary: the LAST list-item start that keeps the window
    #    between MIN_CHARS and the target. (The earlier loop body was a bare
    #    `pass`, so the last match before the hard cap was used regardless of
    #    the target.)
    list_cut = None
    for m in LIST_LINE_RE.finditer(text, start, hard_end):
        if m.start() > target_end:
            break  # matches arrive in increasing order
        if m.start() >= min_end:
            list_cut = m.start()
    if list_cut is not None:
        return list_cut

    # 3) sentence boundary: last sentence end within prefer_window of the target
    window_start = min(max(min_end, target_end - prefer_window), hard_end)
    last_sent = None
    for m in SENT_END_RE.finditer(text, window_start, min(hard_end, target_end + 1)):
        last_sent = m
    # Match offsets are absolute indices into `text`.
    if last_sent is not None and last_sent.end() >= min_end:
        return last_sent.end()

    # 4) fall back: cut at hard_end if that still satisfies MIN_CHARS
    if hard_end - start >= MIN_CHARS:
        return hard_end

    # 5) fewer than MIN_CHARS remain (near document end): take everything
    return n

def split_into_chunks(text: str,
                      max_chars=MAX_CHARS,
                      overlap=OVERLAP,
                      min_chars=MIN_CHARS,
                      prefer_window=MIN_BOUNDARY_HEADROOM):
    """
    Boundary-aware chunking over original text indices.

    Each window ends at the cut chosen by `_find_natural_cut` and the next
    window starts `overlap` characters before that end. Every iteration is
    guaranteed to end later than the previous one, so the loop terminates for
    any input, including long whitespace runs.

    Args:
        text: Document text to split.
        max_chars: Hard cap on a window's length.
        overlap: Characters shared between consecutive windows.
        min_chars: Minimum window length; a shorter final chunk is merged
            into its predecessor.
        prefer_window: Search distance handed to `_find_natural_cut`.

    Returns:
        A list of (start, end, chunk_text) tuples, where chunk_text is
        text[start:end] stripped of surrounding whitespace. Exact duplicate
        chunk texts within the document are dropped.
    """
    chunks = []
    n = len(text)
    start = 0
    prev_end = 0  # end of the previous window, whether it was kept or skipped

    while start < n:
        end = _find_natural_cut(text, start, max_chars, prefer_window)
        end = max(end, start + min_chars)  # enforce min size if boundary logic failed
        end = min(end, n)

        # Guarantee forward progress: first try the hard cap, and as a last
        # resort advance by a single character.
        if end <= prev_end:
            end = min(n, start + max_chars)
        if end <= prev_end:
            end = min(n, prev_end + 1)

        chunk = text[start:end].strip()
        if chunk:
            chunks.append((start, end, chunk))

        if end >= n:
            break

        prev_end = end
        # After a kept window, back up by the overlap. After a whitespace-only
        # window there is nothing worth repeating, so continue from its end.
        # (Previously this case indexed chunks[-1] on an empty list, or
        # re-processed the same window forever.)
        start = max(0, end - overlap) if chunk else end

    # Merge a too-small tail into the previous chunk. Re-slicing the original
    # text keeps the chunk consistent with its (start, end) offsets and avoids
    # duplicating the overlapping characters.
    if len(chunks) >= 2 and len(chunks[-1][2]) < min_chars:
        prev_start = chunks[-2][0]
        last_end = chunks[-1][1]
        chunks[-2:] = [(prev_start, last_end, text[prev_start:last_end].strip())]

    # De-duplicate (exact) within a doc
    seen = set()
    deduped = []
    for a, b, ch in chunks:
        h = _hash_text(ch)
        if h in seen:
            continue
        seen.add(h)
        deduped.append((a, b, ch))

    return deduped

def build_chunks():
    """Split every document in CORPUS_PATH and write the chunks to CHUNKS_PATH.

    Each output line is a JSON object carrying a fresh random chunk_id, the
    parent document's doc_id/url/title, the chunk's character offsets and its
    text. The number of chunks written is printed at the end.
    """
    n_chunks = 0
    with open(CORPUS_PATH, "r", encoding="utf-8") as inp, open(CHUNKS_PATH, "w", encoding="utf-8") as out:
        for raw_line in inp:
            doc = json.loads(raw_line)
            body = doc["text"]
            # Fields copied unchanged onto every chunk of this document.
            inherited = {
                "doc_id": doc["doc_id"],
                "url": doc.get("url", ""),
                "title": doc.get("title", ""),
            }

            for begin, finish, piece in split_into_chunks(body):
                record = {
                    "chunk_id": str(uuid.uuid4()),
                    **inherited,
                    "start_char": begin,
                    "end_char": finish,
                    "text": piece,
                }
                out.write(json.dumps(record, ensure_ascii=False) + "\n")
                n_chunks += 1

    print(f"Wrote {n_chunks} chunks -> {CHUNKS_PATH}")

if __name__ == "__main__":
    build_chunks()


Wrote 2445 chunks -> /content/rag-pgh/data/corpus/chunks.jsonl


In [None]:
import json, statistics, math
from pathlib import Path

# Summarize chunk lengths and show a few samples as a sanity check on the
# chunking step.
CHUNKS_PATH = Path("/content/rag-pgh/data/corpus/chunks.jsonl")

lengths = []
examples = []

with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        row = json.loads(line)
        text_len = len(row["text"])
        lengths.append(text_len)
        if len(examples) < 3:
            examples.append((row["title"], row["url"], text_len, row["text"][:300].replace("\n"," ")+"..."))

n = len(lengths)
print(f"Total chunks: {n}")
if lengths:
    ordered = sorted(lengths)
    # Clamp the index so a single-chunk file does not rely on index -1 wrapping.
    p95 = ordered[max(0, int(0.95 * n) - 1)]
    print("len(text): min/median/mean/p95/max =",
          ordered[0], statistics.median(lengths), round(statistics.mean(lengths),1),
          p95, ordered[-1])
else:
    # min()/max()/median() raise on an empty sequence; report instead of crashing.
    print("len(text): no chunks to summarize")
print("\nSamples:")
for t, u, sample_len, txt in examples:
    print("-", t, f"({sample_len} chars)")
    print("  ", u)
    print("  ", txt)


Total chunks: 2445
len(text): min/median/mean/p95/max = 201 1320 1193.5 1600 1872

Samples:
- Park Maintenance (1600 chars)
   https://www.pittsburghpa.gov/Recreation-Events/About-Parks/Park-Maintenance
   Park Maintenance The Department of Public Works is responsible for the daily maintenance and repair of Pittsburgh's park system. Around 100 laborers within our six Parks Divisions care for 163 public parks. We work in cooperation with CitiParks, the Department of Public Works' Forestry Division, and...
- Park Maintenance (1600 chars)
   https://www.pittsburghpa.gov/Recreation-Events/About-Parks/Park-Maintenance
   nd Division 151 Lake Drive Pittsburgh, PA 15206 412-665-3632 Tara Pinnix , Parks Maintenance Manager Jevon Broughton , Foreman Arsenal Park Chadwick Park Dinan Park Duncan Park East Liberty Blvd Strip Enright Park Fifty-Seventh Street Park Fort Pitt Park Friendship Park Garland Park Heth's Park High...
- Park Maintenance (1600 chars)
   https://www.pittsburghpa.gov/Recreati

In [None]:
import re, json
from pathlib import Path

# Input: all chunks. Output: chunks that survive the filters below.
IN_CHUNKS  = Path("/content/rag-pgh/data/corpus/chunks.jsonl")
OUT_CLEAN  = Path("/content/rag-pgh/data/corpus/chunks.clean.jsonl")

# URL fragments that mark a page to drop. They are joined into one
# case-insensitive alternation and matched with search(), so each fragment
# matches anywhere in the URL, including as a prefix of a longer path
# segment (e.g. "/account" also matches "/accounting").
DROP_URL_PATTERNS = (
    r"/login", r"/signin", r"/sign-in", r"/account", r"/cart", r"/checkout",
    r"/privacy", r"/terms", r"/cookies", r"/language", r"/accessibility"
)
drop_url_rx = re.compile("|".join(DROP_URL_PATTERNS), re.I)

# Whole words (case-insensitive) in a page title that mark a page to drop.
BAD_TITLE_WORDS = re.compile(r"\b(Login|Sign\s*In|Register|Cookie|Privacy|Terms)\b", re.I)

def is_mostly_numeric(text, threshold=0.35):
    """Return True when more than *threshold* of the characters are digits.

    Empty or missing text counts as numeric, so the caller drops it.
    """
    if not text:
        return True
    digit_count = len([ch for ch in text if ch.isdigit()])
    ratio = digit_count / max(1, len(text))
    return ratio > threshold

def low_alpha_ratio(text, threshold=0.55):
    """Return True when fewer than *threshold* of the characters are letters.

    An empty string has a ratio of 0 and therefore returns True.
    """
    letter_count = len([ch for ch in text if ch.isalpha()])
    ratio = letter_count / max(1, len(text))
    return ratio < threshold

def looks_like_nav(text):
    """Heuristic for navigation menus and link lists.

    Blank lines are ignored. Text with no non-blank lines is treated as
    navigation. Otherwise the text qualifies when it has at least 15 lines
    and at least 60% of them are 25 characters or shorter.
    """
    stripped = (raw.strip() for raw in text.splitlines())
    lines = [ln for ln in stripped if ln]
    if not lines:
        return True
    short_count = len([ln for ln in lines if len(ln) <= 25])
    if len(lines) < 15:
        return False
    return short_count / len(lines) >= 0.6

def _should_drop(row):
    """Decide whether a chunk row is filtered out of the cleaned corpus."""
    url = row.get("url") or ""
    title = row.get("title") or ""
    text = row.get("text") or ""

    # URL/title based filters
    if drop_url_rx.search(url):
        return True
    if BAD_TITLE_WORDS.search(title):
        return True

    # Content-based filters (conservative)
    if is_mostly_numeric(text):
        return True
    if low_alpha_ratio(text):
        return True
    return looks_like_nav(text)


# Stream the chunk file once, copying every row that passes the filters.
kept = 0
skipped = 0
with open(IN_CHUNKS, "r", encoding="utf-8") as inp, open(OUT_CLEAN, "w", encoding="utf-8") as out:
    for line in inp:
        row = json.loads(line)
        if _should_drop(row):
            skipped += 1
        else:
            out.write(json.dumps(row, ensure_ascii=False) + "\n")
            kept += 1

print(f"Kept {kept} chunks, skipped {skipped}.")


Kept 2072 chunks, skipped 373.


In [None]:
!pip -q install dateparser
import json, re, dateparser
from pathlib import Path

# Input: cleaned chunks. Output: the same rows with date annotations added.
IN_PATH  = Path("/content/rag-pgh/data/corpus/chunks.clean.jsonl")
OUT_PATH = Path("/content/rag-pgh/data/corpus/chunks.with_dates.jsonl")

# Inclusive window bounds as YYYY-MM-DD strings; in_window() compares them
# to other ISO date strings with plain string comparison.
DATE_WINDOW_START = "2025-10-09"
DATE_WINDOW_END   = "2025-12-31"

def to_iso(d):
    """Format a date/datetime-like object as ``YYYY-MM-DD``.

    Returns None when *d* cannot be formatted (for example None or a plain
    string). Only formatting errors are caught, so KeyboardInterrupt and
    SystemExit are no longer swallowed as they were by the bare ``except``.
    """
    try:
        return d.strftime("%Y-%m-%d")
    except (AttributeError, TypeError, ValueError):
        return None

def in_window(iso):
    """Return True when ISO date string *iso* lies inside the date window.

    The bounds DATE_WINDOW_START and DATE_WINDOW_END are inclusive and are
    compared as strings. A missing or empty *iso* yields False; previously
    the falsy input itself (None or "") was returned instead of a bool.
    """
    if not iso:
        return False
    return DATE_WINDOW_START <= iso <= DATE_WINDOW_END

# Case-insensitive pattern for date-like substrings. Three alternatives:
#   1. month name (full or abbreviated) + day + optional comma + 4-digit year
#   2. numeric d/m/y with a 2- to 4-digit year, e.g. 10/9/2025
#   3. any standalone 4-digit number
# All groups are non-capturing, so findall() returns the full matched text.
# NOTE(review): alternative 3 also matches 4-digit numbers that are not years
# (e.g. the last block of a phone number).
date_like_rx = re.compile(
    r"\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|"
    r"Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|"
    r"Dec(?:ember)?)\s+\d{1,2},?\s+\d{4}\b"
    r"|"
    r"\b\d{1,2}/\d{1,2}/\d{2,4}\b"
    r"|"
    r"\b\d{4}\b"
, re.I)

# Annotate every chunk with the full dates it mentions and flag chunks that
# mention a date inside the configured window.
kept, flagged = 0, 0
with open(IN_PATH, "r", encoding="utf-8") as inp, open(OUT_PATH, "w", encoding="utf-8") as out:
    for line in inp:
        row = json.loads(line)
        text = row["text"]
        iso_dates = set()
        # Parse each distinct hit once; the same date string often repeats.
        for h in set(date_like_rx.findall(text)):
            # A bare 4-digit hit carries no month or day. dateparser fills
            # the missing parts from the current date, so such hits produced
            # run-date-dependent dates and false window flags. Only hits that
            # spell out day, month and year are kept.
            if h.isdigit():
                continue
            dt = dateparser.parse(h, settings={"PREFER_DAY_OF_MONTH":"first"})
            iso = to_iso(dt) if dt else None
            if iso:
                iso_dates.add(iso)
        row["dates_iso"] = sorted(iso_dates)
        row["has_event_in_window"] = any(in_window(d) for d in row["dates_iso"])
        out.write(json.dumps(row, ensure_ascii=False) + "\n")
        kept += 1
        if row["has_event_in_window"]:
            flagged += 1

print(f"Tagged {kept} chunks; {flagged} with dates in {DATE_WINDOW_START}..{DATE_WINDOW_END}.")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/315.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hTagged 2072 chunks; 194 with dates in 2025-10-09..2025-12-31.


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')


In [None]:
from pathlib import Path
# Destination folder on the mounted Drive; created if it does not exist yet.
DRIVE_ROOT = Path('/content/drive/MyDrive/rag-pgh')
DRIVE_ROOT.mkdir(parents=True, exist_ok=True)
# Bare expression so the notebook displays the path as the cell result.
DRIVE_ROOT


In [None]:
!rsync -av --progress /content/rag-pgh/ /content/drive/MyDrive/rag-pgh/


RAG system

In [None]:
#!/usr/bin/env python3
"""
RAG (Retrieval Augmented Generation) System for Pittsburgh/CMU Q&A
"""

import json
import numpy as np
from pathlib import Path
from typing import List, Dict, Any, Tuple
import faiss
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class RAGSystem:
    """Retrieval pipeline over a JSONL corpus of text chunks.

    Three indices are built at construction time: a FAISS inner-product index
    over L2-normalized sentence embeddings ("dense"), a BM25 index ("sparse")
    and a TF-IDF matrix ("tfidf"). "hybrid" retrieval blends the dense and
    sparse scores. Answer generation is a placeholder; no LLM is called.
    """

    # Weights applied to the min-max normalized scores in hybrid retrieval.
    DENSE_WEIGHT = 0.6
    SPARSE_WEIGHT = 0.4

    def __init__(self, corpus_path: str, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize RAG system with document corpus

        Args:
            corpus_path: Path to JSONL file containing document chunks
            model_name: Name of sentence transformer model to use
        """
        self.corpus_path = Path(corpus_path)
        self.model_name = model_name
        self.chunks = []
        self.embeddings = None
        self.index = None
        self.bm25 = None
        self.tfidf = None
        self.tfidf_matrix = None

        # Load sentence transformer model
        print(f"Loading sentence transformer model: {model_name}")
        self.embedder = SentenceTransformer(model_name)

        # Load and process corpus
        self._load_corpus()
        self._build_indices()

    def _load_corpus(self):
        """Load document chunks from the JSONL file, ignoring blank lines."""
        print(f"Loading corpus from {self.corpus_path}")
        with open(self.corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A trailing or stray blank line is not a JSON document.
                    continue
                self.chunks.append(json.loads(line))
        print(f"Loaded {len(self.chunks)} document chunks")

    def _build_indices(self):
        """Build the dense (FAISS), sparse (BM25) and TF-IDF indices.

        Raises:
            ValueError: If no chunks were loaded.
        """
        if not self.chunks:
            raise ValueError(f"No chunks loaded from {self.corpus_path}; cannot build indices")

        print("Building search indices...")

        # Extract texts for indexing
        texts = [chunk['text'] for chunk in self.chunks]

        # 1. Build FAISS index for dense embeddings
        print("Building FAISS index...")
        embeddings = self.embedder.encode(texts, show_progress_bar=True)
        # FAISS operates on contiguous float32 arrays.
        self.embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
        faiss.normalize_L2(self.embeddings)  # Normalize for cosine similarity
        self.index.add(self.embeddings)

        # 2. Build BM25 index for sparse retrieval
        print("Building BM25 index...")
        tokenized_texts = [self._tokenize(text) for text in texts]
        self.bm25 = BM25Okapi(tokenized_texts)

        # 3. Build TF-IDF index
        print("Building TF-IDF index...")
        self.tfidf = TfidfVectorizer(
            max_features=10000,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=2
        )
        self.tfidf_matrix = self.tfidf.fit_transform(texts)

        print("Indices built successfully!")

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case *text* and split it into word tokens for BM25."""
        return re.findall(r'\b\w+\b', text.lower())

    def retrieve_documents(self, query: str, k: int = 5, method: str = "hybrid") -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query

        Args:
            query: Input question
            k: Number of documents to retrieve
            method: Retrieval method ("dense", "sparse", "tfidf", "hybrid")

        Returns:
            List of relevant document chunks with scores (empty when k <= 0)

        Raises:
            ValueError: If *method* is not one of the supported names.
        """
        retrievers = {
            "dense": self._dense_retrieval,
            "sparse": self._sparse_retrieval,
            "tfidf": self._tfidf_retrieval,
            "hybrid": self._hybrid_retrieval,
        }
        if method not in retrievers:
            raise ValueError(f"Unknown retrieval method: {method}")
        if k <= 0:
            # Nothing requested; also keeps non-positive k away from the
            # slicing and FAISS calls below.
            return []
        return retrievers[method](query, k)

    def _dense_retrieval(self, query: str, k: int) -> List[Dict[str, Any]]:
        """Dense retrieval using sentence transformers"""
        # Never ask FAISS for more neighbours than the index holds.
        k = min(k, self.index.ntotal)
        if k <= 0:
            return []

        query_embedding = np.ascontiguousarray(self.embedder.encode([query]), dtype=np.float32)
        faiss.normalize_L2(query_embedding)

        scores, indices = self.index.search(query_embedding, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads missing results with label -1. The old check
            # `idx < len(self.chunks)` accepted it and returned the last chunk.
            if 0 <= idx < len(self.chunks):
                chunk = self.chunks[int(idx)].copy()
                chunk['score'] = float(score)
                chunk['method'] = 'dense'
                results.append(chunk)

        return results

    def _sparse_retrieval(self, query: str, k: int) -> List[Dict[str, Any]]:
        """Sparse retrieval using BM25"""
        tokenized_query = self._tokenize(query)
        scores = self.bm25.get_scores(tokenized_query)

        # Get top-k indices
        top_indices = np.argsort(scores)[::-1][:k]

        results = []
        for idx in top_indices:
            if scores[idx] > 0:  # Only include documents with non-zero scores
                chunk = self.chunks[idx].copy()
                chunk['score'] = float(scores[idx])
                chunk['method'] = 'sparse'
                results.append(chunk)

        return results

    def _tfidf_retrieval(self, query: str, k: int) -> List[Dict[str, Any]]:
        """TF-IDF retrieval"""
        query_vector = self.tfidf.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        # Get top-k indices
        top_indices = np.argsort(similarities)[::-1][:k]

        results = []
        for idx in top_indices:
            if similarities[idx] > 0:
                chunk = self.chunks[idx].copy()
                chunk['score'] = float(similarities[idx])
                chunk['method'] = 'tfidf'
                results.append(chunk)

        return results

    @staticmethod
    def _min_max_normalize(results: List[Dict[str, Any]]) -> None:
        """Add 'normalized_score' in [0, 1] to each result, in place.

        When all scores are equal, every result gets 0.5.
        """
        if not results:
            return
        scores = [r['score'] for r in results]
        lo, hi = min(scores), max(scores)
        for r in results:
            r['normalized_score'] = (r['score'] - lo) / (hi - lo) if hi > lo else 0.5

    def _hybrid_retrieval(self, query: str, k: int) -> List[Dict[str, Any]]:
        """Hybrid retrieval combining dense and sparse methods.

        Each method contributes 2*k candidates. Scores are min-max normalized
        per method and summed per chunk_id with DENSE_WEIGHT / SPARSE_WEIGHT.
        A returned chunk keeps the 'score' and 'method' of the list it was
        first seen in; 'hybrid_score' holds the blended value.
        """
        # Get more results from each method
        dense_results = self._dense_retrieval(query, k * 2)
        sparse_results = self._sparse_retrieval(query, k * 2)

        # Normalize scores to [0, 1] range
        self._min_max_normalize(dense_results)
        self._min_max_normalize(sparse_results)

        # Combine results
        combined = {}
        weighted_lists = (
            (dense_results, self.DENSE_WEIGHT),    # Weight dense more
            (sparse_results, self.SPARSE_WEIGHT),  # Weight sparse less
        )
        for results, weight in weighted_lists:
            for r in results:
                chunk_id = r['chunk_id']
                contribution = r['normalized_score'] * weight
                if chunk_id not in combined:
                    combined[chunk_id] = r.copy()
                    combined[chunk_id]['hybrid_score'] = contribution
                else:
                    combined[chunk_id]['hybrid_score'] += contribution

        # Sort by hybrid score and return top-k
        sorted_results = sorted(combined.values(), key=lambda x: x['hybrid_score'], reverse=True)
        return sorted_results[:k]

    def _build_prompt(self, query: str, retrieved_docs: List[Dict[str, Any]],
                      max_context_length: int = 2000) -> str:
        """Assemble the LLM prompt from the query and as many docs as fit.

        Documents are added in order until the next one would push the
        context past *max_context_length* characters.
        """
        context_parts = []
        current_length = 0

        for doc in retrieved_docs:
            doc_text = f"Source: {doc.get('title', 'Unknown')}\n{doc['text']}\n\n"
            if current_length + len(doc_text) > max_context_length:
                break
            context_parts.append(doc_text)
            current_length += len(doc_text)

        context = "".join(context_parts)

        return f"""Based on the following context, answer the question: {query}

Context:
{context}

Answer:"""

    def generate_answer(self, query: str, retrieved_docs: List[Dict[str, Any]],
                       max_context_length: int = 2000) -> str:
        """
        Generate answer using retrieved documents as context

        Args:
            query: Input question
            retrieved_docs: Retrieved document chunks
            max_context_length: Maximum length of context to include

        Returns:
            Generated answer. This is a placeholder sentence naming the first
            three sources, or a "no documents" message when nothing was
            retrieved.
        """
        if not retrieved_docs:
            # Do not claim relevant information was found when nothing was retrieved.
            return "No relevant documents were retrieved for this question, so no answer can be generated."

        # In a real system this prompt would be sent to an LLM. The placeholder
        # response below does not consume it yet.
        prompt = self._build_prompt(query, retrieved_docs, max_context_length)

        sources = [doc.get('title', 'Unknown') for doc in retrieved_docs[:3]]
        return f"Based on the retrieved documents from {', '.join(sources)}, I found relevant information about your question. [This is a placeholder - in a real system, an LLM would generate the actual answer here.]"

    def answer_question(self, query: str, k: int = 5, method: str = "hybrid") -> Dict[str, Any]:
        """
        Complete RAG pipeline: retrieve documents and generate answer

        Args:
            query: Input question
            k: Number of documents to retrieve
            method: Retrieval method

        Returns:
            Dictionary containing answer and retrieved documents
        """
        # Retrieve relevant documents
        retrieved_docs = self.retrieve_documents(query, k, method)

        # Generate answer
        answer = self.generate_answer(query, retrieved_docs)

        return {
            'query': query,
            'answer': answer,
            'retrieved_documents': retrieved_docs,
            'num_retrieved': len(retrieved_docs)
        }


def main():
    """Demo: build a RAGSystem over the chunk corpus and answer sample questions."""
    # Build the system (loads the corpus and all indices)
    rag = RAGSystem("data/corpus/chunks.jsonl")

    # Sample questions to run through the pipeline
    questions = (
        "When was Carnegie Mellon University founded?",
        "What is the population of Pittsburgh?",
        "What are some popular events in Pittsburgh?",
        "What sports teams are in Pittsburgh?",
    )
    separator = "-" * 50

    # Print the answer and retrieval count for each question
    for question in questions:
        print(f"\nQuestion: {question}")
        outcome = rag.answer_question(question, k=3)
        print(f"Answer: {outcome['answer']}")
        print(f"Retrieved {outcome['num_retrieved']} documents")
        print(separator)


if __name__ == "__main__":
    main()
