In [1]:
%pip install -q pandas==2.2.2 tqdm==4.66.4 python-dateutil==2.9.0.post0


Note: you may need to restart the kernel to use updated packages.


In [4]:
import os, sqlite3, csv
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from dateutil import parser as dtparse


# Root folder that must contain the MINDsmall_train/ and MINDsmall_dev/ directories.
BASE_DIR = os.path.expanduser("~/Downloads/news_personalization/data")  # change if needed
# SQLite output file; the path is relative to the notebook's working directory.
DB_PATH = Path("data/news.db")


In [5]:
# start clean: remove the database AND any WAL/SHM sidecar files left behind by an
# interrupted run, so a stale write-ahead log is never paired with the new file
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
for _suffix in ("", "-wal", "-shm"):
    Path(f"{DB_PATH}{_suffix}").unlink(missing_ok=True)

# DDL script run by init_db(): two PRAGMAs followed by four tables
# (news, users, impressions, impression_items). impression_items is keyed by
# (impression_id, news_id) and references the impressions and news tables.
SCHEMA = """
PRAGMA journal_mode=WAL;
PRAGMA foreign_keys=ON;

CREATE TABLE IF NOT EXISTS news (
  news_id TEXT PRIMARY KEY,
  category TEXT,
  subcategory TEXT,
  title TEXT,
  abstract TEXT,
  url TEXT
);

CREATE TABLE IF NOT EXISTS users (
  user_id TEXT PRIMARY KEY
);

-- NOTE: impression_id is TEXT now (e.g., 'train_123456', 'dev_987654')
CREATE TABLE IF NOT EXISTS impressions (
  impression_id TEXT PRIMARY KEY,
  user_id TEXT REFERENCES users(user_id),
  time_utc TEXT
);

CREATE TABLE IF NOT EXISTS impression_items (
  impression_id TEXT REFERENCES impressions(impression_id),
  news_id TEXT REFERENCES news(news_id),
  label INTEGER,
  rank_in_impr INTEGER,
  PRIMARY KEY (impression_id, news_id)
);
"""

In [6]:
def init_db():
    """Open the SQLite file at DB_PATH, run the SCHEMA script on it, and return the open connection."""
    connection = sqlite3.connect(DB_PATH)
    connection.executescript(SCHEMA)
    return connection

def read_news(dir_path: Path) -> pd.DataFrame:
    """Load ``news.tsv`` from ``dir_path`` and return the six article columns.

    The file is read as header-less, tab-separated text with quoting disabled
    (``csv.QUOTE_NONE``), so quote characters stay literal. The two entity
    columns are parsed but not returned.
    """
    kept = ["news_id", "category", "subcategory", "title", "abstract", "url"]
    all_columns = kept + ["title_entities", "abstract_entities"]
    frame = pd.read_csv(
        dir_path / "news.tsv",
        sep="\t",
        header=None,
        names=all_columns,
        quoting=csv.QUOTE_NONE,
    )
    return frame[kept]

def read_behaviors(dir_path: Path) -> pd.DataFrame:
    """Load ``behaviors.tsv`` from ``dir_path`` as a header-less, tab-separated table.

    Columns are named impression_id, user_id, time, history, impressions.
    Parsing uses pandas' Python engine.
    """
    return pd.read_csv(
        dir_path / "behaviors.tsv",
        sep="\t",
        header=None,
        names=["impression_id", "user_id", "time", "history", "impressions"],
        engine="python",
    )

def read_behaviors_split(dir_path: Path, split: str) -> pd.DataFrame:
    """Read behaviors for one split and add split-aware identifiers.

    Adds a ``split`` column holding ``split`` and an ``impr_id`` column of the
    form ``"<split>_<impression_id>"`` so IDs from different splits cannot collide.
    """
    return read_behaviors(dir_path).assign(
        split=split,
        # evaluated after ``split`` is added, so the lambda can read that column
        impr_id=lambda d: d["split"] + "_" + d["impression_id"].astype(str),
    )

def parse_time_utc(s):
    """Parse ``s`` with dateutil and return its ISO-8601 string, or None if that fails.

    No timezone conversion is applied here; the parsed value is only reformatted.
    Any exception raised while parsing or formatting yields None.
    """
    try:
        parsed = dtparse.parse(s)
        iso_text = parsed.isoformat()
    except Exception:
        return None
    return iso_text

In [7]:
# validate paths: both MIND split folders must exist under BASE_DIR before anything is loaded
train_dir = Path(BASE_DIR)/"MINDsmall_train"
dev_dir   = Path(BASE_DIR)/"MINDsmall_dev"
# Fail early with a hint instead of a file-not-found error deep inside pandas.
assert train_dir.exists() and dev_dir.exists(), "Fix BASE_DIR: train/dev folders not found."


In [8]:
# init DB (creates the tables from SCHEMA and leaves `conn` open for the inserts below)
conn = init_db()

# load news (dedup across splits)
# train is concatenated first, so when an article appears in both splits the train row is kept
news = pd.concat([read_news(train_dir), read_news(dev_dir)], axis=0).drop_duplicates("news_id")
news.to_sql("news", conn, if_exists="append", index=False)

# load behaviors with split-aware IDs
# each frame gains `split` and `impr_id` columns; `beh` is the combined table used by the next cell
beh_train = read_behaviors_split(train_dir, "train")
beh_dev   = read_behaviors_split(dev_dir, "dev")
beh = pd.concat([beh_train, beh_dev], axis=0, ignore_index=True)


In [9]:
# users: one row per distinct user_id seen in either split
users = pd.DataFrame({"user_id": beh["user_id"].unique()})
users.to_sql("users", conn, if_exists="append", index=False)

# impressions (use impr_id as primary key)
impr = beh[["impr_id","user_id","time"]].copy()
impr["time_utc"] = impr["time"].apply(parse_time_utc)
impr = impr.drop(columns=["time"]).rename(columns={"impr_id":"impression_id"})
impr.to_sql("impressions", conn, if_exists="append", index=False)

# impression_items (explode with the split-aware impression_id)
# Each token looks like "<news_id>-<label>"; rank is the token's position in the impression.
rows = []
skipped_tokens = 0
for impr_id, imps in tqdm(beh[["impr_id","impressions"]].itertuples(index=False),
                          total=len(beh), desc="Exploding items"):
    if pd.isna(imps) or not str(imps).strip():
        continue
    for rank, token in enumerate(str(imps).split()):
        try:
            nid, lbl = token.rsplit("-", 1)
            rows.append((impr_id, nid, int(lbl), rank))
        except ValueError:
            # no "-" separator or a non-integer label: skip it, but keep count so
            # dropped data is visible instead of silently disappearing
            skipped_tokens += 1
items = pd.DataFrame(rows, columns=["impression_id","news_id","label","rank_in_impr"])
items.to_sql("impression_items", conn, if_exists="append", index=False)
if skipped_tokens:
    print(f"Skipped {skipped_tokens} malformed impression tokens")

conn.execute("ANALYZE;")
conn.close()
print(f"All done. SQLite DB at {DB_PATH.resolve()}")

Exploding items: 100%|███████████████| 230117/230117 [00:03<00:00, 76645.11it/s]


All done. SQLite DB at /Users/trisharaj/Downloads/news_personalization/src/data/news.db


In [10]:
# Sanity-check the database that was just built. Open it through DB_PATH (the same
# path init_db() writes to) instead of repeating the literal file name.
conn = sqlite3.connect(DB_PATH)
try:
    print("News:", conn.execute("select count(*) from news").fetchone()[0])
    print("Users:", conn.execute("select count(*) from users").fetchone()[0])
    print("Impressions:", conn.execute("select count(*) from impressions").fetchone()[0])
    print("Items:", conn.execute("select count(*) from impression_items").fetchone()[0])

    print("\nSample impressions:")
    for r in conn.execute("select impression_id, user_id, substr(time_utc,1,19) from impressions limit 5"):
        print(r)
finally:
    # release the handle even if one of the queries fails
    conn.close()


News: 65238
Users: 94057
Impressions: 230117
Items: 8584442

Sample impressions:
('train_1', 'U13740', '2019-11-11T09:05:58')
('train_2', 'U91836', '2019-11-12T18:11:30')
('train_3', 'U73700', '2019-11-14T07:01:48')
('train_4', 'U34670', '2019-11-11T05:28:05')
('train_5', 'U8125', '2019-11-12T16:11:21')


In [12]:
%pip install -q sentence-transformers==3.0.1 faiss-cpu==1.12.0 numpy==1.26.4



Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install -q sentence-transformers==3.0.1 faiss-cpu==1.12.0 numpy==1.26.4
import os, json, torch
from pathlib import Path
# Disable tokenizer-side parallelism and limit torch to a single thread.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(1)

# Reuse DB_PATH if an earlier cell already defined it in this kernel; otherwise
# fall back to ./data/news.db resolved against the current working directory.
from pathlib import Path
try:
    DB_PATH
except NameError:
    DB_PATH = Path("data/news.db").resolve()
print("DB:", DB_PATH, "exists:", DB_PATH.exists())

# Folder for every generated artifact (embeddings, metadata CSV, index info).
ART_DIR = Path("artifacts"); ART_DIR.mkdir(exist_ok=True)


Note: you may need to restart the kernel to use updated packages.
DB: /Users/trisharaj/Downloads/news_personalization/src/data/news.db exists: True


In [8]:
import sqlite3, pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer

# Clean any half-written artifacts
for fn in ["news_embs.dat","news_meta.csv","index_info.json","news.faiss"]:
    p = ART_DIR / fn
    if p.exists(): p.unlink()

# 1) Load texts (NULL columns become empty strings in SQL)
conn = sqlite3.connect(str(DB_PATH))
df = pd.read_sql_query("""
  SELECT news_id, COALESCE(title,'') AS title, COALESCE(abstract,'') AS abstract,
         COALESCE(url,'') AS url, COALESCE(category,'') AS category,
         COALESCE(subcategory,'') AS subcategory
  FROM news
""", conn)
conn.close()
df["text"] = (df["title"].str.strip() + " [SEP] " + df["abstract"].str.strip()).str.strip()
# rows whose title and abstract are both empty reduce to the bare separator; drop them
df = df[df["text"].ne("[SEP]")].reset_index(drop=True)

# --- Optional cap for quick trial runs ---
LIMIT = None     # None = embed every article; set e.g. 5000 to build a small subset first
if LIMIT: df = df.head(LIMIT).copy()

# 2) Embed to a memory-mapped file (won't blow RAM)
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME, device="cpu")
dim = model.get_sentence_embedding_dimension()

n = len(df)
emb_path = ART_DIR / "news_embs.dat"
embs_mm = np.memmap(emb_path, dtype="float32", mode="w+", shape=(n, dim))

batch_size = 32   # small & safe
for start in range(0, n, batch_size):
    texts = df["text"].iloc[start:start+batch_size].tolist()
    # pass the same batch_size to encode() so changing the constant above affects
    # both the outer chunking and the model's internal batching
    embs = model.encode(
        texts, batch_size=batch_size, convert_to_numpy=True,
        normalize_embeddings=True, show_progress_bar=False
    ).astype("float32", copy=False)
    embs_mm[start:start+len(embs)] = embs
embs_mm.flush()

# 3) Save metadata + info (even for the subset)
# Row i of news_meta.csv corresponds to row i of news_embs.dat.
meta_cols = ["news_id","title","abstract","url","category","subcategory"]
df[meta_cols].to_csv(ART_DIR / "news_meta.csv", index=False)
with open(ART_DIR / "index_info.json","w") as f:
    json.dump({"model": MODEL_NAME, "dim": dim, "items": int(n), "faiss": False}, f, indent=2)

print(f"Subset embeddings written. Items: {n}, dim: {dim}")


Subset embeddings written. Items: 65238, dim: 384


In [6]:
import json, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer

# Load the index description, article metadata and the embedding matrix from ART_DIR.
# Use a context manager so the JSON file handle is closed deterministically.
with open(ART_DIR/"index_info.json") as f:
    info = json.load(f)
meta = pd.read_csv(ART_DIR/"news_meta.csv")
embs = np.memmap(ART_DIR/"news_embs.dat", dtype="float32", mode="r",
                 shape=(info["items"], info["dim"]))

# Sentence encoder used to embed query text (CPU).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

def search_numpy(q, k=5):
    """Return the ``k`` best matches for query ``q`` as ``(score, row_index)`` pairs.

    Scores are dot products between the normalized query embedding and the rows
    of ``embs``; results are sorted by descending score. ``k`` is clamped to the
    number of rows, so asking for more results than exist returns all of them
    instead of raising inside ``np.argpartition``; ``k <= 0`` returns ``[]``.
    """
    qv = model.encode([q], convert_to_numpy=True, normalize_embeddings=True)[0].astype("float32")
    scores = embs @ qv  # cosine (vectors are normalized)
    k = max(0, min(int(k), len(scores)))
    if k < len(scores):
        # kth=k is only a valid partition index while k < len(scores)
        idx = np.argpartition(-scores, k)[:k]
    else:
        idx = np.arange(len(scores))
    idx = idx[np.argsort(-scores[idx])]
    return [(float(scores[i]), int(i)) for i in idx]

def show(q, k=5):
    """Print the top-``k`` matches for ``q``: score, title and category, then the URL."""
    print(f"\nQuery: {q}")
    for score, row_idx in search_numpy(q, k):
        article = meta.iloc[row_idx]
        print(f"{score:.3f} • {article.title} [{article.category}]")
        print(article.url, "\n")

# Smoke-test the query-only search with two sample queries.
show("AI safety and regulation news", k=5)
show("Premier League transfer updates", k=3)



Query: AI safety and regulation news
0.389 • App developed in Indiana could be gamechanger for personal and school safety [news]
https://assets.msn.com/labs/mind/AAJrIub.html 

0.375 • Safest New Vehicles in America [autos]
https://assets.msn.com/labs/mind/AAGt1jE.html 

0.354 • America's Unsafe Medical Products [health]
https://assets.msn.com/labs/mind/AAIiqOm.html 

0.349 • Latest Automotive Safety Recalls [autos]
https://assets.msn.com/labs/mind/AAFfB8g.html 

0.345 • AP FACT CHECK: Trump team distortions on fuel economy rules [news]
https://assets.msn.com/labs/mind/AAHy645.html 


Query: Premier League transfer updates
0.408 • Philadelphia Union transfer rumor tracker [sports]
https://assets.msn.com/labs/mind/AADapnX.html 

0.371 • Kings Rumors: Quick's future "up in the air," Kovalchuk trade-bait? [sports]
https://assets.msn.com/labs/mind/AAJ4Ptv.html 

0.344 • Women's Champions League Roundup: Hegerberg Equals Record, City Held & Arsenal Net Five [sports]
https://assets.msn.com/

  scores = embs @ qv  # cosine (vectors are normalized)
  scores = embs @ qv  # cosine (vectors are normalized)
  scores = embs @ qv  # cosine (vectors are normalized)


In [9]:
%pip install -q nltk==3.9.1 scikit-learn==1.5.1
import nltk; nltk.download("punkt", quiet=True)


Note: you may need to restart the kernel to use updated packages.


True

In [10]:
from pathlib import Path
import json, numpy as np, pandas as pd

ART_DIR = Path("artifacts")
meta = pd.read_csv(ART_DIR/"news_meta.csv")
# context manager so the JSON file handle is closed right after reading
with open(ART_DIR/"index_info.json") as f:
    info = json.load(f)
dim  = int(info["dim"])
n    = len(meta)

# Load NumPy embeddings (always available from Step 3 safe-mode)
# Shape is derived from the metadata row count, so row i of meta matches row i of embs.
embs = np.memmap(ART_DIR/"news_embs.dat", dtype="float32", mode="r", shape=(n, dim))

# Optional FAISS: used only when the index file exists AND index_info.json says it was built
_use_faiss = (ART_DIR/"news.faiss").exists() and info.get("faiss", False)
if _use_faiss:
    import faiss
    _faiss_index = faiss.read_index(str(ART_DIR/"news.faiss"))

from sentence_transformers import SentenceTransformer
_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

def retrieve(query: str, k: int = 8) -> pd.DataFrame:
    """Return the top-k articles for ``query`` with their similarity scores.

    The query is embedded with ``_model`` and scored either through the FAISS
    index (when ``_use_faiss`` is set) or by a dot product against ``embs``.
    ``k`` is clamped to the number of articles so oversized requests return
    everything instead of raising; ``k <= 0`` yields an empty frame.

    Returns a DataFrame with columns score, news_id, title, abstract, url,
    category, subcategory, ordered by descending score.
    """
    qv = _model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0].astype("float32")
    k = max(0, min(int(k), len(meta)))
    if k == 0:
        idxs = np.array([], dtype=int)
        scores = np.array([], dtype="float32")
    elif _use_faiss:
        scores, idxs = _faiss_index.search(qv.reshape(1,-1), k)
        idxs, scores = idxs[0], scores[0]
        # FAISS pads missing results with -1; iloc[-1] would silently return the last row
        found = idxs >= 0
        idxs, scores = idxs[found], scores[found]
    else:
        scores = embs @ qv
        if k < len(scores):
            # kth=k is only valid while k < len(scores)
            idxs = np.argpartition(-scores, k)[:k]
        else:
            idxs = np.arange(len(scores))
        idxs = idxs[np.argsort(-scores[idxs])]
        scores = scores[idxs]
    out = meta.iloc[idxs].copy()
    out["score"] = [float(s) for s in scores]
    return out[["score","news_id","title","abstract","url","category","subcategory"]]


In [12]:
import nltk
# Fetch both Punkt resources for sentence tokenization.
# The value echoed below the cell is the return value of the last download() call.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)


True

In [13]:
import re, nltk
try:
    # nltk.download() signals failure through its return value rather than by raising,
    # so a missing resource only surfaces when the tokenizer is actually called.
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)
    from nltk.tokenize import sent_tokenize as _sent_tokenize
    # Probe once so missing tokenizer data triggers the fallback below
    # instead of failing later in the middle of a query.
    _sent_tokenize("Probe sentence one. Probe sentence two.")
except Exception:
    # simple regex fallback if NLTK data can't be fetched
    def _sent_tokenize(text: str):
        """Split on whitespace following ., ! or ? and keep pieces of at least three words."""
        return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if len(s.split()) >= 3]


In [14]:
import re, math, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# Keep the tokenizer chosen by the tokenizer-setup cell (NLTK or its regex fallback);
# only bind NLTK's tokenizer directly when that cell has not been run in this kernel.
try:
    _sent_tokenize
except NameError:
    _sent_tokenize = nltk.sent_tokenize

def _clean(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()

def _mmr(query_vec, sent_mat, lambda_=0.7, k=6):
    """Maximal Marginal Relevance: pick up to ``k`` diverse, relevant sentences.

    Args:
        query_vec: 1-D query vector in the same feature space as ``sent_mat``.
        sent_mat: matrix with one row per candidate sentence.
        lambda_: weight on relevance to the query; ``1 - lambda_`` weights the
            penalty for similarity to sentences that were already selected.
        k: maximum number of sentences to return.

    Returns:
        Row indices into ``sent_mat`` in selection order. Empty when there are
        no sentences or ``k <= 0``.
    """
    n_sents = sent_mat.shape[0]
    # Guard first: cosine_similarity cannot be called on a matrix with zero rows.
    if n_sents == 0 or k <= 0:
        return []
    sim_q = cosine_similarity(sent_mat, query_vec.reshape(1,-1)).ravel()
    # Sentence-to-sentence similarities computed once instead of once per pair per round.
    pair_sim = cosine_similarity(sent_mat)

    selected = [int(sim_q.argmax())]
    candidates = [c for c in range(n_sents) if c != selected[0]]
    while len(selected) < min(k, n_sents) and candidates:
        redundancy = pair_sim[np.ix_(candidates, selected)].max(axis=1)
        scores = lambda_ * sim_q[candidates] - (1 - lambda_) * redundancy
        # max over (score, index) pairs: best score wins, ties go to the larger index
        cidx = max(zip(scores.tolist(), candidates))[1]
        selected.append(cidx)
        candidates.remove(cidx)
    return selected

def rag_answer(query: str, k_docs: int = 8, max_sents: int = 6):
    """Build an extractive answer for ``query`` from the top retrieved articles.

    Retrieves ``k_docs`` articles, splits title + abstract into sentences,
    ranks them in a TF-IDF space with MMR, and joins up to ``max_sents`` of them.

    Returns:
        ``(answer, cited, hits)`` in every case: the answer text, a list of
        ``(title, url)`` pairs for the sources used (unique by URL, in
        selection order), and the retrieval DataFrame.
    """
    hits = retrieve(query, k=k_docs)

    # Build candidate sentences from titles + abstracts
    docs = []
    for _, r in hits.iterrows():
        # Empty CSV fields are read back as NaN (a float); treat any non-string as
        # empty so the literal text "nan" never becomes a candidate sentence.
        title = r["title"] if isinstance(r["title"], str) else ""
        abstract = r["abstract"] if isinstance(r["abstract"], str) else ""
        text = _clean(f"{title}. {abstract}")
        sents = [s for s in _sent_tokenize(text) if len(s.split()) >= 6]
        if not sents:
            # fallback: treat title as a sentence (skipped when the title is empty too)
            fallback = _clean(title)
            sents = [fallback] if fallback else []
        docs.append({"url": r["url"], "title": title, "sents": sents})

    # Flatten sentences with their source index
    all_sents, src_idx = [], []
    for i, d in enumerate(docs):
        for s in d["sents"]:
            all_sents.append(s)
            src_idx.append(i)

    if not all_sents:
        # same 3-tuple shape as the normal return so callers can always unpack three values
        return "No supporting sentences found.", [], hits

    # TF-IDF space for query + sentences
    vec = TfidfVectorizer(stop_words="english", max_features=12000)
    try:
        mat = vec.fit_transform([query] + all_sents)
    except ValueError:
        # Empty vocabulary (e.g. only stop words): nothing to rank on,
        # so keep the first sentences in retrieval order.
        keep = list(range(min(max_sents, len(all_sents))))
    else:
        query_vec, sent_mat = mat[0:1], mat[1:]
        # Pick sentences with MMR (diverse + on-topic)
        keep = _mmr(query_vec.toarray()[0], sent_mat.toarray(), lambda_=0.72, k=max_sents)

    # Compose answer paragraph
    chosen = [all_sents[i] for i in keep]
    answer = " ".join(chosen)

    # Build compact citations (unique by source in selection order)
    cited = []
    used = set()
    for i in keep:
        src = docs[src_idx[i]]
        key = src["url"]
        if key and key not in used:
            used.add(key)
            cited.append((src["title"], src["url"]))

    return answer, cited, hits

# --- demo ---
# Run two sample queries and print the composed answer followed by its numbered sources.
for q in ["AI safety and regulation news", "Premier League transfer updates"]:
    ans, cites, hits = rag_answer(q, k_docs=8, max_sents=5)
    print("\n=== Query:", q, "===")
    print(ans or "(no answer)")
    print("\nSources:")
    for i,(t,u) in enumerate(cites, 1):
        print(f"[{i}] {t}\n    {u}")



=== Query: AI safety and regulation news ===
The FDA issues safety alerts to provide timely new safety information on human drugs, medical devices, vaccines and other products. These Researchers Are Using AI Drones to More Safely Track Wildlife. House leaders demand FAA answer why it overruled its own engineers' safety concerns about Boeing 737 Max. OpenAI has published the text-generating AI it said was too dangerous to share. Pentagon's draft AI ethics guidelines fight bias and rogue machines.

Sources:
[1] America's Unsafe Drugs and Medical Products
    https://assets.msn.com/labs/mind/AAJX7PM.html
[2] These Researchers Are Using AI Drones to More Safely Track Wildlife
    https://assets.msn.com/labs/mind/AAJGxIV.html
[3] House leaders demand FAA answer why it overruled its own engineers' safety concerns about Boeing 737 Max
    https://assets.msn.com/labs/mind/BBWram0.html
[4] OpenAI has published the text-generating AI it said was too dangerous to share
    https://assets.msn.com

  scores = embs @ qv
  scores = embs @ qv
  scores = embs @ qv
  # that has no feature names.
  ret = np.dot(a, b)
  ret = np.dot(a, b)
  ret = np.dot(a, b)
  scores = embs @ qv
  scores = embs @ qv
  scores = embs @ qv
  # that has no feature names.
  ret = np.dot(a, b)
  ret = np.dot(a, b)
  ret = np.dot(a, b)


In [15]:
import sqlite3, json
from pathlib import Path
import pandas as pd, numpy as np

ART_DIR = Path("artifacts")
meta = pd.read_csv(ART_DIR/"news_meta.csv")
with open(ART_DIR/"index_info.json") as f: info = json.load(f)
dim = int(info["dim"])

# map news_id -> row in embedding matrix
# (row order of news_meta.csv is used as the row order of the embedding file)
id2row = {nid: i for i, nid in enumerate(meta.news_id)}

# open embeddings (works for subset or full)
embs = np.memmap(ART_DIR/"news_embs.dat", dtype="float32", mode="r",
                 shape=(len(meta), dim))

# pull train clicks
# NOTE: in SQL LIKE, "_" matches any single character, so 'train_%' is slightly
# looser than a literal "train_" prefix; it is sufficient for the train_/dev_ IDs used here.
conn = sqlite3.connect(str(DB_PATH))
df = pd.read_sql_query("""
SELECT im.user_id, im.impression_id, it.news_id, it.label
FROM impression_items it
JOIN impressions im ON im.impression_id = it.impression_id
WHERE im.impression_id LIKE 'train_%'
""", conn)
conn.close()

# keep only items we have embeddings for
df = df[df.news_id.isin(id2row)]

# clicked lists per user (label == 1 marks a click); a Series indexed by user_id
clicked = df[df.label == 1].groupby("user_id")["news_id"].apply(list)

def make_user_vec(news_ids):
    """Average the embeddings of the given articles into one normalized float32 vector.

    IDs that are missing from ``id2row`` are ignored. Returns None when none of
    the IDs are known. A tiny epsilon is added to the norm before dividing.
    """
    known_rows = []
    for news_id in news_ids:
        if news_id in id2row:
            known_rows.append(id2row[news_id])
    if len(known_rows) == 0:
        return None
    centroid = np.mean(embs[known_rows], axis=0)
    length = np.linalg.norm(centroid) + 1e-12
    return (centroid / length).astype("float32")

# Only users with at least this many clicked train articles get a profile.
min_clicks = 3
# Per-user click counts; read again later when a sample user is printed.
counts = clicked.apply(len)
# user_id -> profile vector built from that user's clicked articles
profiles = {u: make_user_vec(nids)
            for u, nids in clicked.items()
            if len(nids) >= min_clicks}

# drop empty (make_user_vec returns None when no clicked article has an embedding row)
profiles = {u:v for u,v in profiles.items() if v is not None}
print("User profiles built:", len(profiles))


User profiles built: 27678


In [18]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence encoder used by search_personalized to embed the query text (CPU).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

def _clean_vec(v):
    v = np.nan_to_num(v, nan=0.0, posinf=0.0, neginf=0.0).astype("float32", copy=False)
    n = np.linalg.norm(v)
    return v if n == 0 else (v / n)

def search_personalized(user_id: str, query: str, k: int = 5, alpha: float = 0.6):
    """Rank articles by a blend of query similarity and the user's profile.

    Args:
        user_id: key into ``profiles``; an unknown user falls back to query-only scores.
        query: free-text query, embedded with ``model``.
        k: number of results; clamped to the number of articles so oversized
            requests return everything instead of raising in ``np.argpartition``.
        alpha: weight of the query score; ``1 - alpha`` weights the profile score.

    Returns:
        DataFrame with columns score, news_id, title, url, category, sorted by
        descending score.
    """
    qv = _clean_vec(model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0])
    uv = profiles.get(user_id)
    if uv is not None:
        uv = _clean_vec(uv)

    # query-only fallback if no profile
    base = embs @ qv
    if uv is None:
        scores = base
    else:
        # nan_to_num materializes a cleaned copy of the whole embedding matrix
        scores = alpha * base + (1 - alpha) * (np.nan_to_num(embs, nan=0.0) @ uv)

    k = max(0, min(int(k), len(scores)))
    if k < len(scores):
        # kth=k is only a valid partition index while k < len(scores)
        idx = np.argpartition(-scores, k)[:k]
    else:
        idx = np.arange(len(scores))
    idx = idx[np.argsort(-scores[idx])]
    out = meta.iloc[idx].copy()
    out["score"] = [float(scores[i]) for i in idx]
    return out[["score", "news_id", "title", "url", "category"]]


In [19]:
import numpy as np

# Hit@k over dev impressions, with NaN/inf-sanitized vectors.
# NOTE: `groups` (dev rows grouped by impression_id) is created in the dev-evaluation
# cell further down; on a fresh kernel that cell has to run before this one.
hits, total = 0, 0
k = 10
for impr_id, g in groups:
    user = g.user_id.iloc[0]
    uv = profiles.get(user)
    if uv is None: 
        continue
    # sanitize vectors
    uv = _clean_vec(uv)
    rows = np.array([id2row[nid] for nid in g.news_id])
    cand_embs = np.nan_to_num(embs[rows], nan=0.0, posinf=0.0, neginf=0.0)

    scores = cand_embs @ uv
    clicked_idx = np.where(g.label.values == 1)[0]
    if len(clicked_idx) == 0:
        continue
    # kth is capped at len-1 so impressions with fewer than k candidates keep all of them
    topk = np.argpartition(-scores, min(k, len(scores)-1))[:k]
    total += 1
    hits += int(any(ci in topk for ci in clicked_idx))

# Guard the ratio: with no scorable impression the division would raise ZeroDivisionError.
if total:
    print(f"Personalized Hit@{k}: {hits}/{total} = {hits/total:.3f}")
else:
    print(f"Personalized Hit@{k}: no scorable impressions")


  scores = cand_embs @ uv
  scores = cand_embs @ uv
  scores = cand_embs @ uv


Personalized Hit@10: 4593/6622 = 0.694


In [20]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used by search_personalized to embed the query text (CPU).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

def search_personalized(user_id: str, query: str, k: int = 5, alpha: float = 0.6):
    """Rank articles by query similarity blended with the user's profile.

    alpha balances query vs user taste (0..1): the final score is
    ``alpha * query_score + (1 - alpha) * profile_score``. Users without an
    entry in ``profiles`` get query-only scores. ``k`` is clamped to the number
    of articles so oversized requests return everything instead of raising.

    Returns a DataFrame with columns score, news_id, title, url, category,
    sorted by descending score.
    """
    qv = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0].astype("float32")
    uv = profiles.get(user_id)

    # query-only fallback if we don't have a profile for this user
    if uv is None:
        scores = embs @ qv
    else:
        scores = alpha * (embs @ qv) + (1 - alpha) * (embs @ uv)

    k = max(0, min(int(k), len(scores)))
    if k < len(scores):
        # kth=k is only a valid partition index while k < len(scores)
        idx = np.argpartition(-scores, k)[:k]
    else:
        idx = np.arange(len(scores))
    idx = idx[np.argsort(-scores[idx])]
    out = meta.iloc[idx].copy()
    out["score"] = [float(scores[i]) for i in idx]
    return out[["score", "news_id", "title", "url", "category"]]

# try a sample user who has a profile (first key of `profiles`)
sample_user = next(iter(profiles.keys()))
print("Sample user with profile:", sample_user, " (clicks ≈", counts.get(sample_user, "?"), ")")

# Same query twice: an unknown user id forces the query-only path, then the
# sample user's profile is blended in with alpha=0.6.
print("\n— Query-only —")
display(search_personalized(user_id="__no_profile__", query="AI safety and regulation news", k=5, alpha=1.0))
print("\n— Personalized (α=0.6) —")
display(search_personalized(user_id=sample_user, query="AI safety and regulation news", k=5, alpha=0.6))


Sample user with profile: U1000  (clicks ≈ 4 )

— Query-only —


  scores = embs @ qv
  scores = embs @ qv
  scores = embs @ qv


Unnamed: 0,score,news_id,title,url,category
57711,0.52928,N17592,Activists warn UN about dangers of using AI to...,https://assets.msn.com/labs/mind/AAJbn6h.html,news
33038,0.519284,N4157,OpenAI has published the text-generating AI it...,https://assets.msn.com/labs/mind/BBWpLyz.html,news
8696,0.48009,N9822,America's Unsafe Drugs and Medical Products,https://assets.msn.com/labs/mind/AAJX7PM.html,health
53040,0.479708,N13148,Pentagon's draft AI ethics guidelines fight bi...,https://assets.msn.com/labs/mind/AAJKAXe.html,news
15444,0.4393,N23750,Boeing pilots' messages on 737 MAX safety rais...,https://assets.msn.com/labs/mind/AAIZWlG.html,video



— Personalized (α=0.6) —


  scores = alpha * (embs @ qv) + (1 - alpha) * (embs @ uv)
  scores = alpha * (embs @ qv) + (1 - alpha) * (embs @ uv)
  scores = alpha * (embs @ qv) + (1 - alpha) * (embs @ uv)


Unnamed: 0,score,news_id,title,url,category
33595,0.390778,N53875,"California crisis of fires, blackouts decades ...",https://assets.msn.com/labs/mind/BBWJYlb.html,news
26115,0.378203,N35694,A day in the life of Indianapolis: 24 hours th...,https://assets.msn.com/labs/mind/AAJdwR0.html,travel
36868,0.36059,N61296,"California's wildfire, blackout crisis: Who's ...",https://assets.msn.com/labs/mind/BBWKRmS.html,news
15444,0.349237,N23750,Boeing pilots' messages on 737 MAX safety rais...,https://assets.msn.com/labs/mind/AAIZWlG.html,video
8696,0.348501,N9822,America's Unsafe Drugs and Medical Products,https://assets.msn.com/labs/mind/AAJX7PM.html,health


In [21]:
import sqlite3, numpy as np, pandas as pd

# load dev candidates
conn = sqlite3.connect(str(DB_PATH))
dev = pd.read_sql_query("""
SELECT im.user_id, im.impression_id, it.news_id, it.label
FROM impression_items it
JOIN impressions im ON im.impression_id = it.impression_id
WHERE im.impression_id LIKE 'dev_%'
""", conn)
conn.close()

# filter to users we have profiles for and news with embeddings
dev = dev[dev.user_id.isin(profiles.keys())]
dev = dev[dev.news_id.isin(id2row)]

# group by impression
groups = dev.groupby("impression_id")

# Hit@k: an impression counts as a hit when at least one clicked item is among the
# k candidates that score highest against the user's profile vector.
hits, total = 0, 0
k = 10
for impr_id, g in groups:
    user = g.user_id.iloc[0]
    uv = profiles.get(user)
    if uv is None: 
        continue

    rows = np.array([id2row[nid] for nid in g.news_id])
    cand_embs = embs[rows]
    scores = cand_embs @ uv  # cosine vs user profile
    # find clicked items (label==1) if any
    clicked_idx = np.where(g.label.values == 1)[0]
    if len(clicked_idx) == 0:
        continue

    # kth is capped at len-1 so impressions with fewer than k candidates keep all of them
    topk = np.argpartition(-scores, min(k, len(scores)-1))[:k]
    total += 1
    hits += int(any(ci in topk for ci in clicked_idx))

# Guard the ratio: with no scorable impression the division would raise ZeroDivisionError.
if total:
    print(f"Personalized Hit@{k}: {hits}/{total} = {hits/total:.3f} (only dev impressions with known user & embedded items)")
else:
    print(f"Personalized Hit@{k}: no scorable impressions (only dev impressions with known user & embedded items)")


  scores = cand_embs @ uv  # cosine vs user profile
  scores = cand_embs @ uv  # cosine vs user profile
  scores = cand_embs @ uv  # cosine vs user profile


Personalized Hit@10: 4593/6622 = 0.694 (only dev impressions with known user & embedded items)


In [22]:
import sqlite3, pandas as pd
from pathlib import Path

ART_DIR = Path("artifacts")
# Self-assignment: has no effect when DB_PATH exists and raises NameError when it does not.
DB_PATH = DB_PATH  # from earlier cells

# Latest impression timestamp per article, taken over every impression that listed it.
conn = sqlite3.connect(str(DB_PATH))
recency = pd.read_sql_query("""
SELECT it.news_id, MAX(im.time_utc) AS last_seen_utc
FROM impression_items it
JOIN impressions im ON im.impression_id = it.impression_id
GROUP BY it.news_id
""", conn)
conn.close()

# Load meta (from Step 3)
# Left merge keeps every article; articles with no row in `recency` get NaT in last_seen_utc.
meta = pd.read_csv(ART_DIR / "news_meta.csv")
meta = meta.merge(recency, on="news_id", how="left")
meta["last_seen_utc"] = pd.to_datetime(meta["last_seen_utc"])
meta.head(2)


Unnamed: 0,news_id,title,abstract,url,category,subcategory,last_seen_utc
0,N55528,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,lifestyle,lifestyleroyals,NaT
1,N19639,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,health,weightloss,NaT


In [23]:
import numpy as np, pandas as pd, json, re
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
from datetime import datetime, timedelta, timezone

# Load embeddings + model (from Step 3 safe-mode)
# context manager so the JSON file handle is closed right after reading
with open(ART_DIR / "index_info.json") as f:
    info = json.load(f)
dim  = int(info["dim"])
# One embedding row per row of `meta` (shape is derived from len(meta)).
embs = np.memmap(ART_DIR / "news_embs.dat", dtype="float32", mode="r",
                 shape=(len(meta), dim))

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

# user profiles dict "profiles" is created in Step 5; if missing, use empty
try:
    profiles
except NameError:
    profiles = {}

def _clean_vec(v):
    v = np.nan_to_num(v, nan=0.0, posinf=0.0, neginf=0.0).astype("float32", copy=False)
    n = np.linalg.norm(v)
    return v if n == 0 else (v / n)

def retrieve_personalized(query:str, user_id:str|None, k:int=8, alpha:float=0.6,
                          last_days:int=7):
    """Retrieve the top-k articles for ``query``, optionally personalized and recency-filtered.

    Args:
        query: free-text query, embedded with ``model``.
        user_id: key into ``profiles``; None or an unknown id gives query-only scores.
        k: number of results; clamped to the size of the candidate pool.
        alpha: weight of the query score; ``1 - alpha`` weights the profile score.
        last_days: keep only articles whose ``last_seen_utc`` falls within this
            many days of now (UTC). When no article qualifies, all articles are used.

    Returns:
        DataFrame with columns score, news_id, title, abstract, url, category,
        last_seen_utc, sorted by descending score.
    """
    # recency filter (Timestamp.now("UTC") replaces the deprecated Timestamp.utcnow();
    # tz_localize(None) drops the tz so the cutoff compares with the naive column)
    cutoff = pd.Timestamp.now("UTC").tz_localize(None) - pd.Timedelta(days=last_days)
    recent = (meta["last_seen_utc"].fillna(pd.Timestamp("1970-01-01")) >= cutoff).to_numpy()
    # Work with row POSITIONS: scores and .iloc are positional, so this stays
    # correct even if meta's index is not a plain 0..n-1 range.
    idx_pool = np.flatnonzero(recent)
    if idx_pool.size == 0:
        idx_pool = np.arange(len(meta))  # fallback: nothing recent enough, use everything

    qv = _clean_vec(model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0])
    base_scores = (embs @ qv)

    uv = None if not user_id else profiles.get(user_id)
    if uv is not None:
        uv = _clean_vec(uv)
        pers_scores = (embs @ uv)
        scores = alpha * base_scores + (1 - alpha) * pers_scores
    else:
        scores = base_scores

    pool_scores = scores[idx_pool]
    k = max(0, min(int(k), len(pool_scores)))
    if k < len(pool_scores):
        # kth=k is only a valid partition index while k < len(pool_scores)
        local = np.argpartition(-pool_scores, k)[:k]
    else:
        local = np.arange(len(pool_scores))
    top_idx = idx_pool[local]
    top_idx = top_idx[np.argsort(-scores[top_idx])]
    out = meta.iloc[top_idx].copy()
    out["score"] = [float(scores[i]) for i in top_idx]
    return out[["score","news_id","title","abstract","url","category","last_seen_utc"]]

# one-line, query-focused summary
def one_line_summary(query:str, title:str, abstract:str, max_chars:int=180):
    """Pick the sentence from title + abstract that best matches ``query``.

    Sentences are compared to the query by cosine similarity in a TF-IDF space
    fitted on the query and the sentences. The result is truncated to
    ``max_chars`` characters with a trailing ellipsis. Falls back to the title
    when no TF-IDF vocabulary can be built (for example, only stop words).

    Non-string ``title``/``abstract`` values (None, or NaN from an empty CSV
    field) are treated as empty text.
    """
    # NaN is truthy, so `abstract or ''` would let it through as the text "nan"
    title = title if isinstance(title, str) else ""
    abstract = abstract if isinstance(abstract, str) else ""
    text = f"{title}. {abstract}".strip()
    # TF-IDF pick best sentence vs query
    sents = re.split(r'(?<=[.!?])\s+', text)
    best = title
    try:
        vec = TfidfVectorizer(stop_words="english").fit([query] + sents)
    except ValueError:
        # empty vocabulary: nothing to compare on, keep the title
        vec = None
    if vec is not None:
        m = vec.transform([query] + sents).toarray()
        qv, sv = m[0], m[1:]
        if len(sv):
            # epsilon keeps all-zero rows from dividing by zero
            sims = (sv @ qv) / (np.linalg.norm(sv, axis=1) * np.linalg.norm(qv) + 1e-12)
            best = sents[int(np.argmax(sims))].strip()
    # trim
    if len(best) > max_chars:
        best = best[: max_chars-1].rstrip() + "…"
    return best


In [24]:
from datetime import datetime
from pathlib import Path

def daily_brief(user_id:str|None,
                sections:list[tuple[str,str]] = (
                    ("For You", "top news"),
                    ("World", "world news"),
                    ("Technology", "technology"),
                    ("Sports", "sports updates"),
                ),
                k_per_section:int = 4,
                alpha:float = 0.6,
                last_days:int = 7,
                save:bool = True):
    """Build, display and optionally save a Markdown news brief.

    Args:
        user_id: Profile to personalize for; None or an id missing from
            ``profiles`` produces the non-personalized header.
        sections: ``(heading, retrieval query)`` pairs, rendered in order.
        k_per_section: Number of articles requested per section.
        alpha: Query-vs-profile weight forwarded to ``retrieve_personalized``.
        last_days: Recency window forwarded to ``retrieve_personalized``.
        save: When True, write the brief to
            ``artifacts/daily_brief_<user or guest>_<UTC date>.md``; an existing
            file of that name is overwritten.

    Returns:
        The Markdown source of the brief.
    """
    # local imports: the cell itself only imports `datetime` and `Path`
    from datetime import timezone
    from IPython.display import Markdown, display

    # timezone-aware replacement for the deprecated datetime.utcnow()
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    header = f"# Daily Brief — {today}\n"
    if user_id and user_id in profiles:
        header += f"_Personalized for user `{user_id}` (α={alpha}, last {last_days}d)_\n\n"
    else:
        header += "_Non-personalized (no profile found)_\n\n"
    lines = [header]

    for sec_title, sec_query in sections:
        lines.append(f"## {sec_title}\n")
        hits = retrieve_personalized(sec_query, user_id, k=k_per_section,
                                     alpha=alpha, last_days=last_days)
        for _, r in hits.iterrows():
            summ = one_line_summary(sec_query, r["title"], r["abstract"])
            when = r["last_seen_utc"]
            # pd.to_datetime keeps this working if the column holds strings
            when_str = pd.to_datetime(when).strftime("%Y-%m-%d %H:%M") if pd.notna(when) else "—"
            lines.append(f"- **{r['title']}**  \n  {summ}  \n  [Source]({r['url']}) • {r['category']} • seen: {when_str}")
        lines.append("")  # spacer

    md = "\n".join(lines)
    if save:
        out = Path("artifacts") / f"daily_brief_{(user_id or 'guest')}_{today}.md"
        out.parent.mkdir(parents=True, exist_ok=True)  # first run: folder may not exist yet
        out.write_text(md, encoding="utf-8")
        print("Saved:", out)
    display_md = md.replace("  \n", "<br>")  # nicer in notebook
    display(Markdown(display_md))
    return md

# Keep a sample_user picked by an earlier cell; otherwise take the first user
# that has a profile (None -> non-personalized brief when there are no profiles)
if "sample_user" not in globals():
    sample_user = next(iter(profiles.keys()), None)

_ = daily_brief(user_id=sample_user, k_per_section=4, alpha=0.6, last_days=30)


Saved: artifacts/daily_brief_U1000_2025-09-17.md


  today = datetime.utcnow().strftime("%Y-%m-%d")
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  # that has no feature names.
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  # that has no feature names.
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  # that has no feature names.
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  # that has no feature names.


# Daily Brief — 2025-09-17
_Personalized for user `U1000` (α=0.6, last 30d)_


## For You

- **Top Aurora news: Suspect in custody for attack on security guard; voters get ballot with error; more**<br>  Here's the most recent top news in Aurora; see what headlines are trending among local readers, with links to full articles.<br>  [Source](https://assets.msn.com/labs/mind/AAIW4jK.html) • news • seen: —
- **Top Charlotte news: 2 dead, 1 injured in home shooting; girls hope to feed 100 families; more**<br>  Here's the most recent top news in Charlotte; see what headlines are trending among local readers, with links to full articles.<br>  [Source](https://assets.msn.com/labs/mind/BBWB9ui.html) • news • seen: 2019-11-11 10:14
- **Top Seattle news: Reward offered for info on school fire; report on luxury housing boom; more**<br>  Here's the most recent top news in Seattle; see what headlines are trending among local readers, with links to full articles.<br>  [Source](https://assets.msn.com/labs/mind/AAJeNun.html) • news • seen: —
- **Top Austin news: Crews begin clean-up of homeless camps; fired cops seek reinstatement; more**<br>  Here's the most recent top news in Austin; see what headlines are trending among local readers, with links to full articles.<br>  [Source](https://assets.msn.com/labs/mind/AAJTp5y.html) • news • seen: —

## World

- **Morning news briefing**<br>  Morning news briefing.<br>  [Source](https://assets.msn.com/labs/mind/AAJ72jp.html) • video • seen: —
- **Today in History: November 14**<br>  Today in History: November 14.<br>  [Source](https://assets.msn.com/labs/mind/BBWpwU6.html) • news • seen: 2019-11-14 16:42
- **WJZ Morning News & Weather Update**<br>  WJZ Morning News & Weather Update.<br>  [Source](https://assets.msn.com/labs/mind/AAJ7eWa.html) • weather • seen: —
- **Today in History: November 15**<br>  Today in History: November 15.<br>  [Source](https://assets.msn.com/labs/mind/BBWHp6q.html) • news • seen: 2019-11-15 21:05

## Technology

- **Obsoletely flawless: From smart glasses to 3D TVs, the tech products from the start of the decade that haven't stood the test of time**<br>  Obsoletely flawless: From smart glasses to 3D TVs, the tech products from the start of the decade that haven't stood the test of time.<br>  [Source](https://assets.msn.com/labs/mind/BBWKWBY.html) • news • seen: 2019-11-14 22:09
- **California crisis of fires, blackouts decades in the making**<br>  California crisis of fires, blackouts decades in the making.<br>  [Source](https://assets.msn.com/labs/mind/BBWJYlb.html) • news • seen: 2019-11-14 07:54
- **Israel Seeks Super-Forecasting Help on Where World Is Heading**<br>  Israel Seeks Super-Forecasting Help on Where World Is Heading.<br>  [Source](https://assets.msn.com/labs/mind/AAJuEyH.html) • news • seen: —
- **Scientists and researchers reveal 13 dark technology scenarios that keep them up at night**<br>  Scientists and researchers reveal 13 dark technology scenarios that keep them up at night.<br>  [Source](https://assets.msn.com/labs/mind/AAJBDsq.html) • news • seen: 2019-11-14 01:54

## Sports

- **The Day in Sports: Friday, November 15, 2019**<br>  The Day in Sports: Friday, November 15, 2019.<br>  [Source](https://assets.msn.com/labs/mind/BBWPSEO.html) • sports • seen: 2019-11-15 22:53
- **The Day in Sports: Friday, November 8, 2019**<br>  The Day in Sports: Friday, November 8, 2019.<br>  [Source](https://assets.msn.com/labs/mind/BBWu0Bs.html) • sports • seen: 2019-11-09 19:15
- **The Day in Sports: Friday, November 1, 2019**<br>  The Day in Sports: Friday, November 1, 2019.<br>  [Source](https://assets.msn.com/labs/mind/AAJIEnt.html) • sports • seen: —
- **The Day in Sports: Thursday, Nov 7, 2019**<br>  The Day in Sports: Thursday, Nov 7, 2019.<br>  [Source](https://assets.msn.com/labs/mind/BBWqH4M.html) • sports • seen: —


In [25]:
# Daily Brief with your sections
# Each entry is (section heading, retrieval query used to fill that section).
sections = [
    ("Business", "business news"),
    ("Finance", "stock market and finance"),
    ("Health", "health news"),
    ("Entertainment", "entertainment and movies"),
]

# NOTE: daily_brief saves by default and names the file after the user and the
# UTC date only, so this run replaces the brief written earlier the same day
# (both recorded outputs report the same "Saved:" path).
_ = daily_brief(
    user_id=sample_user,   # or None for non-personalized
    sections=sections,
    k_per_section=4,
    alpha=0.6,             # raise = more query focus; lower = more personal taste
    last_days=30
)


Saved: artifacts/daily_brief_U1000_2025-09-17.md


  today = datetime.utcnow().strftime("%Y-%m-%d")
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  # that has no feature names.
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  # that has no feature names.
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  # that has no feature names.
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  base_scores = (embs @ qv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  pers_scores = (embs @ uv)
  # that has no feature names.


# Daily Brief — 2025-09-17
_Personalized for user `U1000` (α=0.6, last 30d)_


## Business

- **CNN Business Now**<br>  CNN Business Now.<br>  [Source](https://assets.msn.com/labs/mind/BBWEWka.html) • video • seen: 2019-11-12 12:07
- **CNN Business Now**<br>  CNN Business Now.<br>  [Source](https://assets.msn.com/labs/mind/BBWLD78.html) • video • seen: 2019-11-14 14:53
- **CNN Business Now**<br>  CNN Business Now.<br>  [Source](https://assets.msn.com/labs/mind/BBWBBto.html) • video • seen: 2019-11-11 19:49
- **Morning news briefing**<br>  Morning news briefing.<br>  [Source](https://assets.msn.com/labs/mind/AAJ72jp.html) • video • seen: —

## Finance

- **How the stock market works**<br>  How the stock market works.<br>  [Source](https://assets.msn.com/labs/mind/BBWLg42.html) • finance • seen: 2019-11-15 13:09
- **The 15 Best Recession-Resistant Stocks to Buy**<br>  The 15 Best Recession-Resistant Stocks to Buy.<br>  [Source](https://assets.msn.com/labs/mind/AAIu0Q5.html) • finance • seen: —
- **Analysis: Extreme greed reigns on Wall Street. Here's why investors are so confident**<br>  Analysis: Extreme greed reigns on Wall Street.<br>  [Source](https://assets.msn.com/labs/mind/AAJQmQx.html) • finance • seen: —
- **Over the past seven decades   this has been the best date for the stock market**<br>  Over the past seven decades   this has been the best date for the stock market.<br>  [Source](https://assets.msn.com/labs/mind/AAJteE9.html) • finance • seen: —

## Health

- **Health calendar: Nov. 14-21**<br>  Health calendar: Nov.<br>  [Source](https://assets.msn.com/labs/mind/BBWDHtV.html) • travel • seen: 2019-11-12 04:10
- **Report: Google gathering health information of millions of Americans**<br>  Report: Google gathering health information of millions of Americans.<br>  [Source](https://assets.msn.com/labs/mind/BBWEmnm.html) • news • seen: 2019-11-12 08:33
- **Morning news briefing**<br>  Morning news briefing.<br>  [Source](https://assets.msn.com/labs/mind/AAJ72jp.html) • video • seen: —
- **Wildfires; Power Shutoffs; Bad Santa; Sesame Place: CA Stories**<br>  Take a look at some of the top news stories over the past week from across California.<br>  [Source](https://assets.msn.com/labs/mind/AAJoYNG.html) • news • seen: —

## Entertainment

- **The week in entertainment history: Nov. 10-16**<br>  The week in entertainment history: Nov.<br>  [Source](https://assets.msn.com/labs/mind/AAJXkti.html) • entertainment • seen: 2019-11-14 20:59
- **The week in entertainment history: Oct. 27-Nov. 2**<br>  The week in entertainment history: Oct.<br>  [Source](https://assets.msn.com/labs/mind/AAJf87q.html) • entertainment • seen: —
- **The week in entertainment history: Oct. 13-19**<br>  The week in entertainment history: Oct.<br>  [Source](https://assets.msn.com/labs/mind/AAIwA1R.html) • entertainment • seen: —
- **Star Tracks: Celebs on Vacation**<br>  Star Tracks: Celebs on Vacation.<br>  [Source](https://assets.msn.com/labs/mind/AAISOZh.html) • entertainment • seen: —


In [33]:
%pip install -q "gradio==4.39.0"
import gradio, sys
print("Gradio version:", gradio.__version__)


Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'pyaudioop'

In [1]:
%pip install -q "gradio==4.39.0"


Note: you may need to restart the kernel to use updated packages.


In [4]:
# --- stub audio modules (we don't use audio features) ---
import sys, types
for name in ("audioop", "pyaudioop"):
    if name in sys.modules:
        continue  # something is already registered under this name; leave it alone
    # placeholder callables: present so attribute lookups succeed, but they
    # raise as soon as anything actually tries to process audio
    def _disabled(*a, **k): raise RuntimeError("Audio disabled in this app")
    m = types.ModuleType(name)
    vars(m).update(dict.fromkeys(
        ("rms","avg","add","mul","tostereo","tomono","getsample","lin2lin"), _disabled))
    sys.modules[name] = m

import gradio as gr
import pandas as pd
from pathlib import Path

# ---- reuse your earlier helpers: profiles, retrieve_personalized, one_line_summary, daily_brief ----
# no sample_user chosen earlier in this kernel -> start the User ID boxes blank
if "sample_user" not in globals():
    sample_user = ""

def ui_search(user_id, query, alpha, k, last_days):
    """Gradio callback for the "Semantic Search" tab.

    Args:
        user_id: Text from the User ID box; blank or None means non-personalized.
        query: Search text; blank or None yields "No results." without searching.
        alpha: Slider value, query-vs-profile weight.
        k: Slider value, number of results.
        last_days: Slider value, recency window in days.

    Returns:
        Markdown with one entry per hit, or the string "No results.".
    """
    # an unset textbox can reach the callback as None; treat it like blank input
    user_id = (user_id or "").strip() or None
    query = (query or "").strip()
    if not query:
        return "No results."
    hits = retrieve_personalized(query, user_id, k=int(k), alpha=float(alpha), last_days=int(last_days))
    if hits.empty:
        return "No results."
    lines = []
    for _, r in hits.iterrows():
        summ = one_line_summary(query, r["title"], r["abstract"])
        when = r["last_seen_utc"]
        when_str = pd.to_datetime(when).strftime("%Y-%m-%d %H:%M") if pd.notna(when) else "—"
        lines.append(f"**{r['title']}**\n\n{summ}\n\n[Source]({r['url']}) • {r['category']} • seen: {when_str}\n")
    return "\n\n".join(lines)

def ui_brief(user_id, alpha, last_days, k):
    """Gradio callback for the "Daily Brief" tab.

    Coerces the widget values and delegates to ``daily_brief`` with a fixed set
    of four sections and ``save=False``. A blank User ID is passed on as None.
    Returns whatever ``daily_brief`` returns.
    """
    cleaned_id = user_id.strip()
    return daily_brief(
        user_id=cleaned_id if cleaned_id else None,
        sections=[
            ("Business", "business news"),
            ("Finance", "stock market and finance"),
            ("Health", "health news"),
            ("Entertainment", "entertainment and movies"),
        ],
        k_per_section=int(k),
        alpha=float(alpha),
        last_days=int(last_days),
        save=False,
    )

# Two-tab Gradio app. Each button hands its widgets to the callback positionally,
# so the input lists below follow the callbacks' parameter order.
# NOTE(review): the output recorded for this cell shows "Exception in ASGI
# application" (AttributeError: __pydantic_core_schema__) after launch -- looks
# like a gradio/pydantic version mismatch; confirm before relying on this UI.
with gr.Blocks() as demo:
    gr.Markdown("# 🗞️ News Personalization — Search & Daily Brief")
    with gr.Tabs():
        with gr.Tab("Semantic Search"):
            u = gr.Textbox(label="User ID (optional)", value=str(sample_user))
            q = gr.Textbox(label="Query", value="AI safety and regulation news")
            alpha = gr.Slider(0, 1, value=0.6, step=0.05, label="α: query vs. taste")
            k = gr.Slider(1, 10, value=5, step=1, label="Top K")
            days = gr.Slider(1, 60, value=30, step=1, label="Recency window (days)")
            btn = gr.Button("Search")
            out = gr.Markdown()
            # order matches ui_search(user_id, query, alpha, k, last_days)
            btn.click(ui_search, [u, q, alpha, k, days], out)

        with gr.Tab("Daily Brief"):
            u2 = gr.Textbox(label="User ID (optional)", value=str(sample_user))
            alpha2 = gr.Slider(0, 1, value=0.6, step=0.05, label="α: query vs. taste")
            k2 = gr.Slider(1, 8, value=4, step=1, label="K per section")
            days2 = gr.Slider(1, 60, value=30, step=1, label="Recency window (days)")
            btn2 = gr.Button("Generate Brief")
            out2 = gr.Markdown()
            # order matches ui_brief(user_id, alpha, last_days, k): days before k
            btn2.click(ui_brief, [u2, alpha2, days2, k2], out2)

# serve on localhost only (no public share link) and embed the app in the notebook
demo.launch(inline=True, share=False)


Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




--------
ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/Users/trisharaj/Downloads/anaconda3/lib/python3.13/site-packages/pydantic/type_adapter.py", line 271, in _init_core_attrs
    self.core_schema = _getattr_no_parents(self._type, '__pydantic_core_schema__')
                       ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/trisharaj/Downloads/anaconda3/lib/python3.13/site-packages/pydantic/type_adapter.py", line 55, in _getattr_no_parents
    raise AttributeError(attribute)
AttributeError: __pydantic_core_schema__

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/trisharaj/Downloads/anaconda3/lib/python3.13/site-packages/uvicorn/protocols/http/h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        self.scope, self.receive, self.send
       

In [3]:
# --- Step 7: simple News Chatbot in Jupyter (ipywidgets) ---
%pip install -q ipywidgets==8.1.2
import ipywidgets as W, pandas as pd, numpy as np, re, json
from IPython.display import display, Markdown

# uses helpers you've already defined earlier: profiles, retrieve_personalized, one_line_summary
# we also reuse the sentence tokenizer setup from Step 4
try:
    _sent_tokenize
except NameError:
    import re
    try:
        # everything nltk-related lives inside this try so that a missing
        # package also lands on the regex fallback below
        import nltk
        nltk.download("punkt", quiet=True); nltk.download("punkt_tab", quiet=True)
        from nltk.tokenize import sent_tokenize as _sent_tokenize
        # probe once so that missing tokenizer data is noticed here rather
        # than on the first real question
        _sent_tokenize("Probe sentence number one. Probe sentence number two.")
    except Exception:
        def _sent_tokenize(text):  # simple fallback
            return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if len(s.split())>=3]

# small summarizer (same logic as earlier, but uses personalized retrieval)
def rag_answer_personalized(user_id, query, k_docs=8, max_sents=5, alpha=0.6, last_days=30):
    """Answer a query with sentences extracted from personalized search hits.

    Retrieves ``k_docs`` articles via ``retrieve_personalized``, splits
    ``title. abstract`` into sentences with ``_sent_tokenize``, and picks up to
    ``max_sents`` of them by TF-IDF relevance with an MMR redundancy penalty.

    Args:
        user_id: Profile id, or None for non-personalized retrieval.
        query: The user's question.
        k_docs: Number of articles to retrieve.
        max_sents: Upper bound on sentences in the answer (at least one is used).
        alpha: Query-vs-profile weight forwarded to retrieval.
        last_days: Recency window forwarded to retrieval.

    Returns:
        ``(answer, cites)`` where ``answer`` is the selected sentences joined by
        spaces and ``cites`` is a list of ``(title, url)`` pairs, unique by url,
        in order of selection.
    """
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    def _text(value):
        # missing cells come back from pandas as float NaN: truthy, and without
        # .strip(), so `(value or '').strip()` would raise on them
        return value.strip() if isinstance(value, str) else ""

    def mmr(qv, sm, k=6, lam=0.72):
        """Greedy maximal-marginal-relevance selection of row indices of ``sm``."""
        if sm.shape[0]==0: return []
        norms = np.linalg.norm(sm, axis=1)  # computed once instead of per pair
        sim_q = (sm @ qv) / (norms*np.linalg.norm(qv)+1e-12)
        selected = [int(sim_q.argmax())]
        cand = set(range(sm.shape[0])) - set(selected)
        while len(selected) < min(k, sm.shape[0]) and cand:
            best, best_c = -1e9, None
            for c in sorted(cand):  # fixed order -> deterministic tie-breaking
                red = max((sm[c] @ sm[s])/(norms[c]*norms[s]+1e-12) for s in selected)
                score = lam*sim_q[c] - (1-lam)*red
                if score > best: best, best_c = score, c
            if best_c is None: break
            selected.append(best_c); cand.remove(best_c)
        return selected

    hits = retrieve_personalized(query, user_id, k=k_docs, alpha=alpha, last_days=last_days)
    if hits.empty:
        return "I couldn't find matching articles.", []

    # sentences from titles+abstracts
    docs = []
    for _, r in hits.iterrows():
        title, abstract = _text(r["title"]), _text(r["abstract"])
        text = f"{title}. {abstract}".strip()
        sents = [s for s in _sent_tokenize(text) if len(s.split()) >= 6] or [title]
        docs.append({"title": title, "url": _text(r["url"]), "sents": sents})

    all_sents, src_idx = [], []
    for i, d in enumerate(docs):
        for s in d["sents"]:
            all_sents.append(s); src_idx.append(i)

    # TF-IDF + MMR pick
    try:
        mat = TfidfVectorizer(stop_words="english", max_features=12000).fit_transform([query] + all_sents)
    except ValueError:
        # sklearn raises on an empty vocabulary (only stop words left);
        # fall back to the leading sentences in retrieval order
        keep = list(range(max(1, min(max_sents, len(all_sents)))))
    else:
        dense = mat.toarray()
        keep = mmr(dense[0], dense[1:], k=max_sents)

    answer = " ".join(all_sents[i] for i in keep)

    # citations (unique by source, in order of selection)
    cites, used = [], set()
    for i in keep:
        src = docs[src_idx[i]]
        if src["url"] and src["url"] not in used:
            used.add(src["url"])
            cites.append((src["title"], src["url"]))
    return answer, cites

# --- UI ---
# Default the User ID box to a sample_user chosen earlier in this kernel;
# otherwise use the first key of `profiles` ("" when it has none).
try:
    sample_user
except NameError:
    sample_user = next(iter(profiles.keys()), "")

# Input widgets; on_send reads their current .value each time Send is clicked.
u = W.Text(value=str(sample_user), description="User ID:", layout=W.Layout(width="50%"))
alpha = W.FloatSlider(value=0.6, min=0.0, max=1.0, step=0.05, description="α (query vs taste)", readout_format=".2f")
days = W.IntSlider(value=30, min=1, max=60, step=1, description="Recency (days)")
q = W.Text(value="AI safety and regulation news", description="Ask:")
send = W.Button(description="Send", button_style="primary")
out = W.Output()   # transcript area that on_send renders into
history = []       # ("you" | "bot", text) tuples appended by on_send

def on_send(_):
    """Send-button handler: echo the question, compute the answer, show sources.

    Reads the module-level widgets ``q``, ``u``, ``alpha`` and ``days``, appends
    both turns to ``history`` and renders them into ``out``. Blank input is
    ignored. The click event argument is unused.
    """
    query = q.value.strip()
    if query == "":
        return

    history.append(("you", query))
    with out:
        display(Markdown(f"**You:** {query}"))
    q.value = ""  # clear the box before the (slower) retrieval starts

    profile_id = u.value.strip() or None
    ans, cites = rag_answer_personalized(profile_id, query, k_docs=8, max_sents=5,
                                         alpha=alpha.value, last_days=days.value)
    history.append(("bot", ans))

    with out:
        display(Markdown(f"**Assistant:** {ans}"))
        if not cites:
            return
        numbered = []
        for n, (src_title, src_url) in enumerate(cites, start=1):
            numbered.append(f"[{n}] {src_title}\n{src_url}")
        lines = "\n".join(numbered)
        display(Markdown(f"**Sources**\n\n{lines}"))

# Register the click handler, then show the controls stacked:
# settings row, question box, Send button, transcript.
send.on_click(on_send)
display(W.VBox([W.HBox([u, alpha, days]), q, send, out]))


Note: you may need to restart the kernel to use updated packages.


VBox(children=(HBox(children=(Text(value='', description='User ID:', layout=Layout(width='50%')), FloatSlider(…

In [1]:
# --- One-cell Gradio chatbot (Python 3.13 safe, no audio deps) ---

# 1) install a compatible Gradio
%pip install -q "gradio==4.39.0"

# 2) stub audio libs *before* importing gradio
import sys, types
# stub audioop + pyaudioop: one placeholder module registered under both names.
# Every listed function returns 0, except lin2lin, which returns empty bytes.
def _zero(*a, **k): return 0
def _bytes(*a, **k): return b""
aud = types.ModuleType("audioop")
for fn in ("rms","avg","add","mul","tostereo","tomono","getsample","lin2lin","max","reverse","minmax","avgpp"):
    setattr(aud, fn, _bytes if fn == "lin2lin" else _zero)
sys.modules.update({"audioop": aud, "pyaudioop": aud})

# stub pydub so gradio utils won't import the real one; both the package and
# its audio_segment submodule expose the same empty AudioSegment class
class AudioSegment: pass
pydub = types.ModuleType("pydub")
asmod = types.ModuleType("pydub.audio_segment")
pydub.AudioSegment = asmod.AudioSegment = AudioSegment
sys.modules.update({"pydub": pydub, "pydub.audio_segment": asmod})

# 3) import gradio *after* stubs
import gradio as gr
import pandas as pd, numpy as np, re
from sklearn.feature_extraction.text import TfidfVectorizer

# ==== assumes you already ran Steps 3–6 so these exist: meta, retrieve_personalized, one_line_summary, profiles ====
# Default profile = first key of `profiles` when it is a non-empty dict;
# in every other case (missing, wrong type, empty) run non-personalized.
if isinstance(globals().get("profiles"), dict) and profiles:
    DEFAULT_USER = next(iter(profiles.keys()), None)
else:
    DEFAULT_USER = None

def _summarize_from_hits(query: str, hits: pd.DataFrame, max_sents: int = 5):
    """Build an extractive answer and citation list from retrieved articles.

    Sentences of ``title. abstract`` with at least six words (or the title
    alone when none qualify) are scored against ``query`` by TF-IDF cosine
    similarity and picked greedily with a 0.72/0.28 relevance/redundancy
    trade-off.

    Args:
        query: The user's message.
        hits: Retrieved rows; reads the ``title``, ``abstract`` and ``url`` columns.
        max_sents: Upper bound on selected sentences (at least one is used).

    Returns:
        ``(answer, cites)``: the selected sentences joined by spaces, and
        ``(title, url)`` pairs unique by url in order of selection.
    """
    if hits.empty:
        return "I couldn't find matching articles.", []

    def _text(value):
        # missing cells come back from pandas as float NaN: truthy, and without
        # .strip(), so `(value or '').strip()` would raise on them
        return value.strip() if isinstance(value, str) else ""

    sents, src, titles = [], [], []
    for _, r in hits.iterrows():
        title, abstract, url = _text(r["title"]), _text(r["abstract"]), _text(r["url"])
        text = f"{title}. {abstract}".strip()
        parts = [s for s in re.split(r'(?<=[.!?])\s+', text) if len(s.split()) >= 6] or [title]
        sents += parts; src += [url] * len(parts); titles += [title] * len(parts)

    try:
        mat = TfidfVectorizer(stop_words="english", max_features=12000).fit_transform([query] + sents).toarray()
    except ValueError:
        # sklearn raises on an empty vocabulary (only stop words left);
        # fall back to the leading sentences in retrieval order
        chosen = list(range(max(1, min(max_sents, len(sents)))))
    else:
        qv, sv = mat[0], mat[1:]
        norms = np.linalg.norm(sv, axis=1)  # computed once instead of per pair
        sim_q = (sv @ qv) / (norms * (np.linalg.norm(qv) + 1e-12) + 1e-12)
        chosen = [int(sim_q.argmax())]
        while len(chosen) < min(max_sents, len(sents)):
            best, best_i = -1e9, None
            for i in range(len(sents)):
                if i in chosen:
                    continue
                red = max((sv[i] @ sv[j])/(norms[i]*norms[j]+1e-12) for j in chosen)
                score = 0.72*sim_q[i] - 0.28*red
                if score > best: best, best_i = score, i
            if best_i is None: break
            chosen.append(best_i)

    answer = " ".join(sents[i] for i in chosen)
    cites, seen = [], set()
    for i in chosen:
        u = src[i]
        if u and u not in seen:
            seen.add(u)
            cites.append((titles[i], u))
    return answer, cites

def chat_reply(message, history, alpha, k, last_days):
    """ChatInterface callback: retrieve, summarize, and format one reply.

    Uses the fixed ``DEFAULT_USER`` profile (guest when it is None). ``history``
    is accepted but not used. Any exception is turned into a plain error
    string so that the function always returns text.
    """
    try:
        user_id = DEFAULT_USER  # fixed profile (or None -> guest)
        hits = retrieve_personalized(message, user_id, k=int(k), alpha=float(alpha), last_days=int(last_days))
        answer, cites = _summarize_from_hits(message, hits, max_sents=5)

        reply = f"_profile: {user_id or 'guest'}_\n\n{answer}"
        if cites:
            numbered = [f"[{n}] {t}\n{u}" for n, (t, u) in enumerate(cites, start=1)]
            reply += "\n\n**Sources**\n\n" + "\n".join(numbered)
        return reply
    except Exception as e:
        # Always return a plain string so the frontend can parse it
        return f"Sorry—backend error: {e!s}"

# Chat UI around chat_reply. The three sliders are passed to chat_reply after
# (message, history), in this order: alpha, k, last_days.
# NOTE(review): the output recorded for this cell shows "Exception in ASGI
# application" (AttributeError: __pydantic_core_schema__) after launch -- looks
# like a gradio/pydantic version mismatch; confirm before relying on this UI.
demo = gr.ChatInterface(
    fn=chat_reply,
    additional_inputs=[
        gr.Slider(0, 1, value=0.6, step=0.05, label="α: query vs taste"),
        gr.Slider(1, 10, value=5, step=1, label="Top K"),
        gr.Slider(1, 60, value=30, step=1, label="Recency window (days)"),
    ],
    title="🗞️ Personalized News Chatbot",
    description="Ask about topics. Answers are RAG summaries with citations; ranking blends your query with a fixed profile (or guest)."
)

# serve on localhost only (no public share link) and embed the app in the notebook
demo.launch(inline=True, share=False)


Note: you may need to restart the kernel to use updated packages.
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




--------
ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/Users/trisharaj/Downloads/anaconda3/lib/python3.13/site-packages/pydantic/type_adapter.py", line 271, in _init_core_attrs
    self.core_schema = _getattr_no_parents(self._type, '__pydantic_core_schema__')
                       ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/trisharaj/Downloads/anaconda3/lib/python3.13/site-packages/pydantic/type_adapter.py", line 55, in _getattr_no_parents
    raise AttributeError(attribute)
AttributeError: __pydantic_core_schema__

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/trisharaj/Downloads/anaconda3/lib/python3.13/site-packages/uvicorn/protocols/http/h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        self.scope, self.receive, self.send
       