In [1]:
# !pip -q install --upgrade --force-reinstall --no-cache-dir \
#   "numpy==1.26.4" "scipy==1.12.0" "pandas==2.2.2" \
#   "spacy==3.7.4" "gensim==4.3.3" \
#   "langdetect==1.0.9" "unidecode==1.3.8"



In [2]:
# # 1) Remove packages that fight with numpy==1.26.4
# !pip -q uninstall -y opencv-python opencv-python-headless opencv-contrib-python tsfresh

# # 2) Align a few core utilities Colab expects to quiet warnings
# !pip -q install --upgrade --force-reinstall \
#   "requests==2.32.4" "jedi==0.19.1" "typer==0.12.5"

# # 3) Ensure our NLP stack stays pinned where spaCy is happy
# !pip -q install --upgrade --force-reinstall \
#   "numpy==1.26.4" "scipy==1.12.0" "pandas==2.2.2" \
#   "spacy==3.7.4" "gensim==4.3.3" "langdetect==1.0.9" "unidecode==1.3.8"

# import os
# print("✅ Cleaned up. Restarting runtime to load consistent binaries…")
# os.kill(os.getpid(), 9)  # Colab-safe restart


- text_clean_llm (light)
- text_clean_tm  (heavy: lowercase, rm punctuation/digits, EN stopwords only, lemmatize, ngrams)

In [3]:
# ============================================================
# Preprocess master (EN-only stopwords removal) → CSV only
# Source: /content/drive/MyDrive/webscrape_links/master_links_translated_en.csv
# Text column: text_en
# Output: /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# - text_clean_llm (light)
# - text_clean_tm  (heavy: lowercase, rm punctuation/digits, EN stopwords only, lemmatize, ngrams)
# ============================================================

# 0) Install & mount
!pip -q install spacy==3.7.4 gensim==4.3.3 unidecode==1.3.8 langdetect==1.0.9

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 1) Imports & paths
from pathlib import Path
import csv, re, unicodedata
import pandas as pd
from unidecode import unidecode
from langdetect import detect, DetectorFactory

# Input and output locations on the mounted Drive folder
BASE_DIR = Path("/content/drive/MyDrive/webscrape_links")
IN_PATH = BASE_DIR.joinpath("master_links_translated_en.csv")   # given
OUT_CSV = BASE_DIR.joinpath("master_links_preprocessed.csv")

# 2) Load data: every column is read as str and empty cells stay "" (no NaN)
master = pd.read_csv(
    IN_PATH,
    quoting=csv.QUOTE_ALL,
    keep_default_na=False,
    dtype=str,
)
assert "text_en" in master.columns, "Expected column 'text_en' not found."

# 3) Ensure we have a 'lang' column (fr/en from original language if missing)
# The seed is pinned before any detect() call below.
# NOTE(review): presumably this makes langdetect's results repeatable across runs — confirm against the langdetect docs.
DetectorFactory.seed = 42
def detect_lang_en_fr(text: str) -> str:
    """Classify ``text`` as ``"fr"`` or ``"en"``.

    Only a detection result of exactly ``"fr"`` yields ``"fr"``. Everything
    else — another language, empty or whitespace-only input, ``None``, or any
    exception raised during detection — is reported as ``"en"``.
    """
    stripped = (text or "").strip()
    if stripped:
        try:
            if detect(stripped) == "fr":
                return "fr"
        except Exception:
            # Best effort: fall through to the English default.
            pass
    return "en"

if "lang" not in master.columns:
    def derive_lang(row):
        """Pick 'fr'/'en' for one row.

        An existing 'language' value starting with fr/en wins; otherwise the
        language is detected from the original 'text', or from 'text_en' when
        'text' is empty or absent.
        """
        declared = (row.get("language", "") or "").lower()
        for code in ("fr", "en"):
            if declared.startswith(code):
                return code
        original = row.get("text", "")
        return detect_lang_en_fr(original or row.get("text_en", ""))

    master["lang"] = master.apply(derive_lang, axis=1)

# 4) spaCy EN pipeline (lemmatization & stopwords) — EN stopwords ONLY
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as EN_STOPWORDS

# Load the "en_core_web_sm" pipeline with the "ner" and "textcat" components
# disabled. If loading raises OSError, download the model via spacy.cli and
# retry the identical load call.
try:
    nlp = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
# Insert the "sentencizer" component as the first pipe of the pipeline.
nlp.add_pipe("sentencizer", first=True)

# 5) Core normalizers (applied to both variants)
# Bullet glyphs and dash variants that normalize_bullets() maps to "-"
BULLET_CHARS = "•●◦▪–—‒―·∙"
# Single character class built from the escaped glyph list
BULLET_REGEX = re.compile("[" + re.escape(BULLET_CHARS) + "]")

def unicode_nfkc(text: str) -> str:
    """Return ``text`` in Unicode NFKC form; falsy input (``None``, ``""``) gives ``""``."""
    source = text if text else ""
    return unicodedata.normalize("NFKC", source)

def fix_dehyphenation(text: str) -> str:
    """Re-join words split by a hyphen at a line break (``word-\\nword`` -> ``wordword``).

    Whitespace around the newline is dropped as well; hyphens that are not
    followed by a line break are left alone.
    """
    line_break_hyphen = re.compile(r"(\w)-\s*\n\s*(\w)")
    return line_break_hyphen.sub(r"\1\2", text)

def normalize_bullets(text: str) -> str:
    """Replace every character matched by ``BULLET_REGEX`` with a plain hyphen."""
    return re.sub(BULLET_REGEX, "-", text)

def collapse_whitespace(text: str) -> str:
    """Normalize line endings and squeeze redundant whitespace.

    Applied in order: ``\\r\\n`` / ``\\r`` become ``\\n``; runs of three or more
    newlines shrink to exactly two (paragraph breaks survive); runs of two or
    more spaces/tabs become one space. The result is stripped at both ends.
    """
    rules = (
        (r"\r\n?", "\n"),
        (r"\n{3,}", "\n\n"),
        (r"[ \t]{2,}", " "),
    )
    result = text
    for pattern, replacement in rules:
        result = re.sub(pattern, replacement, result)
    return result.strip()

def basic_clean(text: str) -> str:
    """Run the shared normalizers in a fixed order.

    Order: NFKC normalization, de-hyphenation across line breaks, bullet
    normalization, then whitespace collapsing.
    """
    cleaned = text
    for step in (unicode_nfkc, fix_dehyphenation, normalize_bullets, collapse_whitespace):
        cleaned = step(cleaned)
    return cleaned

# 6) Light cleaning for LLM (preserve case/punct/numbers)
def build_text_clean_llm(text: str) -> str:
    """Light variant for LLM use: only ``basic_clean`` is applied, so case,
    punctuation and numbers are preserved."""
    light = basic_clean(text)
    return light

# 7) Heavy cleaning for topic modeling (EN stopwords only)
PUNCT_REGEX = re.compile(r"[^\w\s]")   # anything that is neither a word char nor whitespace
DIGIT_REGEX = re.compile(r"\d")

# Optional domain stopwords (EN only). You can add items later:
DOMAIN_STOPWORDS_EN = set([
    # e.g., "canada","government","policy","directive","annex","appendix","montreal"
])

def tokenize_lemma_en(doc):
    """Turn a parsed document into a list of filtered, lowercased lemmas.

    A token is dropped when it is whitespace or punctuation, when its lemma is
    empty or contains a digit, when fewer than three characters remain after
    punctuation is stripped, or when the result is an English or domain
    stopword. Checks run in exactly that order.
    """
    def _normalized(tok):
        # Returns the cleaned lemma, or None when the token must be skipped.
        if tok.is_space or tok.is_punct:
            return None
        lemma = tok.lemma_.strip().lower()
        if not lemma or DIGIT_REGEX.search(lemma):
            return None
        lemma = PUNCT_REGEX.sub("", lemma)
        if len(lemma) < 3:
            return None
        # EN stopwords ONLY
        if lemma in EN_STOPWORDS or lemma in DOMAIN_STOPWORDS_EN:
            return None
        return lemma

    return [lemma for lemma in map(_normalized, doc) if lemma is not None]

# 8) Build both variants
source_texts = master["text_en"].fillna("")

# Light variant plus its size statistics (counts are stored as strings)
master["text_clean_llm"] = list(map(build_text_clean_llm, source_texts))
master["llm_char_count"] = [str(len(t)) for t in master["text_clean_llm"]]
master["llm_word_count"] = [str(len(t.split())) for t in master["text_clean_llm"]]

# Heavy variant: lowercase the basic-cleaned text, then lemmatize/filter
# with spaCy in batches for speed
base_cleaned = [basic_clean(t).lower() for t in source_texts]  # lower for stability
tokens_tm = [
    tokenize_lemma_en(parsed)
    for parsed in nlp.pipe(base_cleaned, batch_size=16, n_process=1)
]

# 9) Learn bigrams/trigrams across corpus and apply
from gensim.models.phrases import Phrases, Phraser

# gensim 4.x (this notebook pins 4.3.3) joins the *str* tokens of a phrase with
# the delimiter, so the delimiter must itself be a str. The former bytes value
# b"_" makes the join raise TypeError while the vocabulary is being learned.
NGRAM_DELIMITER = "_"

# Pass 1: learn bigrams over the unigram token lists.
phrases = Phrases(tokens_tm, min_count=2, threshold=10.0, delimiter=NGRAM_DELIMITER)
bigram  = Phraser(phrases)
# Pass 2: learn phrases over the bigram-merged stream (yields trigrams and longer merges).
trigram = Phraser(Phrases(bigram[tokens_tm], min_count=2, threshold=10.0, delimiter=NGRAM_DELIMITER))

# Apply both passes per document; merged tokens look like "artificial_intelligence".
tokens_tm_ngrams = [list(trigram[bigram[toks]]) for toks in tokens_tm]

# Serialize heavy outputs as space-joined strings (CSV-friendly)
master["text_clean_tm"] = list(map(" ".join, tokens_tm))
master["text_clean_tm_ngrams"] = list(map(" ".join, tokens_tm_ngrams))

# Character and whitespace-token counts for both streams, stored as strings
for char_col, word_col, text_col in (
    ("tm_char_count", "tm_word_count", "text_clean_tm"),
    ("tm_ng_char_count", "tm_ng_word_count", "text_clean_tm_ngrams"),
):
    master[char_col] = master[text_col].map(lambda t: str(len(t)))
    master[word_col] = master[text_col].map(lambda t: str(len(t.split())))

# 10) Save CSV (fully quoted to preserve newlines)
master.to_csv(
    OUT_CSV,
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
    lineterminator="\n",
)
print(f"[OK] Wrote: {OUT_CSV}")

preview_columns = ["doc_id", "lang", "llm_word_count", "tm_word_count", "tm_ng_word_count"]
print("\nPreview of added columns:\n", master[preview_columns].head().to_string(index=False))


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m798.7/981.5 kB[0m [31m31.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m6.9 

ValueError: mount failed

- Summary Statistics

Top 20 words per document
Top 20 n-grams per document

In [None]:
# ============================================================
# Top-20 terms per document for BOTH TM streams:
#   • text_clean_tm          → column: top20_terms_tm
#   • text_clean_tm_ngrams   → column: top20_terms_tm_ngrams
#
# Reads:  /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# Writes: same CSV updated with two new columns
# Plots:  PNGs to:
#   /content/drive/MyDrive/webscrape_links/viz/top20_terms/tm/
#   /content/drive/MyDrive/webscrape_links/viz/top20_terms/tm_ngrams/
# ============================================================

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

from pathlib import Path
import csv, re
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------
# Paths
# -------------------------
BASE_DIR = Path("/content/drive/MyDrive/webscrape_links")
MASTER_PATH = BASE_DIR.joinpath("master_links_preprocessed.csv")
VIZ_TM = BASE_DIR.joinpath("viz", "top20_terms", "tm")
VIZ_TMN = BASE_DIR.joinpath("viz", "top20_terms", "tm_ngrams")
# Create both plot folders (and any missing parents) up front
for viz_dir in (VIZ_TM, VIZ_TMN):
    viz_dir.mkdir(parents=True, exist_ok=True)

# -------------------------
# Load master (all columns as str, empty cells stay "")
# -------------------------
df = pd.read_csv(
    MASTER_PATH,
    quoting=csv.QUOTE_ALL,
    keep_default_na=False,
    dtype=str,
)
for required in ("text_clean_tm", "text_clean_tm_ngrams"):
    assert required in df.columns, f"Missing column '{required}' in master."

# -------------------------
# Helpers
# -------------------------
def topn(space_joined: str, n: int = 20):
    """Count whitespace-separated tokens and return the ``n`` most frequent.

    Returns a pair of parallel lists ``(terms, counts)`` ordered by descending
    count; both lists are empty when the input is falsy or has no tokens.
    """
    tokens = space_joined.split() if space_joined else []
    if not tokens:
        return [], []
    terms, counts = [], []
    for term, count in Counter(tokens).most_common(n):
        terms.append(term)
        counts.append(count)
    return terms, counts

def safe_fname(s: str, fallback: str, maxlen: int = 60):
    """Build a filesystem-friendly name from ``s`` (or ``fallback`` when ``s`` is blank).

    Each run of characters outside ``A-Za-z0-9_.-`` becomes one underscore and
    the result is cut to at most ``maxlen`` characters.
    """
    candidate = (s or "").strip()
    if not candidate:
        candidate = fallback
    cleaned = re.sub(r"[^A-Za-z0-9_.-]+", "_", candidate)
    return cleaned[:maxlen]

# -------------------------
# Compute & store top20 for both streams + plots
# -------------------------
top20_tm_col = []
top20_tmn_col = []
tm_plots = []
tmn_plots = []

# One entry per token stream:
# (source column, plot folder, title suffix, "term:count" accumulator, plot-path accumulator)
_streams = (
    ("text_clean_tm", VIZ_TM, "TM (no n-grams)", top20_tm_col, tm_plots),
    ("text_clean_tm_ngrams", VIZ_TMN, "TM n-grams", top20_tmn_col, tmn_plots),
)

for idx, row in df.iterrows():
    doc_id = row.get("doc_id") or f"row{idx}"
    fname = safe_fname(doc_id, f"row{idx}")

    for column, out_dir, label, summaries, plot_paths in _streams:
        terms, counts = topn(row.get(column, ""), n=20)
        summaries.append("; ".join(f"{t}:{c}" for t, c in zip(terms, counts)))

        if not terms:
            # Keep the accumulators aligned with the rows: "" marks "no plot".
            plot_paths.append("")
            continue

        out_png = out_dir / f"{idx:02d}_{fname}.png"
        positions = range(len(terms))
        plt.figure(figsize=(12, 6))
        plt.bar(positions, counts)
        plt.xticks(positions, terms, rotation=45, ha="right")
        plt.title(f"Top 20 terms • {doc_id} • {label}")
        plt.xlabel("term")
        plt.ylabel("count")
        plt.tight_layout()
        plt.savefig(out_png, dpi=150)
        plt.close()  # release the figure; one is created per document and stream
        plot_paths.append(str(out_png))

# -------------------------
# Add columns & save master
# -------------------------
df["top20_terms_tm"] = top20_tm_col
df["top20_terms_tm_ngrams"] = top20_tmn_col

# Overwrite the master CSV in place with the two new columns
df.to_csv(
    MASTER_PATH,
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
    lineterminator="\n",
)

print(f"[OK] Added `top20_terms_tm` and `top20_terms_tm_ngrams` → {MASTER_PATH}")
print(f"[OK] Wrote {len([p for p in tm_plots if p])} TM plots → {VIZ_TM}")
print(f"[OK] Wrote {len([p for p in tmn_plots if p])} TM n-gram plots → {VIZ_TMN}")

# -------------------------
# Preview the first pair inline (if any)
# -------------------------
from IPython.display import Image, display

# First non-empty plot path of each stream, or None when nothing was plotted
first_tm = next(filter(None, tm_plots), None)
first_tmn = next(filter(None, tmn_plots), None)

if not (first_tm or first_tmn):
    print("No plots to preview (no tokens found).")
else:
    for heading, plot_path in (
        ("\nPreview: TM (no n-grams)", first_tm),
        ("\nPreview: TM n-grams", first_tmn),
    ):
        if plot_path:
            print(heading)
            display(Image(filename=plot_path))


- reads your preprocessed master at /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv

scans text_clean_tm_ngrams token streams

collects bigrams and trigrams that include either data or artificial_intelligence

bigrams_before: X_data, X_artificial_intelligence

bigrams_after: data_X, artificial_intelligence_X

trigrams_before: X_Y_data, X_Y_artificial_intelligence

trigrams_middle: X_data_Y, X_artificial_intelligence_Y

trigrams_after: data_X_Y, artificial_intelligence_X_Y

prints the top 20 for each category, and

saves a tidy CSV summary ngram_contexts_data_ai.csv with columns: target, context_type, ngram, count

In [None]:
# ============================================================
# N-gram contexts around 'data' and 'artificial_intelligence'
# - Reads:  /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# - Uses:   text_clean_tm_ngrams (space-joined tokens with n-grams)
# - Outputs:
#     * Prints top-20 bigrams & trigrams (before/middle/after) for each target
#     * Saves CSV: ngram_contexts_data_ai.csv with all counts
# ============================================================

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

from pathlib import Path
import csv
import pandas as pd
from collections import Counter, defaultdict

BASE_DIR = Path("/content/drive/MyDrive/webscrape_links")
MASTER_PATH = BASE_DIR.joinpath("master_links_preprocessed.csv")
OUT_PATH = BASE_DIR.joinpath("ngram_contexts_data_ai.csv")

# ---------- Load ----------
df = pd.read_csv(
    MASTER_PATH,
    quoting=csv.QUOTE_ALL,
    keep_default_na=False,
    dtype=str,
)
assert "text_clean_tm_ngrams" in df.columns, "Missing 'text_clean_tm_ngrams'. Run preprocessing first."

# One token list per document; an empty string yields an empty list
docs = [text.split() for text in df["text_clean_tm_ngrams"].tolist()]

# ---------- Helpers ----------
def accumulate_contexts_for_target(docs_tokens, target_token, fallback_pair=None):
    """Count the bigram/trigram contexts surrounding a target token.

    A position matches when the token equals ``target_token``. If it does not
    and ``fallback_pair`` is given, the position also matches when this token
    and the next one equal the two items of ``fallback_pair``; the match then
    spans two tokens and "after" neighbours are taken from beyond that span.
    The fallback is evaluated at every position, whether or not the joined
    target occurs elsewhere.

    Every n-gram key is written with ``target_token`` in the target slot.

    Returns:
        ``(contexts, seen)`` where ``contexts`` maps, in this order,
        ``bigram_before`` (X_target), ``bigram_after`` (target_X),
        ``trigram_before`` (X_Y_target), ``trigram_middle`` (X_target_Y) and
        ``trigram_after`` (target_X_Y) to a ``Counter``, and ``seen`` tells
        whether any match was found.
    """
    contexts = {
        name: Counter()
        for name in (
            "bigram_before",
            "bigram_after",
            "trigram_before",
            "trigram_middle",
            "trigram_after",
        )
    }
    seen = False

    for tokens in docs_tokens:
        total = len(tokens)
        pos = 0
        while pos < total:
            # span = number of tokens the match covers (0 means no match here)
            if tokens[pos] == target_token:
                span = 1
            elif (
                fallback_pair
                and pos < total - 1
                and tokens[pos] == fallback_pair[0]
                and tokens[pos + 1] == fallback_pair[1]
            ):
                span = 2
            else:
                span = 0

            if span == 0:
                pos += 1
                continue

            seen = True
            nxt = pos + span          # index of the first token after the match
            has_prev = pos > 0
            has_next = nxt < total

            if has_prev:
                contexts["bigram_before"][f"{tokens[pos-1]}_{target_token}"] += 1
            if has_next:
                contexts["bigram_after"][f"{target_token}_{tokens[nxt]}"] += 1
            if pos > 1:
                contexts["trigram_before"][f"{tokens[pos-2]}_{tokens[pos-1]}_{target_token}"] += 1
            if has_prev and has_next:
                contexts["trigram_middle"][f"{tokens[pos-1]}_{target_token}_{tokens[nxt]}"] += 1
            if nxt + 1 < total:
                contexts["trigram_after"][f"{target_token}_{tokens[nxt]}_{tokens[nxt+1]}"] += 1

            # Continue right after the match (skips the second fallback token)
            pos = nxt

    return contexts, seen

def print_top(counter, k=20):
    """Print the ``k`` most common entries of ``counter``, one per line.

    Each line is two spaces, the term left-aligned in a 50-character field,
    two spaces, then the count. An empty counter prints ``  (none)``.
    """
    ranked = counter.most_common(k)
    if len(ranked) == 0:
        print("  (none)")
        return
    print("\n".join(f"  {term:50s}  {cnt}" for term, cnt in ranked))

# ---------- Accumulate contexts ----------
# 'data' is matched as a single token only (no fallback pair)
data_ctx, _ = accumulate_contexts_for_target(docs, "data")
# 'artificial_intelligence' also matches the adjacent pair
# "artificial" "intelligence" in case the bigram wasn't formed upstream
ai_ctx, seen_ai = accumulate_contexts_for_target(
    docs,
    "artificial_intelligence",
    ("artificial", "intelligence"),
)

# ---------- Print top-20 for each category ----------
def show_ctx(title, ctx):
    """Print the top-20 entries of all five context counters under ``title``.

    Sections appear in a fixed order: bigrams before/after, then trigrams
    before/middle/after.
    """
    sections = (
        ("BIGRAMS (before)", "bigram_before"),
        ("BIGRAMS (after)", "bigram_after"),
        ("TRIGRAMS (before)", "trigram_before"),
        ("TRIGRAMS (middle)", "trigram_middle"),
        ("TRIGRAMS (after)", "trigram_after"),
    )
    for label, key in sections:
        print(f"\n=== {title}: {label} ===")
        print_top(ctx[key], 20)

for heading, contexts in (("DATA", data_ctx), ("ARTIFICIAL_INTELLIGENCE", ai_ctx)):
    show_ctx(heading, contexts)

# ---------- Save tidy CSV with ALL counts ----------
# One row per (target, context type, n-gram); context_type is one of
# bigram_before / bigram_after / trigram_before / trigram_middle / trigram_after
rows = [
    {"target": target, "context_type": ctype, "ngram": ngram, "count": cnt}
    for target, ctx in (("data", data_ctx), ("artificial_intelligence", ai_ctx))
    for ctype, counter in ctx.items()
    for ngram, cnt in counter.items()
]

contexts_df = pd.DataFrame(rows, columns=["target", "context_type", "ngram", "count"])
contexts_df = contexts_df.sort_values(
    ["target", "context_type", "count"], ascending=[True, True, False]
)
contexts_df.to_csv(
    OUT_PATH,
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
    lineterminator="\n",
)

print(f"\n[OK] Saved full context counts → {OUT_PATH}")


Input: /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv

Text source: prefers text_clean_tm_ngrams, falls back to text_clean_tm if needed

Outputs (in the same folder):

tfidf_cosine_matrix.csv — square matrix (rows/cols = doc_id)

tfidf_cosine_pairs.csv — tidy upper-triangle pairs (doc_id_1, doc_id_2, cosine), sorted desc

In [None]:
# ============================================================
# TF-IDF + Cosine similarity (pairwise) for master docs
# Reads:  /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# Uses:   text_clean_tm_ngrams (preferred) else text_clean_tm
# Writes: tfidf_cosine_matrix.csv (NxN) and tfidf_cosine_pairs.csv (long form)
# ============================================================



from pathlib import Path
import csv
import numpy as np
import pandas as pd

# 0) Paths
BASE_DIR   = Path("/content/drive/MyDrive/webscrape_links")
IN_PATH    = BASE_DIR / "master_links_preprocessed.csv"
MATRIX_CSV = BASE_DIR / "tfidf_cosine_matrix.csv"
PAIRS_CSV  = BASE_DIR / "tfidf_cosine_pairs.csv"

# 1) Load master (all columns as str, empty cells stay "")
df = pd.read_csv(IN_PATH, dtype=str, keep_default_na=False, quoting=csv.QUOTE_ALL)
# Fail early with the same message the other similarity cells use, instead of
# a bare KeyError further down when the doc ids are read.
assert "doc_id" in df.columns, "Missing 'doc_id' in master."

# 2) Pick source text column: the first candidate that exists and has at
#    least one non-blank row (n-gram stream preferred over plain unigrams)
SRC_COL = next(
    (
        col
        for col in ("text_clean_tm_ngrams", "text_clean_tm")
        if col in df.columns and df[col].str.strip().any()
    ),
    None,
)
if SRC_COL is None:
    raise RuntimeError("No usable text column found. Expected 'text_clean_tm_ngrams' or 'text_clean_tm'.")

print(f"[INFO] Using column for TF-IDF: {SRC_COL}")

texts  = df[SRC_COL].fillna("").astype(str).tolist()
docids = df["doc_id"].fillna("").astype(str).tolist()

# 3) Build TF-IDF (prefer existing sklearn; install if missing)
# Import scikit-learn; if the import fails for any reason, install the pinned
# version through the notebook shell escape and import again.
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except Exception:
    !pip -q install scikit-learn==1.5.1
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

# Use a token pattern that keeps underscores (already space-separated tokens)
# NOTE(review): `\w` already matches "_", so `[\w_]` is equivalent to `\w`;
# as written the pattern only matches tokens of two or more characters.
vectorizer = TfidfVectorizer(
    lowercase=False,              # already lowercased upstream
    token_pattern=r"(?u)\b\w[\w_]+\b",  # keep tokens with underscores
    ngram_range=(1,1),            # we already have phrases joined with "_"
    min_df=1                      # keep all terms
)

# 4) Fit & transform
X = vectorizer.fit_transform(texts)           # shape: (N_docs, N_terms)
if X.shape[1] == 0:
    raise RuntimeError("TF-IDF vocabulary is empty. Check that your text column has tokens.")

# 5) Cosine similarity (dense NxN)
sim = cosine_similarity(X)                    # numpy array (N x N)
# Same guards as the text_clean_tm cell: scrub non-finite values and force the
# diagonal to 1.0 — a document with no tokens has an all-zero TF-IDF row and
# would otherwise get a self-similarity of 0.
sim = np.nan_to_num(sim, nan=0.0, posinf=0.0, neginf=0.0)
np.fill_diagonal(sim, 1.0)

# 6) Save matrix CSV (NxN with doc_id headers)
matrix_df = pd.DataFrame(sim, index=docids, columns=docids)
matrix_df.to_csv(MATRIX_CSV, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Wrote cosine matrix → {MATRIX_CSV}")

# 7) Save pairs CSV (upper triangle, excluding diagonal)
N = len(docids)
# Row-major (i < j) index pairs — the same order a nested i/j loop produces
upper_i, upper_j = np.triu_indices(N, k=1)
pairs_df = pd.DataFrame(
    {
        "doc_id_1": [docids[i] for i in upper_i],
        "doc_id_2": [docids[j] for j in upper_j],
        "cosine": sim[upper_i, upper_j],
    },
    # Explicit columns keep the frame sortable even when there are no pairs
    # (fewer than two documents); an empty frame without them raises KeyError.
    columns=["doc_id_1", "doc_id_2", "cosine"],
).sort_values("cosine", ascending=False)
pairs_df.to_csv(PAIRS_CSV, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Wrote cosine pairs → {PAIRS_CSV}")

# 8) Quick peek
print("\nTop 10 most similar pairs:")
print(pairs_df.head(10).to_string(index=False))


Computes TF-IDF + cosine similarity using text_clean_tm (unigrams only) and saves the results to their own CSVs:

tfidf_cosine_matrix_tm.csv — square matrix (rows/cols = doc_id)

tfidf_cosine_pairs_tm.csv — tidy upper-triangle pairs (doc_id_1, doc_id_2, cosine), sorted desc

In [None]:
# ============================================================
# TF-IDF + Cosine similarity using text_clean_tm (unigrams)
# Reads:  /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# Uses:   text_clean_tm
# Writes: tfidf_cosine_matrix_tm.csv (NxN) and tfidf_cosine_pairs_tm.csv (long form)
# ============================================================


from pathlib import Path
import csv
import numpy as np
import pandas as pd

# 0) Paths
BASE_DIR = Path("/content/drive/MyDrive/webscrape_links")
IN_PATH = BASE_DIR.joinpath("master_links_preprocessed.csv")
MATRIX_CSV = BASE_DIR.joinpath("tfidf_cosine_matrix_tm.csv")
PAIRS_CSV = BASE_DIR.joinpath("tfidf_cosine_pairs_tm.csv")

# 1) Load master (all columns as str, empty cells stay "")
df = pd.read_csv(
    IN_PATH,
    quoting=csv.QUOTE_ALL,
    keep_default_na=False,
    dtype=str,
)
assert "doc_id" in df.columns, "Missing 'doc_id' in master."
assert "text_clean_tm" in df.columns, "Missing 'text_clean_tm' in master. Run preprocessing first."

# Parallel lists: one text and one id per document
texts = list(df["text_clean_tm"].fillna("").astype(str))
docids = list(df["doc_id"].fillna("").astype(str))

# 2) Build TF-IDF (install sklearn if needed)
# Import scikit-learn; if the import fails for any reason, install the pinned
# version through the notebook shell escape and import again.
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except Exception:
    !pip -q install scikit-learn==1.5.1
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

# token pattern keeps word characters; tokens are already space-joined
# The vectorizer is only configured here; fitting happens in the next step.
vectorizer = TfidfVectorizer(
    lowercase=False,                  # already lowercased upstream
    token_pattern=r"(?u)\b\w+\b",     # unigrams; underscores uncommon in tm stream
    ngram_range=(1,1),
    min_df=1
)

# 3) Fit & transform
X = vectorizer.fit_transform(texts)           # (N_docs, N_terms)
if X.shape[1] == 0:
    raise RuntimeError("TF-IDF vocabulary is empty. Check that 'text_clean_tm' has tokens.")

# 4) Cosine similarity (dense NxN); guard against empty rows
sim = cosine_similarity(X)                    # numpy array (N x N)
sim = np.nan_to_num(sim, nan=0.0, posinf=0.0, neginf=0.0)
np.fill_diagonal(sim, 1.0)                    # self-similarity is 1.0 even for token-less docs

# 5) Save matrix CSV (NxN with doc_id headers)
matrix_df = pd.DataFrame(sim, index=docids, columns=docids)
matrix_df.to_csv(MATRIX_CSV, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Wrote cosine matrix (tm) → {MATRIX_CSV}")

# 6) Save pairs CSV (upper triangle, excluding diagonal)
N = len(docids)
# Row-major (i < j) index pairs — the same order a nested i/j loop produces
upper_i, upper_j = np.triu_indices(N, k=1)
pairs_df = pd.DataFrame(
    {
        "doc_id_1": [docids[i] for i in upper_i],
        "doc_id_2": [docids[j] for j in upper_j],
        "cosine": sim[upper_i, upper_j],
    },
    # Explicit columns keep the frame sortable even when there are no pairs
    # (fewer than two documents); an empty frame without them raises KeyError.
    columns=["doc_id_1", "doc_id_2", "cosine"],
).sort_values("cosine", ascending=False)
pairs_df.to_csv(PAIRS_CSV, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Wrote cosine pairs (tm) → {PAIRS_CSV}")

# 7) Quick peek
print("\nTop 10 most similar pairs (tm):")
print(pairs_df.head(10).to_string(index=False))


Computes BM25 similarities between all documents using text_clean_tm (unigrams). It produces:

bm25_matrix_tm.csv — symmetric BM25 similarity matrix (rows/cols = doc_id)

bm25_pairs_tm.csv — tidy list of pairs (doc_id_1, doc_id_2, bm25_raw, bm25_norm), sorted by similarity (desc)

In [None]:
# ============================================================
# BM25 (symmetric) pairwise similarities using text_clean_tm
# Reads:  /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# Uses:   text_clean_tm (space-joined tokens)
# Writes: bm25_matrix_tm.csv (NxN) and bm25_pairs_tm.csv (long form)
# ============================================================



from pathlib import Path
import csv
import numpy as np
import pandas as pd

# 0) Paths
BASE_DIR = Path("/content/drive/MyDrive/webscrape_links")
IN_PATH = BASE_DIR.joinpath("master_links_preprocessed.csv")
MATRIX_CSV = BASE_DIR.joinpath("bm25_matrix_tm.csv")
PAIRS_CSV = BASE_DIR.joinpath("bm25_pairs_tm.csv")

# 1) Load master (all columns as str, empty cells stay "")
df = pd.read_csv(
    IN_PATH,
    quoting=csv.QUOTE_ALL,
    keep_default_na=False,
    dtype=str,
)
assert "doc_id" in df.columns, "Missing 'doc_id' in master."
assert "text_clean_tm" in df.columns, "Missing 'text_clean_tm' in master. Run preprocessing first."

# Token lists from the space-joined stream; falsy cells give an empty list
docs_tokens = [(text if text else "").split() for text in df["text_clean_tm"].tolist()]
docids = list(df["doc_id"].fillna("").astype(str))

# 2) BM25 (install rank_bm25 if needed)
# Import rank_bm25; if the import fails for any reason, install the pinned
# version through the notebook shell escape and import again.
try:
    from rank_bm25 import BM25Okapi
except Exception:
    !pip -q install rank-bm25==0.2.2
    from rank_bm25 import BM25Okapi

# Build BM25 index on the corpus (unigram tokens)
# NOTE(review): no guard for an empty corpus here, unlike the n-gram cell — confirm BM25Okapi tolerates zero documents.
bm25 = BM25Okapi(docs_tokens)  # default k1=1.5, b=0.75 (good general defaults)

N = len(docs_tokens)
# Asymmetric matrix: row i holds the BM25 score of every document for the
# query made of document i's tokens, i.e. score(query_i, doc_j)
asym = np.zeros((N, N), dtype=float)
for i, q in enumerate(docs_tokens):
    asym[i, :] = bm25.get_scores(q)

# 3) Make it symmetric for "similarity" interpretation:
#    S(i,j) = (BM25(i->j) + BM25(j->i)) / 2
sym = (asym + asym.T) / 2.0

# Diagonal: set each self-similarity to its row maximum for visibility.
# Row i contains no other diagonal entry, so all rows can be handled at once.
if N:
    np.fill_diagonal(sym, sym.max(axis=1))

# 4) Save matrix CSV (NxN with doc_id headers)
matrix_df = pd.DataFrame(sym, index=docids, columns=docids)
matrix_df.to_csv(MATRIX_CSV, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Wrote BM25 symmetric matrix (tm) → {MATRIX_CSV}")

# 5) Save pairs CSV (upper triangle, excluding diagonal), with normalized score
# Row-major (i < j) index pairs — the same order a nested i/j loop produces
upper_i, upper_j = np.triu_indices(N, k=1)
raw_scores = sym[upper_i, upper_j]

# Min-max normalization across the off-diagonal scores; a constant or empty
# set falls back to a range of 1.0 so the division is always defined.
if raw_scores.size:
    min_s = float(raw_scores.min())
    max_s = float(raw_scores.max())
else:
    min_s, max_s = 0.0, 1.0
rng = (max_s - min_s) if (max_s > min_s) else 1.0

pairs_df = pd.DataFrame(
    {
        "doc_id_1": [docids[i] for i in upper_i],
        "doc_id_2": [docids[j] for j in upper_j],
        "bm25_raw": raw_scores,
        "bm25_norm": (raw_scores - min_s) / rng,
    },
    # Explicit columns keep the frame sortable even when there are no pairs
    # (fewer than two documents); an empty frame without them raises KeyError.
    columns=["doc_id_1", "doc_id_2", "bm25_raw", "bm25_norm"],
).sort_values("bm25_raw", ascending=False)
pairs_df.to_csv(PAIRS_CSV, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Wrote BM25 pairs (tm) → {PAIRS_CSV}")

# 6) Quick peek
print("\nTop 10 most similar pairs by BM25 (tm):")
print(pairs_df.head(10).to_string(index=False))


Reads text_clean_tm_ngrams, builds BM25 similarities, and writes:

bm25_matrix_tm_ngrams.csv

bm25_pairs_tm_ngrams.csv

In [None]:
# ============================================================
# BM25 (symmetric) pairwise similarities using text_clean_tm_ngrams
# Reads:  /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# Uses:   text_clean_tm_ngrams (space-joined tokens w/ bigrams+trigrams)
# Writes: bm25_matrix_tm_ngrams.csv (NxN) and bm25_pairs_tm_ngrams.csv (long form)
# ============================================================


from pathlib import Path
import csv
import numpy as np
import pandas as pd

# 0) Paths: one input (preprocessed master) and two outputs (matrix + pairs)
BASE_DIR   = Path("/content/drive/MyDrive/webscrape_links")
IN_PATH    = BASE_DIR / "master_links_preprocessed.csv"
MATRIX_CSV = BASE_DIR / "bm25_matrix_tm_ngrams.csv"
PAIRS_CSV  = BASE_DIR / "bm25_pairs_tm_ngrams.csv"

# 1) Load master (everything as str; empty cells stay "" instead of NaN)
df = pd.read_csv(IN_PATH, dtype=str, keep_default_na=False, quoting=csv.QUOTE_ALL)
for required_col, problem in (
    ("doc_id", "Missing 'doc_id' in master."),
    ("text_clean_tm_ngrams", "Missing 'text_clean_tm_ngrams' in master. Run preprocessing first."),
):
    assert required_col in df.columns, problem

# The n-gram column is a space-joined token stream, so whitespace splitting
# recovers the token list of each document.
docs_tokens = df["text_clean_tm_ngrams"].map(lambda cell: (cell or "").split()).tolist()
docids      = df["doc_id"].fillna("").astype(str).tolist()
N = len(docs_tokens)
if not N:
    raise RuntimeError("No documents found.")

# 2) BM25 (install rank_bm25 if needed)
try:
    from rank_bm25 import BM25Okapi
except Exception:
    !pip -q install rank-bm25==0.2.2
    from rank_bm25 import BM25Okapi

# Index the whole corpus once; every document is then used as a query.
bm25 = BM25Okapi(docs_tokens)  # k1=1.5, b=0.75 defaults

# Asymmetric matrix: row i holds score(query_i, doc_j) for every document j.
asym = np.array([bm25.get_scores(query_tokens) for query_tokens in docs_tokens], dtype=float)

# 3) Symmetric similarity: S(i,j) = (BM25(i→j) + BM25(j→i)) / 2
sym = 0.5 * (asym + asym.T)

# Diagonal: overwrite each self-similarity with its row maximum so the
# diagonal is never dimmer than the rest of the row.
np.fill_diagonal(sym, sym.max(axis=1))

# 4) Save matrix CSV (NxN, doc_id on both axes, every field quoted)
matrix_df = pd.DataFrame(data=sym, index=docids, columns=docids)
matrix_df.to_csv(
    MATRIX_CSV,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
    lineterminator="\n",
)
print(f"[OK] Wrote BM25 symmetric matrix (tm_ngrams) → {MATRIX_CSV}")

# 5) Save pairs CSV (upper triangle, excluding diagonal), with normalized score
PAIR_COLUMNS = ["doc_id_1", "doc_id_2", "bm25_raw", "bm25_norm"]

# Indices of the strict upper triangle (i < j), in the same row-major order
# a nested `for i / for j in range(i+1, N)` loop would visit them.
upper_i, upper_j = np.triu_indices(N, k=1)
raw_scores = sym[upper_i, upper_j].astype(float)

# Min-max bounds use off-diagonal scores only; the diagonal was overwritten
# with the row maximum and must not affect the normalization.
min_s = float(raw_scores.min()) if raw_scores.size else 0.0
max_s = float(raw_scores.max()) if raw_scores.size else 1.0
# Guard against a zero range (all scores equal, or no pairs at all).
rng   = (max_s - min_s) if (max_s > min_s) else 1.0

docid_arr = np.asarray(docids, dtype=object)
# Explicit `columns=` keeps the schema when there is only one document; an
# empty frame without columns would make sort_values raise KeyError.
pairs_df = pd.DataFrame(
    {
        "doc_id_1": docid_arr[upper_i],
        "doc_id_2": docid_arr[upper_j],
        "bm25_raw": raw_scores,
        "bm25_norm": (raw_scores - min_s) / rng,
    },
    columns=PAIR_COLUMNS,
).sort_values("bm25_raw", ascending=False)
pairs_df.to_csv(PAIRS_CSV, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Wrote BM25 pairs (tm_ngrams) → {PAIRS_CSV}")

# 6) Quick peek
print("\nTop 10 most similar pairs by BM25 (tm_ngrams):")
print(pairs_df.head(10).to_string(index=False))


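The cells below compute semantic pairwise similarity with a sentence-transformers model. This description was written for all-mpnet-base-v2 (with a graceful fallback to all-MiniLM-L6-v2 if needed); note that the code cell that follows loads all-MiniLM-L6-v2 directly and writes `semantic_sim_matrix_minilm.csv` / `semantic_sim_pairs_minilm.csv`. It: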

reads /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv

uses text_clean_llm (best for semantics; falls back to text_en / text)

chunks long docs by token count (overlapped), embeds each chunk, then weighted-averages per doc

normalizes embeddings and computes a cosine similarity matrix

writes:

semantic_sim_matrix_mpnet.csv (NxN, rows/cols = doc_id)
semantic_sim_pairs_mpnet.csv (upper-triangle pairs, sorted desc)

In [None]:
# Clean out potentially conflicting Hugging Face packages before they are reinstalled.
# NOTE(review): this cell only uninstalls. The reinstall and runtime restart
# happen in the next cell, and that install is not version-pinned.
!pip -q uninstall -y transformers tokenizers sentence-transformers huggingface-hub accelerate peft bitsandbytes optimum



In [None]:

# Reinstall the Hugging Face stack (plus scikit-learn) without using pip's cache.
# NOTE(review): none of these packages is version-pinned, so the resolved
# versions can differ between runs — consider pinning for reproducibility.
!pip -q install --no-cache-dir \
  "transformers" \
  "tokenizers" \
  "huggingface-hub" \
  "accelerate" \
  "sentence-transformers" \
  "scikit-learn"

import os
print("✅ HF stack aligned. Restarting runtime to load clean binaries…")
# Send signal 9 to the current kernel process; nothing after this line runs.
# Per the message above, the intent is a runtime restart that picks up the
# freshly installed packages.
os.kill(os.getpid(), 9)  # Colab-safe restart

In [None]:
# ============================================================
# Semantic similarity (pairwise) with all-MiniLM-L6-v2
# Input:  /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# Text:   text_clean_llm (fallback -> text_en -> text)
# Output: semantic_sim_matrix_minilm.csv  (NxN cosine)
#         semantic_sim_pairs_minilm.csv   (tidy pairs)
# ============================================================

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

from pathlib import Path
import csv, re
import numpy as np
import pandas as pd
import torch

from sentence_transformers import SentenceTransformer
from huggingface_hub import snapshot_download

# ---- Paths ----
BASE_DIR   = Path("/content/drive/MyDrive/webscrape_links")
IN_PATH    = BASE_DIR / "master_links_preprocessed.csv"
OUT_MAT    = BASE_DIR / "semantic_sim_matrix_minilm.csv"
OUT_PAIRS  = BASE_DIR / "semantic_sim_pairs_minilm.csv"
CACHE_DIR  = Path("/content/hf_models")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# ---- Load data ----
df = pd.read_csv(IN_PATH, dtype=str, keep_default_na=False, quoting=csv.QUOTE_ALL)
assert "doc_id" in df.columns, "Missing 'doc_id' in master."

# Pick the first candidate column (in priority order) that exists and holds
# at least one non-blank value.
SRC_COL = next(
    (
        candidate
        for candidate in ("text_clean_llm", "text_en", "text")
        if candidate in df.columns and df[candidate].str.strip().any()
    ),
    None,
)
if SRC_COL is None:
    raise RuntimeError("No usable text column found (need one of: text_clean_llm, text_en, text).")

docids = df["doc_id"].fillna("").astype(str).tolist()
texts  = df[SRC_COL].fillna("").astype(str).tolist()
N = len(texts)
print(f"[INFO] Using column for semantics: {SRC_COL} (docs={N})")

# ---- Load model: all-MiniLM-L6-v2 (direct → HF local fallback) ----
device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo = "sentence-transformers/all-MiniLM-L6-v2"

def load_minilm(repo_id: str):
    """Load a SentenceTransformer model, with a local-snapshot fallback.

    First tries to construct the model directly from `repo_id`. If that raises,
    the repository is downloaded into a folder under CACHE_DIR and the model is
    loaded from that local path instead.

    Args:
        repo_id: Hugging Face repository id, e.g. "sentence-transformers/all-MiniLM-L6-v2".

    Returns:
        (model, mode) where mode is "direct" or "local" depending on which path succeeded.

    Raises:
        Whatever the snapshot download or the local load raises if the fallback fails too.
    """
    try:
        print(f"[INFO] Loading model directly: {repo_id} on {device}")
        return SentenceTransformer(repo_id, device=device), "direct"
    except Exception as e:
        # Deliberately broad: any failure of the direct path triggers the fallback.
        print(f"[WARN] Direct load failed: {e}")
        print("[INFO] Downloading HF repo → local path…")
        # `local_dir_use_symlinks` is intentionally not passed: it appears to be
        # deprecated in recent huggingface_hub releases (TODO confirm against the
        # installed version), and the HF stack in this notebook is installed
        # unpinned. Loading from `local_dir` works with the default behaviour.
        local_dir = snapshot_download(
            repo_id=repo_id,
            local_dir=str(CACHE_DIR / repo_id.replace("/", "_")),
        )
        print(f"[INFO] Loading from local path: {local_dir}")
        return SentenceTransformer(local_dir, device=device), "local"

model, load_mode = load_minilm(model_repo)
print(f"[OK] Model ready ({load_mode}).")

tokenizer = model.tokenizer
# Fall back to 512 when the model exposes no (or a falsy) max_seq_length.
max_model_tokens = getattr(model, "max_seq_length", 512) or 512
# Leave a 32-token margin below the model limit, capped at 420 tokens.
MAX_TOKENS = min(420, max_model_tokens - 32)   # content tokens per chunk
STRIDE     = max(40, MAX_TOKENS // 6)          # overlap tokens
BATCH_SIZE = 8 if device == "cpu" else 24

# ---- Token-aware chunking ----
import re
def count_tokens(text: str) -> int:
    """Return how many tokenizer tokens `text` encodes to, without special tokens.

    Empty or None input counts as 0 tokens and is never sent to the tokenizer.
    """
    return len(tokenizer.encode(text, add_special_tokens=False)) if text else 0

def chunk_by_tokens(text: str, max_tokens: int = MAX_TOKENS, stride: int = STRIDE):
    """Split `text` into (chunk_text, token_count) pairs of at most ~`max_tokens` tokens.

    Splitting goes from coarse to fine:
      1. Paragraphs (separated by blank lines) are packed into a running buffer.
      2. A paragraph longer than `max_tokens` is split into sentences, which are
         packed into the same buffer.
      3. A sentence longer than `max_tokens` is split on whitespace into word runs.
    When `stride` > 0, extra "tail" chunks (trailing words of a chunk) are added
    as overlap. Finally, chunks sharing the same first 120 characters and the
    same token count are de-duplicated.

    Args:
        text: Document text; None or blank yields an empty list.
        max_tokens: Target upper bound on tokens per chunk.
        stride: Overlap switch; any value > 0 enables the overlap tail chunks.

    Returns:
        List of (chunk_text, token_count) tuples, in emission order.
    """
    text = (text or "").strip()
    if not text:
        return []

    chunks, buf, buf_tok = [], "", 0

    def flush():
        """Emit the running buffer as a chunk (if non-empty) and reset it."""
        nonlocal buf, buf_tok
        if buf:
            chunks.append((buf.strip(), buf_tok))
            buf, buf_tok = "", 0

    for p in re.split(r"\n{2,}", text):
        p = p.strip()
        if not p:
            continue
        ptoks = count_tokens(p)

        if ptoks <= max_tokens:
            # Paragraph fits in a chunk: append to the buffer or start a new one.
            if buf_tok + ptoks + 2 <= max_tokens:
                # Charge the 2-token separator allowance only when the
                # paragraph is actually joined to existing buffer text.
                sep_tok = 2 if buf else 0
                buf = buf + ("\n\n" if buf else "") + p
                buf_tok += ptoks + sep_tok
            else:
                flush()
                buf, buf_tok = p, ptoks
            continue

        # Paragraph is too long: fall back to sentence granularity.
        for s in re.split(r"(?<=[\.\!\?\:;])\s+", p):
            s = s.strip()
            if not s:
                continue
            stoks = count_tokens(s)

            if stoks <= max_tokens:
                if buf_tok + stoks + 1 <= max_tokens:
                    # Same rule as above: count the joining space only if there
                    # was something to join to.
                    joiner_tok = 1 if buf else 0
                    buf = (buf + " " + s).strip()
                    buf_tok += stoks + joiner_tok
                else:
                    flush()
                    buf, buf_tok = s, stoks
                continue

            # Sentence is too long: hard-split on whitespace into word runs.
            cur, cur_tok = [], 0
            for w in s.split():
                wt = count_tokens(w + " ")
                # `cur` must be non-empty before emitting; otherwise a single
                # over-long first word would produce an empty ("", 0) chunk.
                if cur and cur_tok + wt > max_tokens:
                    chunks.append((" ".join(cur), cur_tok))
                    if stride > 0:
                        # Overlap: re-emit the last quarter of the run (at least one word).
                        tail = " ".join(cur[-max(1, len(cur) // 4):])
                        chunks.append((tail, count_tokens(tail)))
                    cur, cur_tok = [w], wt
                else:
                    cur.append(w)
                    cur_tok += wt
            if cur:
                chunks.append((" ".join(cur), cur_tok))
        # Close the chunk at the end of an over-long paragraph.
        flush()
    flush()

    # Light overlap: after every chunk except the last, add its trailing
    # (up to 100) words as an extra chunk.
    # NOTE(review): for chunks of 100 words or fewer the tail is the whole
    # chunk again, which may double its weight downstream — confirm intended.
    if stride > 0 and len(chunks) > 1:
        with_overlap = []
        last_pos = len(chunks) - 1
        for pos, (chunk_text, chunk_tok) in enumerate(chunks):
            with_overlap.append((chunk_text, chunk_tok))
            if pos < last_pos:
                words = chunk_text.split()
                tail_text = " ".join(words[-min(100, len(words)):])
                tail_tok = count_tokens(tail_text)
                if tail_tok > 0:
                    with_overlap.append((tail_text, tail_tok))
        chunks = with_overlap

    # De-duplicate on (first 120 characters, token count), keeping first occurrences.
    deduped, seen = [], set()
    for chunk_text, chunk_tok in chunks:
        key = (chunk_text[:120], chunk_tok)
        if key in seen:
            continue
        seen.add(key)
        deduped.append((chunk_text, chunk_tok))
    return deduped

# ---- Embed a document (weighted mean of chunk embeddings; L2-normalized) ----
def embed_document(text: str, batch_size: int = BATCH_SIZE):
    """Embed one document as the token-weighted mean of its chunk embeddings.

    The text is chunked with `chunk_by_tokens`; each chunk is encoded with
    normalized embeddings, the chunk vectors are averaged with weights equal to
    their token counts (at least 1), and the result is L2-normalized.

    Returns:
        A 1-D float32 vector, or None when chunking yields no chunks.
    """
    pieces = chunk_by_tokens(text, MAX_TOKENS, STRIDE)
    if len(pieces) == 0:
        return None

    chunk_texts = [piece_text for piece_text, _ in pieces]
    # A weight below 1 is raised to 1 so every chunk contributes.
    weights = np.array(
        [tok_count if tok_count > 1 else 1 for _, tok_count in pieces],
        dtype=np.float32,
    )

    chunk_embs = model.encode(
        chunk_texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    ).astype(np.float32)

    weighted_sum = (chunk_embs * weights[:, None]).sum(axis=0)
    mean_vec = weighted_sum / max(1e-8, weights.sum())
    # Epsilon floors avoid division by zero for degenerate vectors.
    return mean_vec / max(1e-8, np.linalg.norm(mean_vec))

# ---- Embed all docs ----
# One entry per document, in input order; None marks a blank document.
doc_vecs, missing = [], 0
for i, txt in enumerate(texts, start=1):
    if txt.strip():
        doc_vecs.append(embed_document(txt))
        if i == N or i % 3 == 0:
            print(f"[INFO] Embedded {i}/{N}")
    else:
        missing += 1
        doc_vecs.append(None)
        print(f"[WARN] Empty text for doc {docids[i-1]}")

# Replace missing vectors with zeros so the stacked matrix keeps N rows.
embedded = [vec for vec in doc_vecs if vec is not None]
if not embedded:
    raise RuntimeError("All embeddings are empty.")
dim = embedded[0].shape[0]
doc_vecs = [np.zeros(dim, dtype=np.float32) if vec is None else vec for vec in doc_vecs]

E = np.vstack(doc_vecs).astype(np.float32)  # (N, D), row-normalized

# Rows are unit-length (or all-zero), so the dot product is the cosine.
sim = np.clip((E @ E.T).astype(np.float32), -1.0, 1.0)
# Self-similarity is forced to 1.0, including for zero-filled documents.
np.fill_diagonal(sim, 1.0)

# ---- Save outputs ----
# Full NxN cosine matrix with doc_id labels on both axes.
matrix_df = pd.DataFrame(sim, index=docids, columns=docids)
matrix_df.to_csv(OUT_MAT, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Wrote semantic cosine matrix (all-MiniLM-L6-v2) → {OUT_MAT}")

# Tidy pairs: strict upper triangle (i < j), visited in row-major order.
upper_i, upper_j = np.triu_indices(N, k=1)
docid_arr = np.asarray(docids, dtype=object)
# Explicit `columns=` keeps the schema when there is only one document; an
# empty frame without columns would make sort_values raise KeyError.
pairs_df = pd.DataFrame(
    {
        "doc_id_1": docid_arr[upper_i],
        "doc_id_2": docid_arr[upper_j],
        "cosine": sim[upper_i, upper_j].astype(float),
    },
    columns=["doc_id_1", "doc_id_2", "cosine"],
).sort_values("cosine", ascending=False)
pairs_df.to_csv(OUT_PAIRS, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Wrote semantic cosine pairs (all-MiniLM-L6-v2) → {OUT_PAIRS}")

print("\nTop 10 most similar pairs (semantic):")
print(pairs_df.head(10).to_string(index=False))


In [None]:
# Heatmap plot for whichever similarity matrix exists.
# Priority: semantic → TF-IDF → BM25

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

BASE = Path("/content/drive/MyDrive/webscrape_links")
# Candidate matrices in priority order; the first file that exists is plotted.
candidates = [
    ("Semantic (MiniLM)", BASE / "semantic_sim_matrix_minilm.csv"),
    ("Semantic (MPNet)",  BASE / "semantic_sim_matrix_mpnet.csv"),
    ("TF-IDF (tm)",       BASE / "tfidf_cosine_matrix_tm.csv"),
    ("TF-IDF (ngrams)",   BASE / "tfidf_cosine_matrix.csv"),
    ("BM25 (tm)",         BASE / "bm25_matrix_tm.csv"),
    ("BM25 (tm_ngrams)",  BASE / "bm25_matrix_tm_ngrams.csv"),
]

# BM25 matrix files hold raw, unbounded scores. Plotting them on a fixed
# [0, 1] colour scale would saturate almost every cell, so those entries use
# matplotlib's automatic colour limits instead.
RAW_SCORE_TITLES = {"BM25 (tm)", "BM25 (tm_ngrams)"}

chosen_title, df_mat = None, None
for title, path in candidates:
    if path.exists():
        df_mat = pd.read_csv(path, index_col=0)
        chosen_title = title
        break

if df_mat is None:
    raise FileNotFoundError("No similarity matrices found among expected files. "
                            "Please generate one (semantic/TF-IDF/BM25) first.")

if chosen_title in RAW_SCORE_TITLES:
    color_limits = {}                            # autoscale to the data range
    cbar_label = "BM25 score (raw)"
else:
    color_limits = {"vmin": 0.0, "vmax": 1.0}    # cosine-style similarities
    cbar_label = "Similarity"

# Plot heatmap (matplotlib only, no custom colors); figure grows with matrix size.
plt.figure(figsize=(max(10, 0.6 * df_mat.shape[1]), max(8, 0.6 * df_mat.shape[0])))
im = plt.imshow(df_mat.values, aspect="auto", interpolation="nearest", **color_limits)
plt.xticks(range(df_mat.shape[1]), df_mat.columns, rotation=90)
plt.yticks(range(df_mat.shape[0]), df_mat.index)
plt.colorbar(im, fraction=0.046, pad=0.04, label=cbar_label)
plt.title(f"Similarity Heatmap • {chosen_title}")
plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# Make individual heatmaps for all available outputs:
#   - Semantic (MiniLM, MPNet)
#   - TF-IDF (tm, ngrams)
#   - BM25 (tm, tm_ngrams)
# Saves PNGs to: /content/drive/MyDrive/webscrape_links/viz/heatmaps/
# Also displays each heatmap inline (one figure per chart).
# ============================================================

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Root folder that the matrix/pairs CSV paths in this cell are built from.
BASE = Path("/content/drive/MyDrive/webscrape_links")
# Heatmap PNGs are written here; the folder (and its parents) is created if missing.
OUTDIR = BASE / "viz" / "heatmaps"
OUTDIR.mkdir(parents=True, exist_ok=True)

# ---- Helper: reconstruct a symmetric matrix from pairs CSV ----
def matrix_from_pairs(pairs_path: Path, doc_id_cols=("doc_id_1","doc_id_2"), value_col="cosine"):
    """Rebuild a symmetric doc×doc matrix from a long-form pairs CSV.

    Args:
        pairs_path: CSV with two doc-id columns and one numeric value column.
        doc_id_cols: Names of the two doc-id columns.
        value_col: Name of the value column; unparsable or empty values become 0.0.

    Returns:
        DataFrame indexed and labelled by the sorted doc ids, with 1.0 on the
        diagonal and each pair value mirrored across it (float32).
    """
    # keep_default_na=False: ids such as "NA" or "null" must stay strings.
    # Otherwise pandas turns them into NaN and sorted() fails on mixed types.
    dfp = pd.read_csv(pairs_path, dtype=str, keep_default_na=False)
    values = (
        pd.to_numeric(dfp[value_col], errors="coerce")
        .fillna(0.0)
        .to_numpy(dtype=np.float32)
    )
    left, right = dfp[doc_id_cols[0]], dfp[doc_id_cols[1]]

    doc_ids = sorted(set(left).union(right))
    idx = {d: i for i, d in enumerate(doc_ids)}
    row_pos = left.map(idx).to_numpy(dtype=np.intp)
    col_pos = right.map(idx).to_numpy(dtype=np.intp)

    # Start from the identity (self-similarity 1.0), then write both triangles.
    M = np.eye(len(doc_ids), dtype=np.float32)
    M[row_pos, col_pos] = values
    M[col_pos, row_pos] = values
    return pd.DataFrame(M, index=doc_ids, columns=doc_ids)

# ---- Helper: normalize a matrix to [0,1] using off-diagonal min/max ----
def minmax_offdiag(df_mat: pd.DataFrame):
    """Min-max scale a similarity matrix to [0, 1] using off-diagonal values only.

    The minimum and maximum are taken over the finite off-diagonal entries, so
    the diagonal and any NaN/inf cells do not distort the scale. The diagonal
    of the result is set to 1.0. The input frame is not modified.

    Args:
        df_mat: Square doc×doc matrix of numeric values.

    Returns:
        A new DataFrame with the same labels, or `df_mat` itself when there is
        no finite off-diagonal value to scale by (e.g. a 1×1 matrix).
    """
    arr = df_mat.values.astype(float)  # astype returns a copy
    off_mask = ~np.eye(arr.shape[0], arr.shape[1], dtype=bool)
    off_values = arr[off_mask]
    finite_off = off_values[np.isfinite(off_values)]
    if finite_off.size == 0:
        return df_mat

    mn, mx = float(finite_off.min()), float(finite_off.max())
    # Constant off-diagonal: avoid dividing by zero, everything maps to 0.
    rng = (mx - mn) if mx > mn else 1.0

    scaled = (arr - mn) / rng
    scaled[~off_mask] = 1.0
    return pd.DataFrame(scaled, index=df_mat.index, columns=df_mat.columns)

# ---- Spec for each output we'll try to plot ----
# Each entry: (title, kind, matrix_csv, pairs_csv, pairs_value_col)
# kind: 'cosine' (fixed [0,1] scale) or 'bm25' (normalize)
# The matrix CSV is preferred when it exists; the pairs CSV is the fallback
# and `pairs_value_col` names the column read from it. For the BM25 entries
# that column is `bm25_norm`, so a matrix rebuilt from pairs is not
# normalized a second time by the plotting loop.
specs = [
    ("Semantic Similarity • MiniLM", "cosine",
     BASE / "semantic_sim_matrix_minilm.csv",
     BASE / "semantic_sim_pairs_minilm.csv", "cosine"),
    ("Semantic Similarity • MPNet", "cosine",
     BASE / "semantic_sim_matrix_mpnet.csv",
     BASE / "semantic_sim_pairs_mpnet.csv", "cosine"),
    ("TF-IDF Cosine • tm (unigrams)", "cosine",
     BASE / "tfidf_cosine_matrix_tm.csv",
     BASE / "tfidf_cosine_pairs_tm.csv", "cosine"),
    ("TF-IDF Cosine • n-grams", "cosine",
     BASE / "tfidf_cosine_matrix.csv",
     BASE / "tfidf_cosine_pairs.csv", "cosine"),
    ("BM25 • tm (normalized for plot)", "bm25",
     BASE / "bm25_matrix_tm.csv",
     BASE / "bm25_pairs_tm.csv", "bm25_norm"),
    ("BM25 • tm_n-grams (normalized for plot)", "bm25",
     BASE / "bm25_matrix_tm_ngrams.csv",
     BASE / "bm25_pairs_tm_ngrams.csv", "bm25_norm"),
]

made_any = False
for title, kind, mpath, ppath, pcol in specs:
    fig = None
    try:
        # Prefer the matrix CSV; otherwise rebuild the matrix from the pairs CSV.
        if mpath.exists():
            df_mat = pd.read_csv(mpath, index_col=0)
            # BM25 matrix files hold raw scores: normalize for visual comparability.
            if kind == "bm25":
                df_mat = minmax_offdiag(df_mat)
            out_stem = mpath.stem
        elif ppath.exists():
            df_mat = matrix_from_pairs(ppath, value_col=pcol)
            # `bm25_norm` is already min-max scaled; only raw columns need it.
            if kind == "bm25" and pcol != "bm25_norm":
                df_mat = minmax_offdiag(df_mat)
            out_stem = ppath.stem
        else:
            # Neither file exists for this spec: nothing to plot.
            continue

        # Both kinds are drawn on the same fixed [0, 1] colour scale; only the
        # colour-bar label differs.
        cbar_label = "Cosine similarity" if kind == "cosine" else "Similarity (normalized)"

        # Plot single-chart heatmap (no custom colors); size grows with the matrix.
        fig = plt.figure(figsize=(max(10, 0.6 * df_mat.shape[1]), max(8, 0.6 * df_mat.shape[0])))
        im = plt.imshow(df_mat.values, aspect="auto", interpolation="nearest", vmin=0.0, vmax=1.0)
        plt.xticks(range(df_mat.shape[1]), df_mat.columns, rotation=90)
        plt.yticks(range(df_mat.shape[0]), df_mat.index)
        plt.colorbar(im, fraction=0.046, pad=0.04, label=cbar_label)
        plt.title(title)
        plt.tight_layout()

        # Save PNG (named after whichever CSV was used), then show inline.
        out_png = OUTDIR / (out_stem + ".png")
        plt.savefig(out_png, dpi=150)
        plt.show()   # one figure per chart
        print(f"[OK] Saved heatmap → {out_png}")
        made_any = True
    except Exception as e:
        # Best effort: one broken input must not stop the remaining charts.
        print(f"[WARN] Skipped '{title}': {e}")
    finally:
        # Release the figure even when plotting failed part-way, so figures
        # do not accumulate across loop iterations.
        if fig is not None:
            plt.close(fig)

if not made_any:
    print("No expected similarity outputs were found. Generate matrices/pairs first (semantic / TF-IDF / BM25).")


This section runs LDA topic modeling across your corpus (using your preprocessed tokens), then gives you per-document topics and topic summaries. It also saves a doc–topic heatmap.

Input: /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv

Text used: prefers text_clean_tm_ngrams (phrases), falls back to text_clean_tm

Auto-selects number of topics K by trying a small range and picking the best perplexity

Outputs (all in /content/drive/MyDrive/webscrape_links/topics/):

lda_topics_summary.csv — one row per topic with top terms

lda_doc_topics.csv — one row per document with dominant topic + top3

lda_doc_topic_matrix.csv — full doc×topic probabilities

viz/lda_doc_topic_heatmap.png — heatmap of doc×topic (also shown inline)

# ============================================================
# LDA Topic Modeling (per-document assignments + topic summaries)
# Reads:  /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# Uses:   text_clean_tm_ngrams (preferred) or text_clean_tm
# Writes: /content/drive/MyDrive/webscrape_links/topics/
#         - lda_topics_summary.csv
#         - lda_doc_topics.csv
#         - lda_doc_topic_matrix.csv
#         - viz/lda_doc_topic_heatmap.png (shown inline)
# ============================================================


In [None]:
# ============================================================
# LDA Topic Modeling (per-document assignments + topic summaries)
# Reads:  /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# Uses:   text_clean_tm_ngrams (preferred) or text_clean_tm
# Writes: /content/drive/MyDrive/webscrape_links/topics/
#         - lda_topics_summary.csv
#         - lda_doc_topics.csv
#         - lda_doc_topic_matrix.csv
#         - viz/lda_doc_topic_heatmap.png (shown inline)
# ============================================================


from pathlib import Path
import csv, math, numpy as np, pandas as pd
import matplotlib.pyplot as plt

# sklearn
try:
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
except Exception:
    !pip -q install scikit-learn==1.5.1
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

# -------------------------
# Paths
# -------------------------
BASE = Path("/content/drive/MyDrive/webscrape_links")
IN_PATH = BASE / "master_links_preprocessed.csv"
OUT_DIR = BASE / "topics"
VIZ_DIR = OUT_DIR / "viz"
for out_folder in (OUT_DIR, VIZ_DIR):
    out_folder.mkdir(parents=True, exist_ok=True)

# -------------------------
# Load data
# -------------------------
df = pd.read_csv(IN_PATH, dtype=str, keep_default_na=False, quoting=csv.QUOTE_ALL)
assert "doc_id" in df.columns, "Missing 'doc_id' in master."

# Prefer the n-gram column when it exists and has at least one non-blank row;
# otherwise fall back to the unigram column.
ngrams_usable = (
    "text_clean_tm_ngrams" in df.columns
    and df["text_clean_tm_ngrams"].str.strip().any()
)
if not ngrams_usable:
    assert "text_clean_tm" in df.columns, "Missing 'text_clean_tm' and 'text_clean_tm_ngrams'."
    SRC_COL, token_pattern = "text_clean_tm", r"(?u)\b\w+\b"
else:
    # keep underscores for phrases
    SRC_COL, token_pattern = "text_clean_tm_ngrams", r"(?u)\b\w[\w_]+\b"

docids = df["doc_id"].fillna("").astype(str).tolist()
texts = df[SRC_COL].fillna("").astype(str).tolist()
N = len(texts)
print(f"[INFO] Source column: {SRC_COL} | docs={N}")

# -------------------------
# Vectorize (bag of words)
# -------------------------
# The text is already cleaned upstream, so no lowercasing or stopword removal
# is applied here; terms present in more than 95% of documents are dropped.
vectorizer = CountVectorizer(
    token_pattern=token_pattern,
    lowercase=False,
    max_df=0.95,
    min_df=1,
)
X = vectorizer.fit_transform(texts)   # shape: (N_docs, Vocab)
vocab = np.array(vectorizer.get_feature_names_out())
print(f"[INFO] Vocab size: {len(vocab)}")

# Guard: stop with a clear message if no terms survived vectorization.
n_terms = X.shape[1]
if n_terms == 0:
    raise RuntimeError("Vectorizer produced an empty vocabulary. Check your preprocessed text columns.")

# -------------------------
# Choose K via quick perplexity search
# -------------------------
def choose_k(X, k_candidates, max_iter=20, random_state=42):
    """Pick the topic count with the lowest perplexity among `k_candidates`.

    One batch LDA model is fitted per candidate and scored with sklearn's
    perplexity on the same matrix it was fitted on (lower is better).

    Args:
        X: Document-term count matrix.
        k_candidates: Iterable of topic counts to try.
        max_iter: LDA iterations per candidate (kept small for a quick search).
        random_state: Seed passed to each LDA model for reproducibility.

    Returns:
        (best_k, best_perplexity).

    Raises:
        ValueError: If no candidate produced a perplexity below infinity
            (empty candidate list, or only NaN/inf scores).
    """
    best_k, best_pp = None, float("inf")
    for k in k_candidates:
        lda = LatentDirichletAllocation(
            n_components=k,
            learning_method="batch",
            max_iter=max_iter,
            random_state=random_state,
            evaluate_every=-1,
            doc_topic_prior=None,    # default 1/k
            topic_word_prior=None,   # default 1/k
        )
        lda.fit(X)
        # Note: sklearn's perplexity: lower is better
        pp = lda.perplexity(X)
        print(f"  K={k:<2} perplexity={pp:.2f}")
        # A NaN perplexity compares False here and is therefore never selected.
        if pp < best_pp:
            best_k, best_pp = k, pp
    if best_k is None:
        # Fail here with a clear message instead of passing K=None to the final fit.
        raise ValueError("No topic count could be selected: no candidate produced a finite perplexity.")
    return best_k, best_pp

# Candidate range depends on corpus size: at least K=2 for tiny corpora,
# otherwise 3..min(10, N).
if N <= 6:
    k_cands = list(range(2, max(3, N)))     # tiny corpora
else:
    k_cands = list(range(3, min(10, N) + 1))

print("[INFO] Selecting K (topics) via perplexity…")
best_k, best_pp = choose_k(X, k_cands)
print(f"[INFO] Selected K={best_k} (perplexity={best_pp:.2f})")

# -------------------------
# Final LDA fit (more iterations)
# -------------------------
lda = LatentDirichletAllocation(
    evaluate_every=-1,
    random_state=42,
    max_iter=100,
    learning_method="batch",
    n_components=best_k,
).fit(X)

# Topic-word weights, shape (K, V): scale each row so it sums to 1.
raw_topic_word = lda.components_
topic_word = raw_topic_word / raw_topic_word.sum(axis=1, keepdims=True)

# Doc-topic weights, shape (N, K): replace non-finite values with 0, then
# renormalize each row. Rows that sum to 0 are divided by 1 and stay all-zero.
doc_topic = np.nan_to_num(lda.transform(X), nan=0.0, posinf=0.0, neginf=0.0)
row_sums = doc_topic.sum(axis=1, keepdims=True)
row_sums = np.where(row_sums == 0, 1.0, row_sums)
doc_topic = doc_topic / row_sums

# -------------------------
# Summaries
# -------------------------
def top_terms_for_topic(topic_idx, topn=15):
    """Return the `topn` highest-weighted (term, probability) pairs of one topic.

    Reads the module-level `topic_word` matrix and `vocab` array; pairs are
    ordered from highest to lowest weight.
    """
    topic_weights = topic_word[topic_idx]
    descending = np.argsort(topic_weights)[::-1]
    chosen = descending[:topn]
    return list(zip(vocab[chosen], topic_weights[chosen]))

# Topic summary table: one row per topic with its 15 strongest terms.
terms_per_topic = [top_terms_for_topic(topic_id, topn=15) for topic_id in range(best_k)]
rows = [
    {
        "topic_id": topic_id,
        "top_terms": ", ".join([term for term, _ in term_pairs]),
        "top_terms_with_weights": "; ".join([f"{w}:{p:.4f}" for w, p in term_pairs]),
    }
    for topic_id, term_pairs in enumerate(terms_per_topic)
]
topics_summary = pd.DataFrame(rows)

# Per-document assignments: dominant topic plus the three strongest topics.
doc_rows = []
for doc_id, dist in zip(docids, doc_topic):
    ranked = np.argsort(dist)[::-1]          # topic ids, strongest first
    dom = int(ranked[0])
    dom_prob = float(dist[dom])
    top3 = [(int(topic), float(dist[topic])) for topic in ranked[:3]]
    doc_rows.append({
        "doc_id": doc_id,
        "dominant_topic": dom,
        "dominant_prob": f"{dom_prob:.4f}",
        "top3_topics": "; ".join([f"{t}:{p:.4f}" for t, p in top3]),
    })
doc_topics = pd.DataFrame(doc_rows)

# Full doc-topic matrix with labels: doc_id first, then one column per topic.
col_names = [f"topic_{t}" for t in range(best_k)]
doc_topic_df = (
    pd.DataFrame(doc_topic, columns=col_names, index=docids)
    .reset_index()
    .rename(columns={"index":"doc_id"})
)

# -------------------------
# Save outputs
# -------------------------
# Shared CSV settings: no index column, UTF-8, every field quoted, LF line endings.
csv_options = dict(index=False, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
for frame, file_name in (
    (topics_summary, "lda_topics_summary.csv"),
    (doc_topics, "lda_doc_topics.csv"),
    (doc_topic_df, "lda_doc_topic_matrix.csv"),
):
    frame.to_csv(OUT_DIR / file_name, **csv_options)

print(f"[OK] Saved topics → {OUT_DIR / 'lda_topics_summary.csv'}")
print(f"[OK] Saved per-doc topics → {OUT_DIR / 'lda_doc_topics.csv'}")
print(f"[OK] Saved doc-topic matrix → {OUT_DIR / 'lda_doc_topic_matrix.csv'}")

# -------------------------
# Heatmap (doc × topic)
# -------------------------
# Figure grows with the number of topics (width) and documents (height).
heatmap_size = (max(10, 0.6 * best_k), max(8, 0.6 * N))
topic_ticks = range(best_k)
topic_labels = [f"T{t}" for t in range(best_k)]

plt.figure(figsize=heatmap_size)
im = plt.imshow(doc_topic, aspect="auto", interpolation="nearest", vmin=0.0, vmax=1.0)
plt.xticks(topic_ticks, topic_labels)
plt.yticks(range(N), docids)
plt.colorbar(im, fraction=0.046, pad=0.04, label="Topic probability")
plt.title(f"LDA Doc–Topic Heatmap (K={best_k})")
plt.tight_layout()
out_png = VIZ_DIR / "lda_doc_topic_heatmap.png"
plt.savefig(out_png, dpi=150)
plt.show()

print(f"[OK] Heatmap saved → {out_png}")


The next cell runs BERTopic, writes per-topic & per-document outputs, and produces two matplotlib heatmaps:

Doc × Topic probabilities (lda-like)

Topic × Topic cosine similarity (via c-TF-IDF)

In [None]:
# ============================================================
# BERTopic (robust, no Transformers): TF-IDF -> SVD embeddings
# Corpus: /content/drive/MyDrive/webscrape_links/master_links_preprocessed.csv
# Text:   prefers text_clean_tm_ngrams, else text_clean_tm
# Outputs:
#   CSVs → /content/drive/MyDrive/webscrape_links/topics/
#     - bertopic_topic_info.csv
#     - bertopic_doc_topics.csv
#     - bertopic_doc_topic_matrix.csv
#   Heatmaps (PNG) → /content/drive/MyDrive/webscrape_links/topics/viz/
#     - bertopic_doc_topic_heatmap.png
#     - bertopic_topic_similarity_heatmap.png
# ============================================================

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Clean, minimal deps (no transformers/sentence-transformers headaches)
!pip -q install -U "bertopic==0.16.3" "umap-learn==0.5.6" "hdbscan==0.8.33" "scikit-learn==1.5.1"

from pathlib import Path
import csv, numpy as np, pandas as pd, matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

import umap
import hdbscan
from bertopic import BERTopic

# -------------------------
# Paths
# -------------------------
BASE    = Path("/content/drive/MyDrive/webscrape_links")
IN_PATH = BASE / "master_links_preprocessed.csv"
OUT_DIR = BASE / "topics"
VIZ_DIR = OUT_DIR / "viz"
for out_folder in (OUT_DIR, VIZ_DIR):
    out_folder.mkdir(parents=True, exist_ok=True)

# -------------------------
# Load data
# -------------------------
df = pd.read_csv(IN_PATH, dtype=str, keep_default_na=False, quoting=csv.QUOTE_ALL)
assert "doc_id" in df.columns, "Missing 'doc_id' in master."

# Prefer the n-gram column when it exists and has at least one non-blank row.
ngrams_usable = (
    "text_clean_tm_ngrams" in df.columns
    and df["text_clean_tm_ngrams"].str.strip().any()
)
if not ngrams_usable:
    assert "text_clean_tm" in df.columns, "Need 'text_clean_tm_ngrams' or 'text_clean_tm'."
    SRC_COL, token_pattern = "text_clean_tm", r"(?u)\b\w+\b"
else:
    # keep n-grams with underscores
    SRC_COL, token_pattern = "text_clean_tm_ngrams", r"(?u)\b\w[\w_]+\b"

docids = df["doc_id"].fillna("").astype(str).tolist()
docs   = df[SRC_COL].fillna("").astype(str).tolist()
N = len(docs)
print(f"[INFO] Using column: {SRC_COL} | docs={N}")

# -------------------------
# TF-IDF -> SVD Embeddings (dense, low-dim)
# -------------------------
vectorizer = TfidfVectorizer(
    token_pattern=token_pattern,
    lowercase=False,
    max_df=0.95,
    min_df=1,
)
X = vectorizer.fit_transform(docs)   # (N_docs, V)
n_docs, n_terms = X.shape
if n_terms == 0:
    raise RuntimeError("Empty vocabulary from TF-IDF. Check preprocessing.")

# Dimension: one less than the smaller matrix side, but at least 2 and at most 300.
svd_dim = int(min(300, max(2, min(n_docs - 1, n_terms - 1))))
svd = TruncatedSVD(n_components=svd_dim, random_state=42)
emb = normalize(svd.fit_transform(X))   # (N_docs, svd_dim), L2-normalized rows
print(f"[INFO] Built SVD embeddings: shape={emb.shape}")

# -------------------------
# BERTopic with explicit UMAP/HDBSCAN, no topic reduction (avoids KeyError)
# -------------------------
# Imported here because this cell's import list only brings in TfidfVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Neighbourhood size and output dimension are capped so they stay valid for
# small corpora and low-dimensional SVD embeddings.
umap_model = umap.UMAP(
    n_neighbors=min(15, max(2, N-1)),
    n_components=min(10, svd_dim),
    metric="cosine",
    random_state=42,
    low_memory=True,
    verbose=False,
)
# Smallest possible clusters (2 documents), suited to a small corpus.
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=2,
    min_samples=1,
    prediction_data=True,   # needed for probabilities
    gen_min_span_tree=False
)

# BERTopic's `vectorizer_model` is meant to be a CountVectorizer: BERTopic
# applies its own class-based TF-IDF (c-TF-IDF) on top of the raw counts.
# Passing a TfidfVectorizer here would weight terms by TF-IDF twice.
# NOTE(review): min_df/max_df are presumably evaluated per topic rather than
# per document inside BERTopic — confirm the 0.95 cut-off is still wanted.
vectorizer_model = CountVectorizer(
    lowercase=False,
    token_pattern=token_pattern,
    min_df=1,
    max_df=0.95,
)

def run_bertopic(calc_probs: bool = True):
    """Build a BERTopic model from the module-level components and fit it.

    Reads the module-level ``docs``, ``emb``, ``umap_model``, ``hdbscan_model``
    and ``vectorizer_model``.

    Args:
        calc_probs: Forwarded to BERTopic as ``calculate_probabilities``.

    Returns:
        A ``(model, topics, probs)`` tuple, where ``topics`` and ``probs`` are
        the two values returned by ``model.fit_transform``.
    """
    bertopic_options = {
        "language": "english",
        "umap_model": umap_model,
        "hdbscan_model": hdbscan_model,
        "vectorizer_model": vectorizer_model,
        "min_topic_size": 2,
        "nr_topics": None,  # <-- disable 'auto' reduction to avoid KeyError
        "calculate_probabilities": calc_probs,
        "verbose": True,
    }
    fitted_model = BERTopic(**bertopic_options)
    # Precomputed SVD embeddings are supplied, so no embedding model is needed.
    assignments, probabilities = fitted_model.fit_transform(docs, embeddings=emb)
    return fitted_model, assignments, probabilities

# First attempt asks for the full probability output; if that path raises a
# KeyError, refit without probabilities and mark them as unavailable.
try:
    topic_model, topics, probs = run_bertopic(calc_probs=True)
except KeyError as err:
    print(f"[WARN] Probabilities path raised {err}. Retrying without probabilities...")
    topic_model, topics, _discarded_probs = run_bertopic(calc_probs=False)
    probs = None  # downstream code treats None as "no probability matrix"

# Persist the per-topic summary table.
topic_info_path = OUT_DIR / "bertopic_topic_info.csv"
topic_info = topic_model.get_topic_info()
topic_info.to_csv(
    topic_info_path,
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
    lineterminator="\n",
)
print(f"[OK] Saved topic info → {topic_info_path}")

# -------------------------
# Per-document topic assignments
# -------------------------
def _topic_probability_text(row_idx, topic_id):
    """Return the assigned topic's probability as a 4-decimal string, or ""."""
    # Blank when there are no probabilities, the doc is an outlier (-1),
    # or the topic id has no column in the probability matrix.
    if probs is None or topic_id < 0 or topic_id >= probs.shape[1]:
        return ""
    return f"{float(probs[row_idx, topic_id]):.4f}"


rows = [
    {
        "doc_id": doc_id,
        "topic": int(topics[i]),  # -1 = outlier
        "topic_probability": _topic_probability_text(i, int(topics[i])),
    }
    for i, doc_id in enumerate(docids)
]
doc_topics = pd.DataFrame(rows)

doc_topics_path = OUT_DIR / "bertopic_doc_topics.csv"
doc_topics.to_csv(
    doc_topics_path,
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
    lineterminator="\n",
)
print(f"[OK] Saved doc topics → {doc_topics_path}")

# -------------------------
# Doc × Topic probability matrix (+ fallback if probs=None)
# -------------------------
# A usable probability matrix must be 2-D (docs × topics). `probs` can be None
# (retry path) and may also come back 1-D — hdbscan's soft-clustering helper
# returns a flat array when it finds no clusters — in which case `shape[1]`
# would raise IndexError. Both cases use the one-hot fallback.
has_prob_matrix = probs is not None and getattr(probs, "ndim", 0) == 2

if has_prob_matrix:
    n_topics = probs.shape[1]
    col_names = [f"T{t}" for t in range(n_topics)]
    doc_topic_df = pd.DataFrame(probs, columns=col_names, index=docids).reset_index().rename(columns={"index":"doc_id"})
else:
    # Fallback: one-hot memberships (1 at assigned topic, 0 else), include -1 column if present
    valid = sorted(set(topics))
    topic_to_col = {t: f"T{t}" for t in valid}
    # Dict lookup instead of list.index() inside the loop (was O(N * n_topics)).
    topic_to_pos = {t: j for j, t in enumerate(valid)}
    mat = np.zeros((N, len(valid)), dtype=float)
    for i, t in enumerate(topics):
        mat[i, topic_to_pos[t]] = 1.0
    doc_topic_df = pd.DataFrame(mat, columns=[topic_to_col[t] for t in valid], index=docids).reset_index().rename(columns={"index":"doc_id"})
    n_topics = len(valid)
doc_topic_df.to_csv(OUT_DIR / "bertopic_doc_topic_matrix.csv", index=False, encoding="utf-8", quoting=csv.QUOTE_ALL, lineterminator="\n")
print(f"[OK] Saved doc-topic matrix → {OUT_DIR / 'bertopic_doc_topic_matrix.csv'}")

# -------------------------
# Heatmap 1: Doc × Topic
# -------------------------
# The figure grows by 0.6 inch per document/topic. Uncapped, a large corpus
# asks for an image beyond what Matplotlib's raster backend can render
# (each side must stay under 2**16 pixels; saved at dpi=150 here), so both
# sides are capped. Below the cap the layout is unchanged.
MAX_FIG_INCHES = 200.0
M = doc_topic_df.drop(columns=["doc_id"]).values
fig_w = min(MAX_FIG_INCHES, max(10, 0.6 * n_topics))
fig_h = min(MAX_FIG_INCHES, max(8, 0.6 * N))

# Label every k-th column/row only when the cap squeezed the figure;
# the step is 1 (every label) whenever the figure is at its natural size.
x_step = max(1, int(np.ceil(0.6 * n_topics / fig_w)))
y_step = max(1, int(np.ceil(0.6 * N / fig_h)))
topic_labels = list(doc_topic_df.columns[1:])

# The matrix holds real probabilities only when `probs` is a 2-D array;
# otherwise it is the 0/1 membership fallback.
shows_probabilities = probs is not None and getattr(probs, "ndim", 0) == 2

plt.figure(figsize=(fig_w, fig_h))
im = plt.imshow(M, aspect="auto", interpolation="nearest", vmin=0.0, vmax=1.0)
plt.xticks(range(0, n_topics, x_step), topic_labels[::x_step], rotation=0)
plt.yticks(range(0, N, y_step), docids[::y_step])
plt.colorbar(im, fraction=0.046, pad=0.04, label=("Topic probability" if shows_probabilities else "Membership (0/1)"))
plt.title("BERTopic • Doc–Topic Heatmap")
plt.tight_layout()
out_png = VIZ_DIR / "bertopic_doc_topic_heatmap.png"
plt.savefig(out_png, dpi=150)
plt.show()
print(f"[OK] Heatmap saved → {out_png}")

# -------------------------
# Heatmap 2: Topic × Topic similarity (c-TF-IDF cosine)
# -------------------------
# Topics to display: everything in the topic table except the outlier bucket.
valid_topics = [tid for tid in topic_info["Topic"].tolist() if tid != -1]
ctf = getattr(topic_model, "c_tf_idf_", None)
if len(valid_topics) >= 2 and ctf is not None:
    # Densify sparse matrices; anything else is coerced to an ndarray.
    ctf = ctf.toarray() if hasattr(ctf, "toarray") else np.asarray(ctf)

    # Align rows of cTF-IDF to valid topic ids.
    # c_tf_idf_ has one row per topic in ascending topic-id order, and that
    # includes the outlier topic -1 when it exists. The row map must therefore
    # be built from ALL topic ids; dropping -1 first would shift every topic
    # onto the previous topic's row.
    row_topic_ids = sorted(topic_model.get_topics().keys())
    topic_to_row = {tid: i for i, tid in enumerate(row_topic_ids)}
    # Keep only topics that resolve to an existing row, so matrix and tick
    # labels always have the same length.
    aligned_topics = [
        t for t in valid_topics
        if t in topic_to_row and topic_to_row[t] < ctf.shape[0]
    ]
    rows_idx = [topic_to_row[t] for t in aligned_topics]
    if rows_idx:
        ctf_sel = ctf[rows_idx, :]
        sim_tt = cosine_similarity(ctf_sel)
        side = max(8, 0.6 * len(aligned_topics))
        plt.figure(figsize=(side, side))
        im2 = plt.imshow(sim_tt, aspect="auto", interpolation="nearest", vmin=0.0, vmax=1.0)
        xt = [f"T{t}" for t in aligned_topics]
        plt.xticks(range(len(aligned_topics)), xt, rotation=0)
        plt.yticks(range(len(aligned_topics)), xt)
        plt.colorbar(im2, fraction=0.046, pad=0.04, label="Cosine similarity")
        plt.title("BERTopic • Topic–Topic Similarity (c-TF-IDF)")
        plt.tight_layout()
        out_png2 = VIZ_DIR / "bertopic_topic_similarity_heatmap.png"
        plt.savefig(out_png2, dpi=150)
        plt.show()
        print(f"[OK] Heatmap saved → {out_png2}")
    else:
        print("[WARN] Could not align topics to c-TF-IDF rows; skipping topic-topic heatmap.")
else:
    print("[WARN] Not enough valid topics or missing c_tf_idf_; skipping topic-topic heatmap.")

# -------------------------
# Quick peek at topics
# -------------------------
print("\nTop terms per topic:")
for topic_id in valid_topics:
    # get_topic may return a falsy value for an unknown id; treat that as "no terms".
    ranked_terms = topic_model.get_topic(topic_id) or []
    leading_words = [word for word, _score in ranked_terms[:10]]
    print(f" T{topic_id}: {', '.join(leading_words)}")
