# AbstractAna — no‑LLM fallback

This notebook tags arXiv abstracts **without** relying on a local LLM server. It uses a **TF‑IDF n‑gram** extractor to generate concise tags per abstract and writes them to `papers.llm_tags` in the SQLite DB.

If you later set up an OpenAI‑compatible endpoint (vLLM, LM Studio, or Ollama in compatibility mode), you can flip the `USE_LLM` switch to use it.


In [None]:
# --- Config ---
from pathlib import Path
# SQLite database file that contains the `papers` table read and updated below.
DB_PATH = Path('/Users/wenzheng/Desktop/LLM CS quant/ZZW-LLM/RAGAnalyzer/arxiv.db')  # <-- update to point at your own arxiv.db
USE_LLM = False           # set True to attempt OpenAI-compatible server; otherwise fallback TF-IDF
OPENAI_BASE_URL = 'http://127.0.0.1:8889/v1'  # change if you enable an LLM server; only pinged when USE_LLM is True
OPENAI_MODEL = '/models/Qwen3-8B'  # model identifier; not referenced by the TF-IDF path in the visible cells
MAX_TAGS = 20             # upper bound on the number of tags kept per abstract
LIMIT = 200               # max number of untagged papers loaded per run; None for all
print(DB_PATH.resolve())  # echo the resolved path so a wrong location is noticed early


In [None]:
# --- Imports & helpers ---
import sqlite3, re, json
from typing import List, Tuple
import requests

# Matches one <think>...</think> span (non-greedy, may cross newlines) plus
# any whitespace that directly follows the closing tag.
_THINK_BLOCK_RE = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)


def remove_think_tag(text: str) -> str:
    """Return *text* with every ``<think>...</think>`` span removed.

    Whitespace immediately after each closing tag is removed as well. An
    opening ``<think>`` without a matching ``</think>`` is left untouched.
    """
    return _THINK_BLOCK_RE.sub("", text)

def ensure_llm_tags_column(con: sqlite3.Connection):
    """Add an ``llm_tags TEXT`` column to the ``papers`` table if it is missing.

    Prints a one-line status message either way. When the column has to be
    added, the change is committed on *con* before returning.
    """
    # Each PRAGMA table_info row describes one column; index 1 is its name.
    existing_columns = {info[1] for info in con.execute("PRAGMA table_info(papers)")}

    if 'llm_tags' in existing_columns:
        print("[init] llm_tags column already present")
        return

    con.execute("ALTER TABLE papers ADD COLUMN llm_tags TEXT")
    con.commit()
    print("[init] Added llm_tags TEXT to papers")

def load_papers(con: sqlite3.Connection, limit: int | None = None) -> List[Tuple[str, str, str]]:
    cur = con.cursor()
    q = "SELECT id, title, summary FROM papers WHERE summary IS NOT NULL AND trim(summary) <> '' AND (llm_tags IS NULL OR trim(llm_tags) = '') ORDER BY published DESC"
    if limit:
        q += f" LIMIT {int(limit)}"
    cur.execute(q)
    return cur.fetchall()

def try_ping_llm(base_url: str) -> bool:
    """Report whether ``<base_url>/models`` answers with a successful status.

    This is a best-effort probe with a 2-second timeout: any exception raised
    while building or sending the request is treated as "server unavailable"
    and yields ``False`` rather than propagating.
    """
    try:
        response = requests.get(f"{base_url.rstrip('/')}/models", timeout=2)
    except Exception:
        return False
    return response.ok


## TF‑IDF tagger (fallback)
Extract the top n‑gram phrases (1–3 words each) per abstract using a TF‑IDF vectorizer. This is a quick, dependency‑light alternative to LLM tagging.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Simple stopword list; can be extended
# Function words, small number words, citation filler ("et al", "etc") and
# lowercase roman numerals. The literal must be closed with a matching triple
# quote; str.split() with no argument also splits on the embedded newlines.
STOPWORDS = set('''a an the and or but if then else for while to of in on at by from with without within upon about into over under above below between among across through during before after
is are was were be been being have has had do does did can could should would may might must will shall not no nor so than too very just also more most many much few several each every per via using use used
we our us you your they their them he she it its this that these those as such one two three four five six seven eight nine ten et al etal etc i ii iii iv v vi vii viii ix x'''.split())

# One or more consecutive whitespace characters (spaces, tabs, newlines, ...).
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def normalize_text(s: str) -> str:
    """Collapse each whitespace run in *s* to one space and trim both ends."""
    return _WHITESPACE_RUN_RE.sub(" ", s).strip()

def tfidf_tags(docs: list[str], max_tags: int = 20) -> list[list[str]]:
    """Return up to *max_tags* TF-IDF-ranked n-gram tags for each document.

    The corpus is vectorized with 1- to 3-word n-grams. For every document
    the non-zero terms are visited from highest to lowest TF-IDF weight, and
    terms made up solely of stopwords / very short words are skipped.

    Args:
        docs: Raw document texts; whitespace is normalized before fitting.
        max_tags: Maximum number of tags kept per document. Values <= 0
            produce an empty tag list for every document.

    Returns:
        One list of tags per input document, in the same order as *docs*.
        A document with no surviving terms gets an empty list.

    Raises:
        ValueError: Propagated from scikit-learn when no vocabulary remains
            after tokenization and document-frequency pruning.
    """
    if not docs:
        # Fitting a vectorizer on an empty corpus raises; there is nothing to tag.
        return []

    # Document-frequency pruning: drop terms seen in fewer than 2 documents
    # or in more than 90% of them.
    min_df, max_df = 2, 0.9
    if max_df * len(docs) < min_df:
        # With fewer than three documents the two thresholds contradict each
        # other and scikit-learn rejects them, so disable pruning instead.
        min_df, max_df = 1, 1.0

    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),
        lowercase=True,
        token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z0-9_\-]{1,}\b",
        min_df=min_df,
        max_df=max_df,
    )
    X = vectorizer.fit_transform([normalize_text(d) for d in docs])
    vocab = np.array(vectorizer.get_feature_names_out())

    results: list[list[str]] = []
    for i in range(X.shape[0]):
        row = X.getrow(i)
        tags: list[str] = []
        if row.nnz == 0:
            results.append(tags)
            continue

        seen = set()
        # row.indices / row.data are parallel arrays of column ids and weights;
        # argsort on the negated weights walks them in descending TF-IDF order.
        for j in np.argsort(-row.data):
            # Check the cap before appending so max_tags <= 0 yields no tags.
            if len(tags) >= max_tags:
                break
            term = str(vocab[row.indices[j]])  # plain str rather than numpy.str_
            # Skip n-grams in which every word is a stopword or at most 2 chars.
            if all(w in STOPWORDS or len(w) <= 2 for w in term.split()):
                continue
            key = term.lower()
            if key in seen:
                continue
            seen.add(key)
            tags.append(term)
        results.append(tags)
    return results


## Run tagging
If `USE_LLM=True` **and** the server responds at `OPENAI_BASE_URL`, the notebook would call the LLM; in this fallback notebook that path is not implemented and raises `NotImplementedError`. Otherwise, it falls back to TF‑IDF.


In [None]:
import sqlite3

# Tag every paper that has a summary but no tags yet and store the result in
# papers.llm_tags. The connection is closed in `finally` so it is released
# even when a step raises (including the NotImplementedError below).
con = sqlite3.connect(DB_PATH)
try:
    ensure_llm_tags_column(con)
    rows = load_papers(con, limit=LIMIT)
    print(f"Loaded {len(rows)} papers to tag")

    # Each row is (id, title, summary); split into parallel lists.
    ids = [r[0] for r in rows]
    titles = [r[1] for r in rows]
    abstracts = [r[2] for r in rows]

    # Only probe the server when the LLM path was requested.
    use_llm_now = bool(USE_LLM) and try_ping_llm(OPENAI_BASE_URL)
    print(f"LLM available: {use_llm_now}")

    if not rows:
        print("Nothing to tag.")
    elif use_llm_now:
        raise NotImplementedError("LLM path disabled in this fallback notebook. Set USE_LLM=False or use the cleaned LLM notebook.")
    else:
        all_tags = tfidf_tags(abstracts, max_tags=MAX_TAGS)
        cur = con.cursor()
        # Tags are stored as a single comma-separated string per paper.
        cur.executemany(
            "UPDATE papers SET llm_tags = ? WHERE id = ?",
            [(", ".join(tags), pid) for pid, tags in zip(ids, all_tags)],
        )
        con.commit()
        print(f"[write] Wrote tags for {len(ids)} papers using TF-IDF fallback.")
finally:
    con.close()


## Plot: top tags
Quick view of the most frequent tags found in `papers.llm_tags`.


In [None]:
import sqlite3, pandas as pd
import matplotlib.pyplot as plt

# Read every non-blank llm_tags value; close the connection even if the query
# fails (for example when the table or column does not exist yet).
con = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query("SELECT llm_tags FROM papers WHERE llm_tags IS NOT NULL AND trim(llm_tags) <> ''", con)
finally:
    con.close()

if df.empty:
    print("No tags yet. Run the tagging cell first.")
else:
    # llm_tags holds comma-separated tags: split, flatten to one tag per row,
    # and trim the space left after each comma.
    tags = (
        df['llm_tags']
        .str.split(',')
        .explode()
        .str.strip()
    )
    top = tags.value_counts().head(30)
    plt.figure()
    # Sort ascending so the most frequent tag ends up at the top of the barh plot.
    top.sort_values().plot(kind='barh')
    plt.title('Top tags (TF-IDF fallback)')
    plt.xlabel('Count')
    plt.tight_layout()
    plt.show()
