
# Qual — Option C (Remix)
Minimal notebook that loads **World Data 2.0** and runs a few methods:
- **word-frequency** (top unigrams/bigrams)
- **simple-sentiment** (lexicon-style polarity)
- **semantic-embeddings** (KMeans clusters on sentence embeddings; TF-IDF fallback)

> This version **does not auto-build a text column**. The CSV must already contain a text column, named by `TEXT_COL` in the config cell (the name is matched case-insensitively).


In [None]:

# =========================
# CONFIG (EDIT HERE)
# =========================
# Location of the CSV. A regular github.com "blob" page URL is fine here: the
# loading cell passes it through to_raw_github_url() before reading it.
DATA_URL = "https://github.com/zachtilton/intermediate_analytics_ai/blob/main/World%20Data%202.0%20-%20Data.csv"

# Methods to run. Valid names are "word-frequency", "simple-sentiment" and
# "semantic-embeddings"; names are stripped and lower-cased before validation,
# and any other name raises a ValueError in the analysis cell.
# Example: ["word-frequency", "simple-sentiment"]
METHODS = ["word-frequency", "simple-sentiment"]

# Columns
# DOC_ID_COL must match a CSV column name exactly (after header whitespace is
# trimmed); TEXT_COL is tried exactly first and then case-insensitively.
DOC_ID_COL = "Country"   # document id
TEXT_COL = "text"        # REQUIRED existing column name (case-insensitive OK)

# Parameters
TOP_N = 30   # number of rows kept in each of the top-unigram / top-bigram tables
K = 4        # number of KMeans clusters when using semantic-embeddings

print("Config set. Edit METHODS/columns above as needed.")


In [None]:

import pandas as pd
import numpy as np
import re
from collections import Counter

def to_raw_github_url(url: str) -> str:
    """Convert a github.com file URL into its raw.githubusercontent.com form.

    Only URLs whose host is exactly ``github.com`` or ``www.github.com`` are
    rewritten; anything else (already-raw URLs, gists, other hosts that merely
    mention "github.com" somewhere in the string) is returned unchanged.

    Args:
        url: Absolute URL, typically ``https://github.com/<owner>/<repo>/blob/<ref>/<path>``.

    Returns:
        The equivalent raw-content URL, or ``url`` itself when it is not a
        github.com URL. Query string and fragment are preserved.
    """
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    if parts.netloc.lower() not in {"github.com", "www.github.com"}:
        return url

    # "/owner/repo/blob/ref/dir/file" -> ["", owner, repo, "blob", ref, ...].
    # Drop only the view marker right after the repo name, so a real
    # directory called "blob" deeper in the path is left intact.
    segments = parts.path.split("/")
    if len(segments) > 3 and segments[3] in ("blob", "raw"):
        del segments[3]

    return urlunsplit(
        (parts.scheme, "raw.githubusercontent.com", "/".join(segments), parts.query, parts.fragment)
    )

# Lower-case English function words that tokenize() discards.
STOPWORDS = {
    'a','an','the','and','or','but','if','then','else','of','to','in','on','for','with',
    'is','are','was','were','be','been','being','at','by','from','as','it','this','that',
    'these','those','there','here','we','you','they','he','she','them','his','her','their',
    'i','me','my','our','ours','your','yours','us'
}

# A token is a run of two or more ASCII letters; digits, punctuation and
# single letters never match.
TOKEN_RE = re.compile(r"[a-zA-Z]{2,}")

def tokenize(text):
    """Split ``text`` into lower-cased word tokens with stopwords removed.

    Args:
        text: The document text. Any non-string value (``None``, a pandas
            NaN float for a missing cell, ...) is treated as an empty document.

    Returns:
        List of lower-cased tokens, in order of appearance, excluding any
        token found in ``STOPWORDS``.
    """
    # NaN is a truthy float, so an `or ''` guard lets it through to the regex
    # where it raises TypeError; an explicit type check covers it.
    if not isinstance(text, str):
        return []
    lowered = (match.lower() for match in TOKEN_RE.findall(text))
    return [tok for tok in lowered if tok not in STOPWORDS]

def make_bigrams(tokens):
    """Return every pair of adjacent tokens joined by a single space.

    A sequence with fewer than two tokens yields an empty list.
    """
    adjacent_pairs = zip(tokens, tokens[1:])
    return [f"{left} {right}" for left, right in adjacent_pairs]

# Tiny polarity lexicon: token -> signed weight (positive words > 0,
# negative words < 0). Tokens not listed here contribute nothing.
SENTIMENT = {
    'good': 2,
    'great': 3,
    'excellent': 4,
    'positive': 2,
    'benefit': 2,
    'improve': 2,
    'bad': -2,
    'poor': -2,
    'negative': -2,
    'harm': -3,
    'worse': -2,
    'risk': -1,
}

def doc_sentiment(tokens):
    """Add up the SENTIMENT weights of all tokens; unknown tokens count as 0."""
    score = 0
    for word in tokens:
        if word in SENTIMENT:
            score += SENTIMENT[word]
    return score

# Display settings for DataFrame output: line width of 120 characters and up
# to 50 columns shown.
pd.options.display.width = 120
pd.options.display.max_columns = 50


In [None]:

# Resolve the configured URL to its raw-content form and read the CSV from it.
RAW_URL = to_raw_github_url(DATA_URL)
df = pd.read_csv(RAW_URL)

# Trim surrounding whitespace from every column name so that the configured
# column names can be matched against them.
df.columns = [name.strip() for name in df.columns]

# Quick sanity report: dimensions, header names, and the first three rows.
print("Loaded shape:", df.shape)
print("Columns:", list(df.columns), sep="\n")
print("\nPreview:")
display(df.head(3))


In [None]:

# Build the two-column `docs` table (doc_id, text) from columns that already
# exist in the CSV; nothing is synthesized here.
if DOC_ID_COL not in df.columns:
    raise ValueError(f"DOC_ID_COL '{DOC_ID_COL}' not found in CSV columns.")

# Resolve TEXT_COL: an exact match wins, otherwise fall back to a
# case-insensitive lookup through a lower-cased name -> actual name map.
col_map = {c.lower(): c for c in df.columns}
if TEXT_COL in df.columns:
    tc = TEXT_COL
else:
    tc = col_map.get(TEXT_COL.lower())

if tc is None:
    raise ValueError(f"TEXT_COL '{TEXT_COL}' not found. Please set TEXT_COL to the exact column name containing text.")

# Select and rename the two columns, then drop rows whose text is missing.
docs = (
    df[[DOC_ID_COL, tc]]
    .rename(columns={DOC_ID_COL: "doc_id", tc: "text"})
    .copy()
    .dropna(subset=["text"])
)

# Normalize the text to stripped strings and discard rows left empty.
docs["text"] = docs["text"].astype(str).str.strip()
has_text = docs["text"] != ""
docs = docs[has_text]

print("Docs shape:", docs.shape)
display(docs.head(5))


In [None]:

# Run the analyses selected in METHODS. Each result table is left in a
# notebook-level variable; the variable stays None for any method not run.
methods = [m.strip().lower() for m in METHODS]
valid = {"word-frequency", "simple-sentiment", "semantic-embeddings"}
if any(m not in valid for m in methods):
    raise ValueError(f"METHODS must be subset of {valid}")

top_unigrams = None
top_bigrams = None
doc_polarity = None
clusters_table = None
cluster_summary = None

# Both lexical methods work on the same tokens, so tokenize each document
# once and share the result instead of re-tokenizing per method.
if "word-frequency" in methods or "simple-sentiment" in methods:
    doc_tokens = [tokenize(text) for text in docs["text"]]
else:
    doc_tokens = []

# word-frequency: corpus-wide unigram counts plus bigram counts formed within
# each document (bigrams never span two documents).
if "word-frequency" in methods:
    uni = Counter()
    bi = Counter()
    for toks in doc_tokens:
        uni.update(toks)
        bi.update(make_bigrams(toks))
    top_unigrams = pd.DataFrame(uni.most_common(TOP_N), columns=["term","freq"])
    top_bigrams = pd.DataFrame(bi.most_common(TOP_N), columns=["term","freq"])
    print("\n[WORD FREQUENCY] Top unigrams:")
    display(top_unigrams.head(10))
    print("Top bigrams:")
    display(top_bigrams.head(10))

# simple-sentiment: one lexicon score per document.
if "simple-sentiment" in methods:
    rows = [
        {"doc_id": doc_id, "sentiment": doc_sentiment(toks)}
        for doc_id, toks in zip(docs["doc_id"], doc_tokens)
    ]
    # Explicit columns keep the table well-formed (and the later
    # doc_polarity["sentiment"] lookup valid) even when there are no documents.
    doc_polarity = pd.DataFrame(rows, columns=["doc_id", "sentiment"])
    print("\n[SIMPLE SENTIMENT] (lexicon-style, illustrative)")
    display(doc_polarity.head(10))
    print("Note: Very rough heuristic; interpret directionally.")

# semantic-embeddings: KMeans over sentence embeddings, with TF-IDF features
# as a fallback when the embedding model cannot be used.
if "semantic-embeddings" in methods:
    from sklearn.cluster import KMeans

    texts = docs["text"].tolist()
    # Fail early with a clear message rather than letting the clustering
    # step raise about the sample count.
    if len(texts) < K:
        raise ValueError(
            f"semantic-embeddings needs at least K={K} documents, but only {len(texts)} are available."
        )

    # Only feature extraction is inside the try: a clustering failure must
    # not be misreported as "sentence-transformers unavailable".
    try:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer("all-MiniLM-L6-v2")
        features = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
    except Exception as e:
        # Deliberate best-effort fallback (missing package, model download
        # failure, ...); show the actual cause so it is not silently hidden.
        print("\n[SEMANTIC EMBEDDINGS] sentence-transformers unavailable; TF-IDF + KMeans fallback.")
        print(f"Reason: {type(e).__name__}: {e}")
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(max_features=5000, stop_words="english", ngram_range=(1,2))
        features = vec.fit_transform(texts)
    else:
        print("\n[SEMANTIC EMBEDDINGS] Used 'all-MiniLM-L6-v2'.")

    # Fixed random_state keeps cluster assignments reproducible across runs.
    km = KMeans(n_clusters=K, n_init=10, random_state=42).fit(features)
    labels = km.labels_

    clusters_table = docs[["doc_id","text"]].copy()
    clusters_table["cluster"] = labels
    # Single-line preview of each document (first 140 characters).
    clusters_table["snippet"] = clusters_table["text"].str.slice(0, 140).str.replace("\n"," ", regex=False)

    # Per-cluster size plus up to three example snippets for manual labelling.
    summary_rows = []
    for c in sorted(clusters_table["cluster"].unique()):
        group = clusters_table[clusters_table["cluster"] == c]
        ex = group["snippet"].head(3).tolist()
        summary_rows.append({"cluster": int(c), "size": int(len(group)), "examples": ex})
    cluster_summary = pd.DataFrame(summary_rows).sort_values("cluster")

    print("\n[CLUSTERS] doc_id → cluster (preview):")
    display(clusters_table.head(10))
    print("\n[CLUSTER SUMMARY]")
    display(cluster_summary)


In [None]:

# Final recap: report the shape of every result table that was produced, then
# print a one-line interpretation hint per method that ran.
print("\n===== Compact Results Ready =====")
for _name, _table in (
    ("top_unigrams", top_unigrams),
    ("top_bigrams", top_bigrams),
    ("doc_polarity", doc_polarity),
    ("clusters_table", clusters_table),
    ("cluster_summary", cluster_summary),
):
    if _table is not None:
        print(f"{_name} shape: {_table.shape}")

if top_unigrams is not None and "word-frequency" in methods:
    # Name the five most frequent terms.
    leading_terms = top_unigrams["term"].head(5).tolist()
    common_terms = ", ".join(leading_terms)
    print(f"\n[INTERPRETATION] Frequent terms include: {common_terms}. Consider what shared themes these reflect.")

if doc_polarity is not None and "simple-sentiment" in methods:
    # The sign of the mean document score decides the reported direction.
    avg = doc_polarity["sentiment"].mean()
    if avg > 0:
        direction = "positive"
    elif avg < 0:
        direction = "negative"
    else:
        direction = "neutral"
    print(f"[INTERPRETATION] Average sentiment skews {direction} (mean ≈ {avg:.2f}); treat as directional only.")

if cluster_summary is not None and "semantic-embeddings" in methods:
    print("[INTERPRETATION] Clusters suggest semantically similar groups. Review example snippets to label each cluster.")
