# Cell 1 — Installs & Imports

This cell installs the necessary Python libraries for this notebook and imports all required modules.

In [None]:
import sys, subprocess

def _pip(pkg):
    """Install one or more packages quietly with pip, using the kernel's interpreter.

    Args:
        pkg: A single requirement string (e.g. ``"pandas"`` or
            ``"transformers>=4.50.0"``) or any iterable of requirement
            strings (list, tuple, generator, ...).

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # A bare string is one requirement; every other input is treated as a
    # collection, so tuples work the same way lists always did.
    requirements = [pkg] if isinstance(pkg, str) else list(pkg)
    # sys.executable makes pip target the interpreter that runs this notebook.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + requirements)

# Each call below shells out to pip once (see _pip) and raises if pip fails.
# NOTE(review): only transformers carries a version bound; every other package
# is unpinned, so a fresh runtime may resolve different releases.

# Core libraries
_pip(["pandas", "numpy", "scikit-learn", "scipy", "tqdm", "pyarrow", "fastparquet", "joblib", "unidecode", "orjson"])
# Hugging Face libraries for model handling
_pip(["transformers>=4.50.0", "accelerate", "huggingface_hub", "bitsandbytes"])
# Library for multilabel stratification
_pip("iterative-stratification")

import os, re, gc, json, time, math, glob, random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from unidecode import unidecode
from huggingface_hub import snapshot_download, notebook_login
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, BitsAndBytesConfig
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_recall_fscore_support, precision_recall_curve
from scipy.sparse import hstack
import joblib
import torch

# Cell 2 — Config, Mount Drive, Paths, Seeds

This cell sets up configuration parameters, mounts Google Drive, defines project paths, and sets random seeds for reproducibility.

In [None]:
from google.colab import drive

def set_seeds(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs with the same value.

    When CUDA is available, every CUDA device is seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# Seed all RNGs once, up front, so sampling in later cells is repeatable.
set_seeds(42)

# Mount Google Drive if not already mounted
if not os.path.exists("/content/drive/MyDrive"):
    drive.mount("/content/drive")

# Define project directories in Google Drive
OUT_DIR   = "/content/drive/MyDrive/Tiktok_Hackathon"
CACHE_DIR = f"{OUT_DIR}/hf_cache"     # persistent HF cache
CKPT_DIR  = f"{OUT_DIR}/labels_ckpt_1k" # checkpoint directory for labeling
# Create all three directories; existing ones are left untouched.
for d in [OUT_DIR, CACHE_DIR, CKPT_DIR]: os.makedirs(d, exist_ok=True)

# Route Hugging Face caches to Drive for persistence
# NOTE(review): huggingface_hub and transformers were already imported in Cell 1,
# so these variables are presumably read too late to affect this kernel — confirm.
os.environ["HF_HOME"] = CACHE_DIR
os.environ["HF_HUB_CACHE"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
# setdefault keeps any value the user exported before starting the notebook.
# NOTE(review): this flag presumably requires the hf_transfer package, which
# Cell 1 does not install — confirm before relying on it.
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # Enable faster downloads

# Raw data paths - UPDATE these paths if your files are different
REVIEW_PATH = f"{OUT_DIR}/review-Vermont.json"
META_PATH   = f"{OUT_DIR}/meta-Vermont.json"

# Configuration for labeling and preview
N_PREVIEW = 30       # number of samples for quick preview
N_LABEL   = 1000     # target number of samples for checkpointed labeling
PART_SIZE = 100      # number of rows per checkpoint file

# Echo the resolved locations so a wrong Drive path is visible immediately.
print("OUT_DIR:", OUT_DIR)
print("CACHE_DIR:", CACHE_DIR)

# Cell 3 — Load & prep raw data → df

This cell reads the raw JSONL review and metadata files, merges them based on `gmap_id`, removes duplicate entries, and renames key columns.

In [None]:
import json as _json

def parse_jsonl(path, limit=None):
    """Read a JSON Lines file into a DataFrame, one row per JSON document.

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped instead of aborting the load.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.
        limit: Maximum number of records to load. ``None`` (the default)
            loads the whole file; ``0`` yields an empty DataFrame.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Compare against None explicitly so that limit=0 means "no rows".
            if limit is not None and len(records) >= limit:
                break
            line = line.strip()
            if not line:
                continue
            records.append(_json.loads(line))
    return pd.DataFrame(records)

# Assert that the input files exist
assert os.path.exists(REVIEW_PATH) and os.path.exists(META_PATH), "Review/meta files not found in OUT_DIR."

# Load data into DataFrames
df_reviews = parse_jsonl(REVIEW_PATH)
df_meta    = parse_jsonl(META_PATH)

# Deduplicate businesses in metadata based on 'gmap_id'
# (keeps the first row per gmap_id so the left merge below cannot multiply reviews)
if 'gmap_id' in df_meta.columns:
    df_meta = df_meta.drop_duplicates(subset=['gmap_id'])

# Merge reviews and metadata
# how='left' keeps every review row even when no metadata row matches.
# NOTE(review): the merge needs 'gmap_id' in both frames, although the
# deduplication above treats the column as optional.
df = df_reviews.merge(df_meta, on='gmap_id', how='left')

# Drop exact duplicate review rows
# Only the identifying columns that are actually present are compared.
keep_cols = [c for c in ['user_id','gmap_id','text','time','rating'] if c in df.columns]
if keep_cols:
    before = len(df)
    df = df.drop_duplicates(subset=keep_cols, keep='first')
    print(f"Dropped {before - len(df)} duplicate review rows.")

# Rename columns for clarity if present
# 'name_x' / 'name_y' are the suffixed names pandas produces when both merged
# frames have a 'name' column (left = reviews, right = metadata).
ren = {}
if 'name_x' in df.columns: ren['name_x'] = 'reviewer_name'
if 'name_y' in df.columns: ren['name_y'] = 'business_name'
if ren:
    df = df.rename(columns=ren)

print("df shape:", df.shape)

# Cell 4 — Hugging Face login (for gated model access)

This cell provides a way to log in to the Hugging Face Hub, which might be necessary to access certain gated models like `google/gemma-3-12b-it`. You only need to run this once per session if the model is gated for your account.

In [None]:
from huggingface_hub import notebook_login

# Best-effort login: a failure must not stop the notebook, but the reason is
# printed so that a later "gated repo" download error can be traced back here.
try:
    notebook_login()
except Exception as e:
    print(f"HF login skipped or already authenticated ({type(e).__name__}: {e}).")

# Cell 5 — Load Gemma‑3 12B from Drive cache (download if needed)

This cell loads the Gemma-3 12B instruction-tuned model. It first checks whether a complete snapshot (all five weight shards, each larger than about 1 GB) already exists in the Drive folder `DRIVE_SNAPSHOT`. If not, it downloads the model from the Hugging Face Hub to the local cache and copies the files into that Drive folder as regular files. The model is loaded with 4-bit quantization if a GPU is available; otherwise it is loaded unquantized in float32.

In [None]:
# Robust Gemma-3 12B loader: copies full snapshot to Drive (no hardlinks), then loads from there.

import os, gc, torch, shutil
from huggingface_hub import snapshot_download
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, BitsAndBytesConfig

# Hub repository to fetch and the Drive folder that holds a plain-file copy of it.
REPO_ID = "google/gemma-3-12b-it"  # gated: be logged in!
# NOTE(review): repeats the OUT_DIR value assigned in Cell 2; keep both in sync.
OUT_DIR = "/content/drive/MyDrive/Tiktok_Hackathon"
DRIVE_SNAPSHOT = f"{OUT_DIR}/gemma3_12b_snapshot"   # final folder with real files (not symlinks)

def _has_all_shards(folder, n=5):
    """Check that every expected safetensors weight shard is present in ``folder``.

    Shard files are expected to be named ``model-XXXXX-of-YYYYY.safetensors``
    with both counters zero-padded to five digits, which stays correct for
    shard counts of ten or more (adjust if HF changes naming).

    Args:
        folder: Directory expected to contain the shard files.
        n: Total number of shards the checkpoint is split into.

    Returns:
        Tuple ``(ok, sizes)``: ``ok`` is True only when all ``n`` shard files
        exist; ``sizes`` is a list of ``(path, size_in_bytes)`` pairs in shard
        order, with size 0 for a missing file.
    """
    sizes = []
    for i in range(1, n + 1):
        p = os.path.join(folder, f"model-{i:05d}-of-{n:05d}.safetensors")
        sizes.append((p, os.path.getsize(p) if os.path.exists(p) else 0))
    ok = all(os.path.exists(p) for p, _ in sizes)
    return ok, sizes

# 1) Reuse snapshot if it looks complete; else (re)download & copy here as real files
os.makedirs(OUT_DIR, exist_ok=True)
reuse = False
if os.path.isdir(DRIVE_SNAPSHOT):
    have, sizes = _has_all_shards(DRIVE_SNAPSHOT, n=5)
    # Presence alone is not enough: a shard of 1 GB or less is treated as truncated.
    reuse = have and all(sz > 1_000_000_000 for _, sz in sizes)  # each > ~1GB

if not reuse:
    # Download to the default local cache, then copy to Drive folder as plain files (no links)
    print("⬇️  Downloading Gemma-3 12B to local cache… (ensure you're logged into HF)")
    # allow_patterns restricts the download to files matching these globs.
    local_snap = snapshot_download(
        repo_id=REPO_ID,
        allow_patterns=["*.json","*.safetensors","*.model","*.tokenizer*","*.txt","*.md","*config*","chat_template.json"],
    )
    print("📁 Local snapshot:", local_snap)

    # Clear stale/partial Drive snapshot (if any), then copy fresh
    if os.path.isdir(DRIVE_SNAPSHOT):
        shutil.rmtree(DRIVE_SNAPSHOT)
    shutil.copytree(local_snap, DRIVE_SNAPSHOT, dirs_exist_ok=True)
    # NOTE(review): this post-copy check only asserts that the shards exist; the
    # size threshold used for the reuse decision above is not re-applied here.
    have, sizes = _has_all_shards(DRIVE_SNAPSHOT, n=5)
    assert have, f"Drive snapshot incomplete after copy: {sizes}"
    print("✅ Copied full snapshot to:", DRIVE_SNAPSHOT)

# 2) Load from Drive snapshot (fast & reliable)
# Without CUDA the model is loaded unquantized in float32; with CUDA it is
# loaded with a 4-bit NF4 quantization config and bfloat16 compute.
quant_cfg = None
dtype = torch.float32
if torch.cuda.is_available():
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    dtype = torch.bfloat16

# Processor and model are both read from the Drive copy, not from the Hub.
# NOTE(review): trust_remote_code=True allows code shipped with the snapshot to
# run — confirm it is actually required for this model class.
processor = AutoProcessor.from_pretrained(DRIVE_SNAPSHOT, trust_remote_code=True, use_fast=True)
model = Gemma3ForConditionalGeneration.from_pretrained(
    DRIVE_SNAPSHOT,
    device_map="auto" if torch.cuda.is_available() else None,
    quantization_config=quant_cfg,
    torch_dtype=dtype,
    trust_remote_code=True,
).eval()

# Report where the first parameter lives and the dtype the model reports.
print("🎉 Ready:", REPO_ID, "| device:", next(model.parameters()).device, "| dtype:", model.dtype)

# Cell 6 — Prompt, parser, guard (LLM labeling)

This cell defines the system rules and user template for prompting the Gemma model, along with functions to clean review text, check for low-quality reviews, build messages for the model, and parse the model's JSON output, including a guard function to handle low-quality inputs efficiently.

In [None]:
import re
import json
import orjson as _oj # Using orjson for faster JSON handling
from unidecode import unidecode
import torch

# System prompt sent as the first message of every labeling request (see
# build_messages). It defines the four violation types, the relevancy rule and
# the JSON schema that gemma_label_one parses.
# NOTE(review): the text says "set all violations to false" although the schema
# models violations as a list, and the definition is headed "SPAM or
# LOW_QUALITY" while the schema only offers "SPAM" — consider aligning the wording.
SYSTEM_RULES = ("""
You are a strict moderator for Google location reviews.
Decide ONLY from the review text plus the provided place name and category. Do not use outside knowledge.
If a visit is not clearly implied, do not assume one. Multiple violations may apply.
If uncertain, set all violations to false and set is_relevant to false.

Definitions you must apply:
- ADS (Advertisements or promos): URLs, phone numbers, coupon codes, discount percentages, phrases like call now, use code, any self or third party promotion.
- IRRELEVANT (off topic): Mostly about unrelated topics or a different place or brand with no substantive content about this place.
- NON_VISIT_RANT: Complaints without clear visit signals such as never been, I heard, did not go inside, will not go. Phone only complaint counts unless the business is inherently phone service.
- SPAM or LOW_QUALITY: Near empty or one word or emoji only, gibberish, excessive repetition, boilerplate or copy paste.

Relevancy:
- true if the text substantively describes the experience, service, staff, products, or environment of the named place or clearly matches its category.
- false if IRRELEVANT or pure SPAM. ADS can be present while still relevant.

Return JSON only in this exact schema:
{
  "is_relevant": true|false,
  "violations": ["ADS"|"IRRELEVANT"|"NON_VISIT_RANT"|"SPAM", ...],
  "rationale": "<=25 words"
}
""")

# Per-review user message; {place}, {cat} and {text} are filled by build_messages.
USER_TMPL = """Place:
  name: {place}
  category: {cat}
Review:
  text: {text}

Decide using the rules. Return JSON only in the schema above.
"""

# Matches the "(Translated by Google)" / "(Original)" markers, case-insensitively,
# together with any whitespace that follows them.
GT_WRAPPER = re.compile(r'\(Translated by Google\)\s*|\(Original\)\s*', re.I)
# The only violation names accepted from the model; anything else is discarded.
LABELS_SET = {"ADS","IRRELEVANT","NON_VISIT_RANT","SPAM"}

def _norm(s):
    """Normalize arbitrary input to a single-line, ASCII-transliterated string.

    Args:
        s: Any value. ``None`` and float NaN (how pandas represents a missing
            cell) are treated as empty text; everything else goes through ``str()``.

    Returns:
        The transliterated text with zero-width spaces turned into regular
        spaces, leading/trailing whitespace removed and every internal run of
        whitespace collapsed to one space.
    """
    # NaN is the only float that is not equal to itself; without this check a
    # missing value would be rendered as the literal text "nan".
    if s is None or (isinstance(s, float) and s != s):
        return ""
    # Replace zero-width spaces *before* transliterating: unidecode produces
    # ASCII-only output, so a replace applied afterwards could never match.
    text = unidecode(str(s).replace("\u200b", " "))
    return re.sub(r"\s+", " ", text.strip())

def clean_review(s: str) -> str:
    """Return the normalized review with Google Translate wrapper markers removed.

    The text is first passed through ``_norm``; every ``GT_WRAPPER`` match is
    then deleted and the result is trimmed.
    """
    normalized = _norm(s)
    without_markers = GT_WRAPPER.sub('', normalized)
    return without_markers.strip()

def cheap_low_quality(s: str) -> bool:
    """Heuristic pre-filter: True when a review can be treated as low quality without the LLM.

    A cleaned review is flagged when it is empty, has at most two words or
    fewer than 12 characters, contains no ASCII letter at all, or contains any
    character repeated four or more times in a row.
    """
    cleaned = clean_review(s or "")
    if not cleaned:
        return True
    # Too short to carry a substantive opinion.
    if len(cleaned.split()) <= 2 or len(cleaned) < 12:
        return True
    # No letters left after cleaning (digits, punctuation, symbols only).
    if re.search(r'[A-Za-z]', cleaned) is None:
        return True
    # Any single character repeated 4+ times consecutively.
    return re.search(r'(.)\1{3,}', cleaned) is not None

def build_messages(text, place="", cat=""):
    """Assemble the two-turn chat (system rules + user request) for one review.

    Place, category and review text are normalized with ``_norm``; the review
    text is additionally cut to its first 1000 characters.
    """
    prompt = USER_TMPL.format(
        place=_norm(place),
        cat=_norm(cat),
        text=_norm(text)[:1000],
    )
    system_turn = {"role":"system","content":[{"type":"text","text":SYSTEM_RULES}]}
    user_turn = {"role":"user",  "content":[{"type":"text","text":prompt}]}
    return [system_turn, user_turn]

# Alternative spellings the model may use for a violation, mapped to LABELS_SET
# names. SYSTEM_RULES heads the spam definition "SPAM or LOW_QUALITY", so both
# of these forms are plausible outputs.
_VIOLATION_ALIASES = {"LOW_QUALITY": "SPAM", "SPAM_OR_LOW_QUALITY": "SPAM"}


def _coerce_bool(value):
    """Interpret a JSON-ish truth value; the strings "true"/"yes"/"1" are True, other strings False."""
    if isinstance(value, str):
        return value.strip().lower() in {"true", "yes", "1"}
    return bool(value)


def _extract_json_object(txt):
    """Return the JSON object found in model output, or {} when nothing usable parses."""
    # Prefer a fenced ```json block; otherwise take the widest {...} span.
    m = re.search(r"```json\s*(\{.*?\})\s*```", txt, re.S|re.I) or re.search(r"\{.*\}", txt, re.S)
    if not m:
        return {}
    raw = m.group(1) if m.re.pattern.startswith("```json") else m.group(0)
    try:
        data = _oj.loads(raw) # Use orjson for loading
    except Exception:
        try:
            data = json.loads(raw) # Fallback to standard json
        except Exception as e:
            print(f"Failed to parse JSON: {e} - Raw: {raw}")
            return {}
    # Only an object matches the requested schema.
    return data if isinstance(data, dict) else {}


def _stop_token_ids(tok):
    """Collect every token id that should end generation, without duplicates."""
    ids = []
    cfg_eos = getattr(getattr(model, "generation_config", None), "eos_token_id", None)
    candidates = [tok.eos_token_id]
    candidates += list(cfg_eos) if isinstance(cfg_eos, (list, tuple)) else [cfg_eos]
    # Gemma chat turns end with <end_of_turn>; include it when the tokenizer knows it.
    eot = tok.convert_tokens_to_ids("<end_of_turn>")
    if eot != getattr(tok, "unk_token_id", None):
        candidates.append(eot)
    for cand in candidates:
        if isinstance(cand, int) and cand >= 0 and cand not in ids:
            ids.append(cand)
    return ids


@torch.inference_mode()
def gemma_label_one(text, place="", cat="", max_new_tokens=96):
    """Label a single review with the Gemma model and parse its JSON verdict.

    Args:
        text: Review text (callers pass it already cleaned).
        place: Business name shown to the model.
        cat: Business category shown to the model.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        dict with the 0/1 flags ``is_relevant_overall``, ``is_advertisement``,
        ``is_irrelevant``, ``is_rant_without_visit`` and ``is_spam``, the
        ``rationale`` (at most 200 characters), the decoded model output in
        ``gemma_raw`` and ``parse_success`` (1 when a JSON object containing
        all three schema keys was parsed, else 0). When nothing parses, all
        flags are 0.
    """
    tok = processor.tokenizer

    # Apply chat template and tokenize
    enc = processor.apply_chat_template(
        build_messages(text, place, cat),
        add_generation_prompt=True, tokenize=True,
        return_tensors="pt", return_dict=True
    )
    enc = {k: v.to(model.device) for k, v in enc.items()}

    # Passing only tokenizer.eos_token_id would override the model's configured
    # stop ids, so generation could continue past the end of the answer.
    out = model.generate(
        **enc, max_new_tokens=max_new_tokens, do_sample=False,
        pad_token_id=tok.eos_token_id,
        eos_token_id=_stop_token_ids(tok) or None,
    )
    # Keep only the newly generated tokens (drop the echoed prompt).
    gen = out[0][enc["input_ids"].shape[-1]:]
    txt = tok.decode(gen, skip_special_tokens=True)

    data = _extract_json_object(txt)

    # bool("false") is True, so string values are interpreted explicitly.
    is_rel = _coerce_bool(data.get("is_relevant", False))

    vs = data.get("violations", [])
    if isinstance(vs, str): vs = [vs] # Handle case where violations is a single string
    if not isinstance(vs, (list, tuple, set)): vs = []
    # Non-string entries are ignored rather than raising on .upper().
    names = {v.strip().upper().replace(" ", "_") for v in vs if isinstance(v, str) and v.strip()}
    names = {_VIOLATION_ALIASES.get(v, v) for v in names}
    viols = sorted(v for v in names if v in LABELS_SET) # Filter for defined labels

    return {
        "is_relevant_overall": int(is_rel),
        "is_advertisement":    int("ADS" in viols),
        "is_irrelevant":       int("IRRELEVANT" in viols),
        "is_rant_without_visit": int("NON_VISIT_RANT" in viols),
        "is_spam":             int("SPAM" in viols),
        "rationale": str(data.get("rationale") or "")[:200].strip(),
        "gemma_raw": txt,
        "parse_success": int(bool(data) and all(v in data for v in ["is_relevant", "violations", "rationale"])), # Check for minimal required keys
    }

def label_with_guard(text, place="", cat=""):
    """Label one review, skipping the model call when the cheap heuristic flags it.

    Reviews flagged by ``cheap_low_quality`` get a fixed spam record with an
    empty ``gemma_raw``; all others are cleaned and sent to ``gemma_label_one``.
    """
    if not cheap_low_quality(text):
        # Substantive text: let the model decide.
        return gemma_label_one(clean_review(text), place, cat)

    # Canned verdict; same keys and order as the model path so rows align.
    return dict(
        is_relevant_overall=0,
        is_advertisement=0,
        is_irrelevant=0,
        is_rant_without_visit=0,
        is_spam=1,
        rationale="Empty/near-empty or low-quality review.",
        gemma_raw="",      # no model call was made
        parse_success=1,   # counted as a successful "parse" for the guard
    )

# Cell 7 — Label a small preview (N_PREVIEW) & peek

This cell labels a small, random sample of the reviews (`N_PREVIEW`) using the defined `label_with_guard` function and displays the first few results. This is useful for a quick sanity check of the labeling process.

In [None]:
import pandas as pd
from tqdm.auto import tqdm

# Ensure df and necessary variables are available
assert 'df' in globals() and 'label_with_guard' in globals(), "Make sure to run previous cells."
# NOTE(review): this reassigns the N_PREVIEW configured in Cell 2 (30) to 5 for
# the rest of the session — confirm the override is intentional.
N_PREVIEW = 5
# Sample a small preview of the data
# random_state=42 makes the same rows come back on every run over the same df.
preview = df.sample(n=min(N_PREVIEW, len(df)), random_state=42).copy()
rows = []

# Label each review in the preview
for r in tqdm(preview.itertuples(index=False), total=len(preview), desc="preview label"):
    t   = getattr(r, 'text', '')
    # Get business name and category if columns exist
    plc = getattr(r, 'business_name', '') if 'business_name' in preview.columns else ''
    cat = getattr(r, 'category', '') if 'category' in preview.columns else ''
    rows.append(label_with_guard(t, plc, cat))

# Concatenate the original preview data with the new labels
# Both sides are aligned positionally after the index reset.
preview = pd.concat([preview.reset_index(drop=True), pd.DataFrame(rows)], axis=1)

# Display the first 5 labeled reviews
display(preview.head(5))

# Cell 8 — Label N_LABEL with checkpoints and consolidate

This cell labels a larger sample of reviews (`N_LABEL`) with checkpointing. It saves the labeled results periodically to Google Drive (`CKPT_DIR`) to avoid losing progress. If checkpoint files are found, it attempts to consolidate them or resume labeling from where it left off. Finally, it consolidates all parts into a single Parquet and CSV file and prints basic statistics.

In [None]:
import os, glob, math, pandas as pd, torch
from tqdm.auto import tqdm

# Ensure necessary variables and directories are defined
OUT_DIR = globals().get('OUT_DIR', '/content/drive/MyDrive/Tiktok_Hackathon') # Use defined OUT_DIR or default
CKPT_DIR_1K = f"{OUT_DIR}/labels_ckpt_1k" # Checkpoint directory for 1k labels
PREVIEW_PARQUET = f"{OUT_DIR}/labeled_preview_1000.parquet" # Final output Parquet file
PREVIEW_CSV     = f"{OUT_DIR}/labeled_preview_1000.csv"     # Final output CSV file

# Ensure Drive is mounted (redundant if Cell 2 ran, but safe check)
if not os.path.exists("/content/drive/MyDrive"):
    from google.colab import drive
    drive.mount('/content/drive')

# Ensure checkpoint directory exists
os.makedirs(CKPT_DIR_1K, exist_ok=True)


# --- Labeling Logic ---
# Resolution order:
#   1) the final preview file exists       -> load it, label nothing
#   2) parts exist but labeling cannot run -> consolidate what is there
#      (df / label_with_guard are not defined in this kernel)
#   3) otherwise                           -> label every missing chunk, which
#      resumes an interrupted run, then consolidate
# Previously any existing part file triggered consolidation, so an interrupted
# run was frozen as the "final" preview and the resume path never executed.

if os.path.exists(PREVIEW_PARQUET) or os.path.exists(PREVIEW_CSV):
    print("✅ Found existing labeled preview — skipping labeling.")
    labeled_1k = (pd.read_parquet(PREVIEW_PARQUET)
                  if os.path.exists(PREVIEW_PARQUET) # Prefer Parquet
                  else pd.read_csv(PREVIEW_CSV))
    print({"rows": len(labeled_1k)})
else:
    part_files = sorted(glob.glob(f"{CKPT_DIR_1K}/part_*.parquet"))
    can_label = ('df' in globals() and 'text' in df.columns
                 and 'label_with_guard' in globals())

    if part_files and not can_label:
        # Cannot tell whether the run was complete, so fall back to plain consolidation.
        print(f"ℹ️ Found {len(part_files)} checkpoint parts — consolidating.")
        print("⚠️ df / label_with_guard are not defined, so completeness cannot be checked. "
              "To resume a partial run, delete the saved preview files and re-run this cell "
              "after Cells 3–6.")
        labeled_1k = pd.concat([pd.read_parquet(p) for p in part_files], ignore_index=True)
        labeled_1k.to_parquet(PREVIEW_PARQUET, index=False)
        labeled_1k.to_csv(PREVIEW_CSV, index=False)
        print("💾 Saved consolidated preview:", PREVIEW_PARQUET, "|", PREVIEW_CSV)
    else:
        assert 'df' in globals() and 'text' in df.columns, "Need df with a 'text' column. Run Cell 3."
        assert 'label_with_guard' in globals(), "Run the cell that defines label_with_guard() first (Cell 6)."

        N_TARGET = min(N_LABEL, len(df)) # Label up to N_LABEL or the total number of reviews if less
        # Fixed random_state: the same df yields the same sample, which is what
        # makes skipping already-written chunks safe.
        sample_1k = df.sample(n=N_TARGET, random_state=42).copy()
        sample_1k['orig_index'] = sample_1k.index # Keep original index for reference
        sample_1k = sample_1k.reset_index(drop=True) # Reset index for chunking

        # Manifest of the sampled rows. When resuming, it is used to verify that
        # the current sample is the one the existing parts were cut from.
        manifest_path = f"{CKPT_DIR_1K}/manifest.parquet"
        if part_files and os.path.exists(manifest_path):
            recorded = pd.read_parquet(manifest_path, columns=['orig_index'])['orig_index'].tolist()
            if recorded != sample_1k['orig_index'].tolist():
                raise ValueError(
                    f"Existing checkpoints in {CKPT_DIR_1K} were created from a different sample "
                    "(df, N_LABEL or the seed changed). Remove the checkpoint files or restore "
                    "the original inputs before resuming."
                )
        else:
            cols = ['orig_index','text'] + [c for c in ['business_name','category'] if c in sample_1k.columns]
            sample_1k[cols].to_parquet(manifest_path, index=False)

        CHUNK = PART_SIZE # Chunk size for checkpointing
        parts = math.ceil(N_TARGET / CHUNK) # Calculate number of chunks
        print(f"Labeling {N_TARGET} rows → {parts} chunks x {CHUNK}")

        labeled_parts = [] # List to store labeled chunks
        for p in range(parts):
            part_path = f"{CKPT_DIR_1K}/part_{p:04d}.parquet"
            if os.path.exists(part_path):
                print(f"⏩ Skipping chunk {p+1}/{parts} (already exists).")
                labeled_parts.append(pd.read_parquet(part_path))
                continue  # Resume: skip if part file already exists

            lo, hi = p*CHUNK, min((p+1)*CHUNK, N_TARGET) # Define chunk slice
            sub = sample_1k.iloc[lo:hi].copy() # Get the current chunk

            rows = []
            # Label each review in the current chunk
            for r in tqdm(sub.itertuples(index=False), total=len(sub), desc=f"chunk {p+1}/{parts}"):
                t   = getattr(r, 'text', '')
                plc = getattr(r, 'business_name', '') if 'business_name' in sub.columns else ''
                cat = getattr(r, 'category', '') if 'category' in sub.columns else ''
                rows.append(label_with_guard(t, plc, cat))

            # Add the labeling results as new columns to the chunk DataFrame
            for k in rows[0].keys():
                sub[k] = [row[k] for row in rows]

            # Write to a temporary name first and rename afterwards, so an
            # interrupted write can never leave a truncated part_*.parquet
            # that a later run would skip as "already exists".
            tmp_path = f"{part_path}.tmp"
            sub.to_parquet(tmp_path, index=False)
            os.replace(tmp_path, part_path)
            labeled_parts.append(sub) # Add labeled chunk to the list
            del sub, rows # Clean up memory
            if torch.cuda.is_available(): torch.cuda.empty_cache() # Clear GPU cache

        # Consolidate all parts (pre-existing and freshly labeled)
        labeled_1k = pd.concat(labeled_parts, ignore_index=True)
        labeled_1k.to_parquet(PREVIEW_PARQUET, index=False)
        labeled_1k.to_csv(PREVIEW_CSV, index=False)
        print("💾 Saved preview from fresh labeling:", PREVIEW_PARQUET, "|", PREVIEW_CSV)

# Quick stats on the labeled data (if available)
if 'labeled_1k' in globals():
    stat_columns = {
        "ads": 'is_advertisement',
        "irrel": 'is_irrelevant',
        "non_visit": 'is_rant_without_visit',
        "spam": 'is_spam',
        "relevant": 'is_relevant_overall',
    }
    stats = {"rows": len(labeled_1k)}
    for key, col in stat_columns.items():
        # A missing column is reported as None and skipped when printing.
        stats[key] = int(labeled_1k[col].sum()) if col in labeled_1k else None
    print("\nLabeled Data Statistics:")
    for key, value in stats.items():
        if value is not None:
            print(f"- {key}: {value}")

# Cell 9 — Build ML‑ready split (preview‑only, multilabel stratified)

This cell prepares the labeled preview data for machine learning by:
- Loading the labeled data.
- Creating binary target columns for each violation type and overall relevance.
- Performing a multilabel stratified shuffle split to create training, validation, and test sets, ensuring that the distribution of labels is maintained across the splits.
- Saving the split data to a Parquet file.
- Printing the support (number of positive instances) for each label in each split.

In [None]:
import pandas as pd
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Ensure the labeled preview file exists
# PREVIEW_PARQUET and OUT_DIR are defined in Cell 8; `os` is imported in earlier cells.
assert os.path.exists(PREVIEW_PARQUET), f"Labeled preview file not found at {PREVIEW_PARQUET}. Run Cell 8."

# Load the labeled data and drop duplicates based on text
base = pd.read_parquet(PREVIEW_PARQUET).drop_duplicates(subset=['text']).reset_index(drop=True)
base['text'] = base['text'].fillna('') # Fill any potential missing text values

# Create binary target columns for each label
base['y_relevant']       = base['is_relevant_overall'].astype(int)
base['y_ads']            = base['is_advertisement'].astype(int)
base['y_irrelevant']     = base['is_irrelevant'].astype(int)
base['y_non_visit_rant'] = base['is_rant_without_visit'].astype(int)
base['y_spam']           = base['is_spam'].astype(int)

# Define the target variable array for stratification
Y = base[['y_ads','y_irrelevant','y_non_visit_rant','y_spam','y_relevant']].values.astype(int)
X_dummy = np.zeros((len(base), 1)) # A dummy feature array is needed for split

# Perform the first split (train vs temp for val/test)
# 70% train, 30% held out for validation + test.
msss1 = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.30, random_state=42)
train_idx, tmp_idx = next(msss1.split(X_dummy, Y))

# Perform the second split (val vs test) on the temporary set
# The held-out 30% is halved, so val and test each get about 15% of the rows.
msss2 = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.50, random_state=43) # Different random state
val_rel, test_rel = next(msss2.split(np.zeros((len(tmp_idx),1)), Y[tmp_idx])) # Split on the temporary indices

# Map temporary indices back to original indices
# val_rel / test_rel are positions inside tmp_idx, not row positions in base.
val_idx  = tmp_idx[val_rel]
test_idx = tmp_idx[test_rel]

# Create a 'split' column in the DataFrame
# Every row starts as 'train'; held-out rows are then overwritten.
split = np.array(['train']*len(base))
split[val_idx]  = 'val'
split[test_idx] = 'test'
base['split'] = split

# Define the output path for the ML-ready data
ML_READY_PARQUET = f"{OUT_DIR}/labeled_reviews_ml_ready.parquet"
base.to_parquet(ML_READY_PARQUET, index=False) # Save the ML-ready DataFrame
print("Saved ML‑ready →", ML_READY_PARQUET)

# Print supports for each label in each split to verify stratification
# sup(d) returns {target column: number of positive rows} for frame d.
sup = lambda d: d[['y_ads','y_irrelevant','y_non_visit_rant','y_spam','y_relevant']].sum().to_dict()
print("\nLabel Supports by Split:")
print("supports train:", sup(base[base.split=='train']))
print("supports val:  ", sup(base[base.split=='val']))
print("supports test: ", sup(base[base.split=='test']))

# Cell 10 — Train baseline (TF‑IDF + Logistic Regression)

This cell trains a baseline machine learning model for each defined label using a combination of TF-IDF word and character features and Logistic Regression. It also finds the optimal classification threshold for each label based on the F1 score on the validation set and saves the trained models, vectorizers, thresholds, and performance metrics.

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_recall_fscore_support, precision_recall_curve
from scipy.sparse import hstack
import joblib
import json
import os

# Ensure the ML-ready data and necessary directories exist
# ML_READY_PARQUET is defined in Cell 9.
assert os.path.exists(ML_READY_PARQUET), f"ML-ready data not found at {ML_READY_PARQUET}. Run Cell 9."

# Load the ML-ready data and split into train, validation, and test sets
ml = pd.read_parquet(ML_READY_PARQUET)
train = ml[ml['split']=='train'].copy()
val   = ml[ml['split']=='val'].copy()
test  = ml[ml['split']=='test'].copy()

# Fill missing text values with empty strings
train['text'] = train['text'].fillna('')
val['text']   = val['text'].fillna('')
test['text']  = test['text'].fillna('')

# Initialize TF-IDF vectorizers for words and characters
# Words: unigrams + bigrams; characters: 3- to 5-grams. Both drop terms seen in
# fewer than 2 training documents.
word_vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.9, max_features=200_000)
char_vec = TfidfVectorizer(analyzer='char', ngram_range=(3,5), min_df=2, max_features=300_000)

# Fit and transform the training data, and transform validation and test data
# The vectorizers are fitted on train only; val/test reuse that vocabulary.
# Word and character features are concatenated column-wise into one CSR matrix.
Xw_tr = word_vec.fit_transform(train['text']); Xc_tr = char_vec.fit_transform(train['text']); X_tr = hstack([Xw_tr, Xc_tr]).tocsr()
Xw_v  = word_vec.transform(val['text']);   Xc_v  = char_vec.transform(val['text']);   X_v  = hstack([Xw_v,  Xc_v ]).tocsr()
Xw_te = word_vec.transform(test['text']);  Xc_te = char_vec.transform(test['text']);  X_te = hstack([Xw_te, Xc_te]).tocsr()

# Define the binary labels and their corresponding column names
# Each entry is (target column, short name used for file names and dict keys).
LABELS_BIN = [
    ('y_relevant', 'relevant'),
    ('y_ads', 'ads'),
    ('y_irrelevant', 'irrelevant'),
    ('y_non_visit_rant', 'non_visit'),
    ('y_spam', 'spam'),
]

models = {} # Dictionary to store trained models
thresholds = {} # Dictionary to store optimal thresholds
metrics_rows = [] # List to store performance metrics

# Train a Logistic Regression model for each label
# One independent binary classifier per label, all sharing the same features.
for col, name in LABELS_BIN:
    y_tr = train[col].astype(int).values
    y_v  = val[col].astype(int).values
    y_te = test[col].astype(int).values

    # Initialize and train the Logistic Regression classifier with balanced class weights
    # NOTE(review): presumably fit() fails when a label has no positives (or no
    # negatives) in the training split — check the supports printed by Cell 9.
    clf = LogisticRegression(max_iter=2000, class_weight='balanced', solver='liblinear')
    clf.fit(X_tr, y_tr)
    models[name] = clf

    # Predict probabilities on the validation set
    p_v = clf.predict_proba(X_v)[:,1]

    # Find the optimal threshold on the validation set by maximizing F1 score
    # 17 evenly spaced candidates from 0.10 to 0.90 (step 0.05); on ties
    # np.argmax keeps the lowest threshold.
    grid = np.linspace(0.1, 0.9, 17) # Search grid for thresholds
    f1s = [f1_score(y_v, (p_v>=t).astype(int), zero_division=0) for t in grid]
    t_best = float(grid[int(np.argmax(f1s))]) # Threshold yielding the highest F1
    thresholds[name] = t_best

    # Evaluate the model on validation and test sets using the best threshold
    # The validation score is optimistic because the threshold was tuned on it.
    for split_name, y_true, X in [('val', y_v, X_v), ('test', y_te, X_te)]:
        p = clf.predict_proba(X)[:,1]
        y_hat = (p >= t_best).astype(int)
        # Calculate precision, recall, and F1 score
        prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='binary', zero_division=0)
        metrics_rows.append({'label': name, 'split': split_name, 'support_pos': int(y_true.sum()),
                             'precision': round(float(prec),4), 'recall': round(float(rec),4),
                             'f1': round(float(f1),4), 'threshold': t_best})

# Create a DataFrame from the performance metrics
metrics = pd.DataFrame(metrics_rows)
print("F1 Scores by Label and Split:")
# Display F1 scores in a pivot table format
print(metrics.pivot(index='label', columns='split', values='f1').fillna(0))

# Define the directory to save models and results
MODEL_DIR = f"{OUT_DIR}/models/baseline_tfidf_lr"; os.makedirs(MODEL_DIR, exist_ok=True)

# Save the trained vectorizers, models, thresholds, and metrics
# File names written here are the ones Cell 11 loads back.
joblib.dump(word_vec, f"{MODEL_DIR}/tfidf_word.joblib")
joblib.dump(char_vec, f"{MODEL_DIR}/tfidf_char.joblib")
for name, clf in models.items(): joblib.dump(clf, f"{MODEL_DIR}/clf_{name}.joblib")
with open(f"{MODEL_DIR}/thresholds.json", 'w') as f: json.dump(thresholds, f, indent=2)
metrics.to_csv(f"{MODEL_DIR}/metrics.csv", index=False)
print(f"\nSaved models and metrics to: {MODEL_DIR}")

# Cell 11 — Error analysis (val/test), save misclassifications

This cell performs error analysis on the validation and test sets using the trained baseline models. It calculates performance metrics again and saves misclassified instances (False Positives and False Negatives) to CSV files for further inspection.

In [None]:
import pandas as pd
import joblib
import json
import os
from scipy.sparse import hstack
from sklearn.metrics import precision_recall_fscore_support

# Ensure ML-ready data and model directory exist
# ML_READY_PARQUET comes from Cell 9 and MODEL_DIR from Cell 10.
assert os.path.exists(ML_READY_PARQUET), f"ML-ready data not found at {ML_READY_PARQUET}. Run Cell 9."
assert os.path.exists(MODEL_DIR), f"Model directory not found at {MODEL_DIR}. Run Cell 10."


# Load the ML-ready data and split into validation and test sets
ml = pd.read_parquet(ML_READY_PARQUET)
val   = ml[ml['split']=='val'].copy()
test  = ml[ml['split']=='test'].copy()

# Load the saved vectorizers, models, and thresholds
# Everything is read back from disk, so the saved artifacts are what gets evaluated.
word_vec = joblib.load(f"{MODEL_DIR}/tfidf_word.joblib")
char_vec = joblib.load(f"{MODEL_DIR}/tfidf_char.joblib")
models   = {lbl: joblib.load(f"{MODEL_DIR}/clf_{lbl}.joblib") for lbl in ['relevant','ads','irrelevant','non_visit','spam']}
with open(f"{MODEL_DIR}/thresholds.json") as f: thresholds = json.load(f)

# Define the labels and their corresponding target column names
# Each entry is (short name, target column) — the reverse order of LABELS_BIN in Cell 10.
LABELS = [
    ('relevant','y_relevant'),
    ('ads','y_ads'),
    ('irrelevant','y_irrelevant'),
    ('non_visit','y_non_visit_rant'),
    ('spam','y_spam'),
]

def make_X(df_: pd.DataFrame):
    """Vectorize the ``text`` column with the loaded word and character TF-IDF models.

    Missing text is treated as an empty string. Returns a CSR matrix with the
    word features followed by the character features.
    """
    texts = df_['text'].fillna('')
    blocks = [vectorizer.transform(texts) for vectorizer in (word_vec, char_vec)]
    return hstack(blocks).tocsr()

# Create feature matrices for validation and test sets
# Built once here and passed to eval_split for each split.
Xv = make_X(val)
Xt = make_X(test)

def eval_split(name: str, df: pd.DataFrame, X):
    """Score one data split with every label's model and dump its worst errors.

    Args:
        name: Split name; recorded in the metrics and used in the CSV file names.
        df: Rows of the split, holding the ``'text'`` column and each target column in ``LABELS``.
        X: Feature matrix for ``df``, row-aligned with it.

    Returns:
        pd.DataFrame: One row per label with precision, recall, F1, the number
        of positive examples, and the threshold that was applied.

    Side effects:
        For every label, writes ``misclf_<label>_<name>_FP.csv`` and
        ``misclf_<label>_<name>_FN.csv`` (at most 200 rows each) into ``MODEL_DIR``.
    """
    summary = []
    for lbl, ycol in LABELS:
        truth = df[ycol].astype(int).values
        proba = models[lbl].predict_proba(X)[:, 1]
        cutoff = thresholds[lbl]
        decided = (proba >= cutoff).astype(int)

        # Binary metrics for this label; an undefined ratio is reported as 0
        prec, rec, f1, _ = precision_recall_fscore_support(
            truth, decided, average='binary', zero_division=0
        )
        summary.append({
            'label': lbl,
            'split': name,
            'precision': prec,
            'recall': rec,
            'f1': f1,
            'support_pos': int(truth.sum()),
            'threshold': cutoff,
        })

        # One scored frame serves both error dumps
        scored = df[['text']].assign(y_true=truth, p=proba, y_hat=decided)

        # False positives (true 0, predicted 1): most confident mistakes first
        false_pos = scored[(scored.y_true == 0) & (scored.y_hat == 1)]
        false_pos = false_pos.sort_values('p', ascending=False).head(200)
        false_pos.to_csv(f"{MODEL_DIR}/misclf_{lbl}_{name}_FP.csv", index=False)

        # False negatives (true 1, predicted 0): lowest scores first
        false_neg = scored[(scored.y_true == 1) & (scored.y_hat == 0)]
        false_neg = false_neg.sort_values('p', ascending=True).head(200)
        false_neg.to_csv(f"{MODEL_DIR}/misclf_{lbl}_{name}_FN.csv", index=False)

    return pd.DataFrame(summary)

# Score both held-out splits; eval_split also writes the FP/FN CSVs as it goes
m_val = eval_split('val', val, Xv)
m_test = eval_split('test', test, Xt)

# Stack the per-split rows, then show F1 as a label-by-split table
metrics2 = pd.concat((m_val, m_test), ignore_index=True)
f1_table = metrics2.pivot(index='label', columns='split', values='f1').round(4)
print("F1 Scores from Error Analysis:")
print(f1_table)
print(f"\nSaved misclassification CSVs to: {MODEL_DIR}")

# Cell 12 — Threshold calibration from PR curves (val)

This optional cell recalibrates the classification threshold for each label using the Precision-Recall curve on the validation set, choosing the threshold that maximizes the F1 score. `CALIB_TARGET` is set to `'f1'`, which is currently the only calibration criterion the code implements. The cell uses `make_X` and `LABELS` from Cell 11, so run Cell 11 first. The updated thresholds are saved to `thresholds.json`, overwriting the previous ones. After running this cell, you should re-run Cell 11 to evaluate performance with the new thresholds.

In [None]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.metrics import precision_recall_curve
import joblib

# Ensure necessary files and variables exist
assert os.path.exists(ML_READY_PARQUET), f"ML-ready data not found at {ML_READY_PARQUET}. Run Cell 9."
assert os.path.exists(MODEL_DIR), f"Model directory not found at {MODEL_DIR}. Run Cell 10."
# make_X and LABELS are not defined in this cell; on a fresh kernel give a clear pointer instead of a NameError
assert 'make_X' in globals() and 'LABELS' in globals(), "make_X / LABELS are not defined. Run Cell 11."

# Load validation data
ml = pd.read_parquet(ML_READY_PARQUET)
val   = ml[ml['split']=='val'].copy()

# Load models and current thresholds
models   = {lbl: joblib.load(f"{MODEL_DIR}/clf_{lbl}.joblib") for lbl in ['relevant','ads','irrelevant','non_visit','spam']}
with open(f"{MODEL_DIR}/thresholds.json") as f: thresholds = json.load(f)

# Recalibration target. Only 'f1' is implemented below; reject anything else
# rather than silently calibrating for F1 under a different name.
CALIB_TARGET = 'f1'
if CALIB_TARGET != 'f1':
    raise NotImplementedError(f"CALIB_TARGET={CALIB_TARGET!r} is not supported; only 'f1' is implemented.")

new_thresholds = {} # Dictionary to store the newly calibrated thresholds
rows = [] # List to store calibration results

# Create feature matrix for validation data
Xv = make_X(val) # make_X is defined in Cell 11

# Iterate through each label to calibrate the threshold
for lbl, ycol in LABELS: # LABELS is defined in Cell 11
    y_true = val[ycol].astype(int).values
    t_old = thresholds[lbl]

    # With a single class in the validation targets the PR curve is degenerate:
    # the "best" point would be the lowest score (everything predicted positive),
    # or the F1 array would be all-NaN. Keep the existing threshold in that case.
    if np.unique(y_true).size < 2:
        print(f"[{lbl}] validation targets contain a single class; keeping threshold {t_old}")
        rows.append({'label': lbl, 'old': t_old, 'new': t_old})
        new_thresholds[lbl] = t_old
        continue

    p = models[lbl].predict_proba(Xv)[:,1] # Get predicted probabilities

    # precision_recall_curve returns one more precision/recall value than thresholds;
    # the extra final point (precision=1, recall=0) has no threshold, so drop it to
    # keep the three arrays index-aligned.
    prec, rec, thr = precision_recall_curve(y_true, p)
    prec, rec = prec[:-1], rec[:-1]

    # F1 at each threshold (small epsilon avoids 0/0 when precision and recall are both 0)
    f1 = (2*prec*rec)/(prec+rec + 1e-9)

    # Threshold that maximizes F1; predictions use `p >= threshold`, matching the curve's convention
    i = int(np.argmax(f1))
    t_best = float(thr[i])

    # Store the old and new thresholds
    rows.append({'label': lbl, 'old': t_old, 'new': t_best})
    new_thresholds[lbl] = t_best

# Display the calibration results
calib = pd.DataFrame(rows)
print("Threshold Calibration Results:")
print(calib)

# Save the new thresholds, overwriting the old ones
with open(f"{MODEL_DIR}/thresholds.json", 'w') as f: json.dump(new_thresholds, f, indent=2)
print(f"\nUpdated thresholds.json in: {MODEL_DIR}")

# Note: After running this cell, you should re-run Cell 11 to evaluate performance with the new thresholds.

# Cell 13 — Batch inference utility (score any DataFrame)

This cell provides a utility function `predict_df` to apply the trained baseline models to a new DataFrame for batch inference. It takes a DataFrame and the name of the text column as input, applies the TF-IDF vectorizers, uses the trained Logistic Regression models with the calibrated thresholds to predict labels, and returns the original DataFrame with added columns for predicted probabilities and binary labels.

In [None]:
import pandas as pd
import joblib
import json
import os
from scipy.sparse import hstack
from tqdm.auto import tqdm # Import tqdm

# The trained artifacts live in MODEL_DIR; stop early if Cell 10 has not produced them
assert os.path.exists(MODEL_DIR), f"Model directory not found at {MODEL_DIR}. Run Cell 10."

# (model key, target column) pairs; only the key is needed for inference,
# where it selects the classifier file and the thresholds.json entry.
LABELS = [
    ('relevant',   'y_relevant'),
    ('ads',        'y_ads'),
    ('irrelevant', 'y_irrelevant'),
    ('non_visit',  'y_non_visit_rant'),
    ('spam',       'y_spam'),
]

# Restore the fitted vectorizers and one classifier per label
word_vec = joblib.load(f"{MODEL_DIR}/tfidf_word.joblib")
char_vec = joblib.load(f"{MODEL_DIR}/tfidf_char.joblib")
models = {key: joblib.load(f"{MODEL_DIR}/clf_{key}.joblib") for key, _target in LABELS}

# Read thresholds from disk again so a recalibration done in Cell 12 is picked up
with open(f"{MODEL_DIR}/thresholds.json") as fh:
    thresholds = json.load(fh)

# make_X function is defined in Cell 11, ensure it has been run or define it here if running this cell standalone
def make_X(df_: pd.DataFrame):
    """Turn the ``'text'`` column of ``df_`` into the models' sparse feature matrix.

    Missing text is replaced by an empty string, then the fitted word-level and
    char-level TF-IDF vectorizers are applied and their outputs concatenated
    column-wise (word features first) into a CSR matrix.
    """
    cleaned = df_['text'].fillna('')
    feature_parts = [vec.transform(cleaned) for vec in (word_vec, char_vec)]
    return hstack(feature_parts).tocsr()


def predict_df(df_new: pd.DataFrame, text_col='text', batch=20000):
    """
    Applies the trained baseline models to a new DataFrame for batch inference.

    Args:
        df_new (pd.DataFrame): The input DataFrame to score. It is not modified.
        text_col (str): The name of the column containing the text to be used for prediction.
        batch (int): The batch size for processing the DataFrame to manage memory.

    Returns:
        pd.DataFrame: A new DataFrame with the rows, columns and index of
                      ``df_new`` plus, for each label, the predicted
                      probability (p_<label>) and the binary prediction (<label>).

    Raises:
        AssertionError: If ``text_col`` is not a column of ``df_new``.
    """
    assert text_col in df_new.columns, f"Text column '{text_col}' not found in the input DataFrame."

    N = len(df_new) # Total number of rows in the input DataFrame
    out = [] # One prediction frame per batch, in input row order

    # Process the DataFrame in batches with a progress bar
    for start in tqdm(range(0, N, batch), desc="Scoring batches"):
        # make_X reads a column literally named 'text', so expose the requested
        # column under that name; otherwise text_col would be silently ignored.
        sl = df_new[text_col].iloc[start:start+batch].to_frame(name='text')
        X  = make_X(sl)

        block = {}
        for lbl, _ in LABELS:
            p = models[lbl].predict_proba(X)[:,1] # Probability of the positive class
            block[f'p_{lbl}'] = p
            block[f'{lbl}']   = (p >= thresholds[lbl]).astype(int) # Apply the label's threshold
        out.append(pd.DataFrame(block))

    if out:
        pred = pd.concat(out, ignore_index=True)
    else:
        # Empty input: no batches ran, so build zero-row prediction columns
        # instead of failing in pd.concat on an empty list.
        empty_cols = {}
        for lbl, _ in LABELS:
            empty_cols[f'p_{lbl}'] = pd.Series(dtype=float)
            empty_cols[f'{lbl}']   = pd.Series(dtype=int)
        pred = pd.DataFrame(empty_cols)

    # Batches were produced in row order, so join by position and then restore the
    # caller's index. This avoids label-based re-alignment, which cannot handle
    # duplicate index labels.
    scored = pd.concat([df_new.reset_index(drop=True), pred], axis=1)
    scored.index = df_new.index
    return scored


# Example usage (uncomment to run a small example)
# print("Running example prediction on the first 100 rows of ml data...")
# try:
#     # Ensure ml data is available (from Cell 9)
#     if 'ml' not in globals():
#          assert os.path.exists(ML_READY_PARQUET), f"ML-ready data not found at {ML_READY_PARQUET}. Run Cell 9."
#          ml = pd.read_parquet(ML_READY_PARQUET)
#
#     scored = predict_df(ml.head(100))
#     display(scored.head())
#     print("\nExample prediction complete.")
# except Exception as e:
#     print(f"\nError during example prediction: {e}")

In [None]:
# Apply the trained model to the entire df DataFrame
print("Running prediction on the entire df DataFrame...")

# Ensure df data is available (from Cell 3)
if 'df' not in globals():
     assert os.path.exists(REVIEW_PATH) and os.path.exists(META_PATH), "Review/meta files not found. Run Cell 3."
     # Reload df if not in globals (assuming Cell 3's logic)
     df_reviews = parse_jsonl(REVIEW_PATH)
     df_meta    = parse_jsonl(META_PATH)
     # One metadata row per business so the left merge cannot multiply review rows
     if 'gmap_id' in df_meta.columns:
         df_meta = df_meta.drop_duplicates(subset=['gmap_id'])
     df = df_reviews.merge(df_meta, on='gmap_id', how='left')
     # Drop repeated reviews, comparing only the identifying columns that are present
     keep_cols = [c for c in ['user_id','gmap_id','text','time','rating'] if c in df.columns]
     if keep_cols:
         df = df.drop_duplicates(subset=keep_cols, keep='first')
     # The merge suffixes clashing 'name' columns; give them descriptive names
     ren = {}
     if 'name_x' in df.columns: ren['name_x'] = 'reviewer_name'
     if 'name_y' in df.columns: ren['name_y'] = 'business_name'
     if ren:
         df = df.rename(columns=ren)


try:
    scored_df = predict_df(df)
except Exception as e:
    # Report the failure, then re-raise: the cells below read scored_df, and
    # continuing would leave it undefined or stale from an earlier run.
    print(f"\nError during prediction on entire df: {e}")
    raise

print("\nPrediction complete. Displaying head of scored DataFrame:")
display(scored_df.head())
print("\nScored DataFrame shape:", scored_df.shape)

In [None]:
# Print how many rows were predicted 0 vs 1 for each label column
for label_col in ('relevant', 'ads', 'irrelevant', 'non_visit', 'spam'):
    print(scored_df[label_col].value_counts())

### Examples of Reviews Predicted as 'ads'

This cell displays the reviewer name, rating, full text, and business name for the first 10 reviews that were predicted to contain advertisements.

In [None]:
# Display examples of reviews predicted as 'ads'
import pandas as pd
# Lift the column-width cap so the review text is shown in full
pd.set_option('display.max_colwidth', None)

if 'scored_df' not in globals():
    print("The 'scored_df' DataFrame is not available. Please run the prediction cells first.")
else:
    # First 10 rows flagged as 'ads', restricted to the columns worth reading
    ads_rows = scored_df.loc[scored_df['ads'] == 1, ['reviewer_name', 'rating', 'text', 'business_name']]
    display(ads_rows.head(10))

### Examples of Reviews Predicted as 'irrelevant'

This cell displays the reviewer name, rating, full text, and business name for the first 10 reviews that were predicted to be irrelevant.

In [None]:
# Display examples of reviews predicted as 'irrelevant'
import pandas as pd
# Lift the column-width cap so the review text is shown in full
pd.set_option('display.max_colwidth', None)

if 'scored_df' not in globals():
    print("The 'scored_df' DataFrame is not available. Please run the prediction cells first.")
else:
    # First 10 rows flagged as 'irrelevant', restricted to the columns worth reading
    irrelevant_rows = scored_df.loc[scored_df['irrelevant'] == 1, ['reviewer_name', 'rating', 'text', 'business_name']]
    display(irrelevant_rows.head(10))

### Examples of Reviews Predicted as 'non_visit_rant'

This cell displays the reviewer name, rating, full text, and business name for the first 10 reviews that were predicted to be rants without a clear visit.

In [None]:
# Display examples of reviews predicted as 'non_visit_rant'
import pandas as pd
# Lift the column-width cap so the review text is shown in full
pd.set_option('display.max_colwidth', None)

if 'scored_df' not in globals():
    print("The 'scored_df' DataFrame is not available. Please run the prediction cells first.")
else:
    # First 10 rows flagged in the 'non_visit' prediction column, restricted to the columns worth reading
    non_visit_rows = scored_df.loc[scored_df['non_visit'] == 1, ['reviewer_name', 'rating', 'text', 'business_name']]
    display(non_visit_rows.head(10))

### Examples of Reviews Predicted as 'spam'

This cell displays the reviewer name, rating, full text, and business name for the first 10 reviews that were predicted to be spam.

In [None]:
# Display examples of reviews predicted as 'spam'
import pandas as pd
# Lift the column-width cap so the review text is shown in full
pd.set_option('display.max_colwidth', None)

if 'scored_df' not in globals():
    print("The 'scored_df' DataFrame is not available. Please run the prediction cells first.")
else:
    # First 10 rows flagged as 'spam', restricted to the columns worth reading
    spam_rows = scored_df.loc[scored_df['spam'] == 1, ['reviewer_name', 'rating', 'text', 'business_name']]
    display(spam_rows.head(10))