In [None]:
# ==========================================
# STAGE 1: Predict & Triage
#
# Loads the best model (selected by accuracy from model_meta.json),
# runs inference on new data, then splits by confidence threshold:
#   - High confidence (>= threshold) → 07_daily_workspace (auto-pass)
#   - Low confidence  (<  threshold) → 03_manual_review (needs human)
# ==========================================

import json
import pandas as pd
import glob
import os
import datetime
from transformers import pipeline
from config import DIRS, MAX_LENGTH

# -- Config (edit before each run) --
CURRENT_PROJECT = "cold_start"   # sub-folder of DIRS['raw'] that holds this project's CSVs
CONFIDENCE_THRESHOLD = 0.7       # rows scoring below this are routed to manual review
TODAY = datetime.date.today().strftime("%Y%m%d")  # date stamp used in output file names

print(f"Project: [{CURRENT_PROJECT}]")

# -- Load input data --
input_folder = f"{DIRS['raw']}/{CURRENT_PROJECT}"
files = glob.glob(f"{input_folder}/*.csv")

if not files:
    raise FileNotFoundError(f"No CSV found in {input_folder}/. Add data before running.")

# The lexicographically last file name is taken as the input for this run
raw_file = max(files)
print(f"Input: {os.path.basename(raw_file)}")

df = pd.read_csv(raw_file)

# Standardize text column name so downstream stages can rely on 'text'
if 'content' in df.columns and 'text' not in df.columns:
    df = df.rename(columns={'content': 'text'})

# Fail fast with a readable message instead of a bare KeyError('text')
# that would only surface after the model has already been loaded.
if 'text' not in df.columns:
    raise KeyError(
        f"{os.path.basename(raw_file)} has no 'text' or 'content' column "
        f"(found: {list(df.columns)})."
    )

# -- Select best model by accuracy from model_meta.json --
# Candidate models are the v_* directories under DIRS['models'], in name order.
model_dirs = sorted(glob.glob(f"{DIRS['models']}/v_*"))
if not model_dirs:
    raise FileNotFoundError(f"No model found in {DIRS['models']}/. Run Notebook 02 first.")

best_model = None
best_acc = -1

for model_dir in model_dirs:
    meta_path = os.path.join(model_dir, "model_meta.json")
    try:
        with open(meta_path, encoding="utf-8") as meta_file:
            meta = json.load(meta_file)
    except FileNotFoundError:
        # No metadata for this version: it can only be chosen by the fallback below
        continue
    except (OSError, ValueError) as err:
        # Unreadable or malformed JSON (json.JSONDecodeError is a ValueError):
        # treat like missing metadata rather than aborting the whole stage.
        print(f"  Skipping unreadable metadata in {os.path.basename(model_dir)}: {err}")
        continue

    # A missing key counts as accuracy 0; null / non-numeric values are
    # coerced to 0 as well so the comparison below cannot raise TypeError.
    raw_acc = meta.get("accuracy", 0) if isinstance(meta, dict) else 0
    try:
        acc = float(raw_acc)
    except (TypeError, ValueError):
        acc = 0.0

    # Strict '>' keeps the first (lowest-sorted) directory on ties
    if acc > best_acc:
        best_acc = acc
        best_model = model_dir

# Fallback: if no model has metadata, use the newest directory
if best_model is None:
    best_model = model_dirs[-1]

# best_acc stays at its -1 sentinel only when the fallback was used
if best_acc >= 0:
    print(f"Model: {os.path.basename(best_model)} (acc={best_acc:.4f})")
else:
    print(f"Model: {os.path.basename(best_model)} (no metadata)")

# -- Run inference --
# top_k=None makes the pipeline return the score of every class per input,
# so the winning label is picked explicitly below.
classifier = pipeline("text-classification", model=best_model, tokenizer=best_model, top_k=None)
print(f"Predicting {len(df)} rows...")

# NOTE(review): astype(str) turns missing text into the literal string "nan",
# which is then classified like any other input — confirm this is acceptable.
texts = df['text'].astype(str).tolist()

# A header-only CSV produces no texts; skip the model call in that case.
preds = classifier(texts, truncation=True, max_length=MAX_LENGTH) if texts else []

# Keep only the highest-scoring class for each row
results = [
    {"predicted_label": top['label'], "confidence": top['score']}
    for top in (max(scores, key=lambda s: s['score']) for scores in preds)
]

# Explicit columns keep 'predicted_label'/'confidence' present even when there
# are no rows; reusing df.index guarantees row-for-row alignment in the concat.
pred_frame = pd.DataFrame(results, columns=["predicted_label", "confidence"], index=df.index)
df_pred = pd.concat([df, pred_frame], axis=1)

# -- Split by confidence threshold --
# Rows at or above the threshold are auto-passed; rows below it need review.
confidence = df_pred['confidence']
good_df = df_pred[confidence >= CONFIDENCE_THRESHOLD]
bad_df = df_pred[confidence < CONFIDENCE_THRESHOLD]

# Auto-pass → daily workspace (written even when it contains no rows)
good_path = f"{DIRS['workspace']}/{TODAY}_{CURRENT_PROJECT}_auto.csv"
good_df.to_csv(good_path, index=False)

# Low confidence → review queue (no file is written when nothing needs review)
if bad_df.empty:
    print("All rows passed confidence threshold.")
else:
    bad_path = f"{DIRS['review']}/{TODAY}_{CURRENT_PROJECT}_review.csv"
    bad_df.to_csv(bad_path, index=False)
    print(f"Low confidence: {len(bad_df)} rows -> {os.path.basename(bad_path)}")

print(f"STAGE 1 complete. Auto: {len(good_df)}, Review: {len(bad_df)}")

In [None]:
# ==========================================
# STAGE 2: Merge & Deliver
#
# Merges auto-pass data with human corrections. Supports partial
# review: rows with `label` filled = reviewed, rows with `label`
# empty = unreviewed (uses model prediction as fallback).
#
# Input: Stage 1's auto-pass (07_workspace) always has 'predicted_label'.
#        Corrected file (07_workspace) has user-added 'label' column.
#
# Outputs:
#   - 04_gold_standard: only human-reviewed rows (training fuel)
#   - 03_manual_review: unreviewed rows written back as pending
#   - 08_client_reports: complete report with label_source tracking
# ==========================================

import pandas as pd
import glob
import os
import datetime
from config import DIRS

CURRENT_PROJECT = "cold_start"
TODAY = datetime.date.today().strftime("%Y%m%d")

print(f"STAGE 2: Merging project [{CURRENT_PROJECT}]...")

# -- 1. Load auto-pass data from daily workspace --
# Only the auto-pass file stamped with today's date is considered.
auto_pattern = f"{DIRS['workspace']}/{TODAY}_{CURRENT_PROJECT}_auto.csv"
auto_files = glob.glob(auto_pattern)

if not auto_files:
    # Leave an empty frame so the report step can skip this part
    df_auto = pd.DataFrame()
    print("  Auto-pass: not found")
else:
    df_auto = (
        pd.read_csv(auto_files[0])
        # The model prediction becomes the canonical 'label' for report merging
        .rename(columns={'predicted_label': 'label'})
        .assign(label_source='auto')
    )
    print(f"  Auto-pass: {len(df_auto)} rows")

# -- 2. Load corrected file and split by review status --
# Both frames start empty so later steps can test `.empty` regardless of
# whether a corrected file was found.
df_reviewed = pd.DataFrame()
df_pending = pd.DataFrame()

fixed_pattern = f"{DIRS['workspace']}/*{CURRENT_PROJECT}*corrected*.csv"
fixed_files = glob.glob(fixed_pattern)

if fixed_files:
    # The lexicographically last file name is treated as the latest correction
    latest_fixed = max(fixed_files)
    df_corrected = pd.read_csv(latest_fixed)
    print(f"  Corrected file: {len(df_corrected)} rows ({os.path.basename(latest_fixed)})")

    # Split: label filled = human-reviewed, label empty = unreviewed
    if 'label' in df_corrected.columns:
        label_text = df_corrected['label'].astype(str).str.strip()
        mask_reviewed = df_corrected['label'].notna() & (label_text != '')
        df_reviewed = df_corrected[mask_reviewed].copy()
        df_pending = df_corrected[~mask_reviewed].copy()
    else:
        # No label column at all = entirely unreviewed
        df_pending = df_corrected.copy()

    print(f"    Reviewed: {len(df_reviewed)} rows")
    print(f"    Unreviewed: {len(df_pending)} rows")

    if not df_reviewed.empty:
        # Normalise stray whitespace on string labels so " spam " and "spam"
        # do not end up as different classes in the gold set and the report.
        # Non-string labels (e.g. numeric codes) are left untouched.
        df_reviewed['label'] = df_reviewed['label'].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
        df_reviewed['label_source'] = 'human'

    # Fallback for unreviewed: use model prediction as label
    if not df_pending.empty:
        if 'predicted_label' not in df_pending.columns:
            raise KeyError(
                f"{os.path.basename(latest_fixed)} has unreviewed rows but no "
                "'predicted_label' column to fall back on."
            )
        df_pending['label'] = df_pending['predicted_label']
        df_pending['label_source'] = 'model_pending'
else:
    print("  Corrected file: not found")

# -- 3. Gold: archive only human-reviewed rows (training fuel) --
# Nothing is written when no row was reviewed.
if not df_reviewed.empty:
    # Keep just the text/label pair and stamp each row with its provenance
    gold_df = df_reviewed[['text', 'label']].assign(
        gold_origin='daily_corrected',
        created_at=datetime.datetime.now().isoformat(),
    )
    gold_path = f"{DIRS['gold']}/{TODAY}_{CURRENT_PROJECT}_corrected.csv"
    gold_df.to_csv(gold_path, index=False)
    print(f"  Gold: {len(gold_df)} reviewed rows archived")

# -- 4. Pending: write unreviewed rows back to review queue --
if not df_pending.empty:
    # The fallback 'label' and its 'label_source' tag are temporary report
    # columns; remove whichever of them is present before writing back.
    cols_to_drop = ['label', 'label_source']
    df_pending_save = df_pending.drop(columns=cols_to_drop, errors='ignore')
    pending_path = f"{DIRS['review']}/{TODAY}_{CURRENT_PROJECT}_pending.csv"
    df_pending_save.to_csv(pending_path, index=False)
    print(f"  Pending: {len(df_pending_save)} rows written back to review queue")

# -- 5. Final report: merge all parts with label_source tracking --
# Parts are stacked in a fixed order: auto-pass, human-reviewed, pending.
report_parts = [part for part in (df_auto, df_reviewed, df_pending) if not part.empty]

if report_parts:
    # Only columns present in every part make it into the report
    common_cols = set.intersection(*(set(part.columns) for part in report_parts))

    # Deterministic column order: key columns first, then alphabetical
    priority = ['text', 'label', 'label_source', 'confidence', 'predicted_label']
    leading = [col for col in priority if col in common_cols]
    trailing = sorted(common_cols.difference(priority))
    ordered_cols = leading + trailing

    df_final = pd.concat([part[ordered_cols] for part in report_parts], ignore_index=True)

    report_path = f"{DIRS['reports']}/{TODAY}_{CURRENT_PROJECT}_Final_Report.csv"
    df_final.to_csv(report_path, index=False)

    # Row counts per label origin for the summary printout
    src = df_final['label_source'].value_counts().to_dict()
    divider = "-" * 30
    print(divider)
    print(f"STAGE 2 complete: {report_path}")
    print(f"  Total: {len(df_final)} rows")
    print(f"  auto: {src.get('auto', 0)} | human: {src.get('human', 0)} | model_pending: {src.get('model_pending', 0)}")
    print(divider)
else:
    print("ERROR: No data found for report.")