In [10]:
# ==========================================
# Phase 1: Data Preparation
#
# Loads all gold-standard CSVs, deduplicates by text (keeping the
# latest human correction), then manages the test lockbox:
#   - First run:  create initial 90/10 split
#   - Later runs: append 10% of NEW data only (never reshuffle)
#
# "New data" = texts not in test lockbox AND not in previous staging.
# Training set = ALL gold data minus the full test lockbox (not just
# new data), so the model always trains on the complete history.
#
# Safe to re-run: if no new data exists, test lockbox is unchanged
# and training set is rebuilt identically from gold minus test.
# ==========================================

import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split
from config import DIRS

STAGING_PATH = f"{DIRS['staging']}/ready_for_training.csv"
TEST_PATH = f"{DIRS['test_lockbox']}/final_test_set.csv"


def load_and_standardize(filepath):
    """Load a gold CSV and normalize to [text, label] schema."""
    df = pd.read_csv(filepath)
    if 'gpt_label' in df.columns and 'label' not in df.columns:
        df = df.rename(columns={'gpt_label': 'label'})
    try:
        return df[['text', 'label']]
    except KeyError:
        print(f"  SKIP: {os.path.basename(filepath)} (missing 'text' or 'label' column)")
        return pd.DataFrame()


# -- Load all gold-standard files --
files = glob.glob(f"{DIRS['gold']}/*.csv")
if not files:
    raise ValueError(f"No files in {DIRS['gold']}/. Run Notebook 01 first.")

print(f"Loading {len(files)} gold file(s)...")
df_list = [load_and_standardize(f) for f in files]
full_dataset = pd.concat(df_list, ignore_index=True)

# Clean: drop NaN rows, normalize labels to lowercase
full_dataset = full_dataset.dropna(subset=['text', 'label'])
full_dataset['label'] = full_dataset['label'].str.lower()

# Deduplicate by text (keep latest = most recent human correction wins)
before_dedup = len(full_dataset)
full_dataset = full_dataset.drop_duplicates(subset=['text'], keep='last')
after_dedup = len(full_dataset)
if before_dedup > after_dedup:
    print(f"Dedup: {before_dedup} -> {after_dedup} (removed {before_dedup - after_dedup} duplicates)")

# -- Test lockbox: create once, then append-only (never reshuffle existing rows) --
if os.path.exists(TEST_PATH):
    existing_test = pd.read_csv(TEST_PATH)
    print(f"Test lockbox exists: {len(existing_test)} rows (locked)")

    # "New" = not in test lockbox AND not in previous training staging.
    # This prevents re-runs from treating old training rows as new data
    # and repeatedly moving them into the test lockbox.
    known_texts = set(existing_test['text'].tolist())
    if os.path.exists(STAGING_PATH):
        known_texts |= set(pd.read_csv(STAGING_PATH)['text'].tolist())

    new_data = full_dataset[~full_dataset['text'].isin(known_texts)]
    print(f"New data: {len(new_data)} rows")

    if len(new_data) > 0:
        _, new_test = train_test_split(new_data, test_size=0.1, random_state=42)
        updated_test = pd.concat([existing_test, new_test], ignore_index=True)
        updated_test.to_csv(TEST_PATH, index=False)
        print(f"Test lockbox updated: {len(existing_test)} -> {len(updated_test)} (+{len(new_test)} appended)")
    else:
        updated_test = existing_test
        print("No new data for test lockbox.")

    # Training set = ALL gold minus the full test lockbox.
    # This ensures the model always trains on complete history, not just new rows.
    train_df = full_dataset[~full_dataset['text'].isin(updated_test['text'])]

else:
    # First run: create initial 90/10 split
    train_df, test_df = train_test_split(full_dataset, test_size=0.1, random_state=42)
    test_df.to_csv(TEST_PATH, index=False)
    print(f"Test lockbox created: {len(test_df)} rows")

# -- Guard: abort early if nothing to train on --
if train_df.empty:
    raise ValueError(
        "Training set is empty â€” all data is already in the test lockbox. "
        "Add new labeled data to 04_gold_standard/ before retraining."
    )

train_df.to_csv(STAGING_PATH, index=False)

print(f"\nData preparation complete:")
print(f"  Test set:  {len(pd.read_csv(TEST_PATH))} rows (locked)")
print(f"  Train set: {len(train_df)} rows (ready)")

Loading 2 gold file(s)...
Test lockbox created: 99 rows

Data preparation complete:
  Test set:  99 rows (locked)
  Train set: 883 rows (ready)


In [11]:
# ==========================================
# Phase 2: Tokenization
#
# Reads training and test CSVs, maps string labels to integers
# (with .str.lower() to prevent silent drops from case mismatch),
# then tokenizes text with the DistilBERT tokenizer.
# ==========================================

import pandas as pd
import os
import datetime
import torch
from datasets import Dataset, ClassLabel
from transformers import AutoTokenizer
from config import DIRS, MODEL_NAME, MAX_LENGTH, LABEL_MAP

TODAY = datetime.date.today().strftime("%Y%m%d")
NOW = datetime.datetime.now().strftime("%H%M%S")
OUTPUT_DIR = f"{DIRS['models']}/v_{TODAY}_{NOW}"

# -- Load data --
df_train = pd.read_csv(f"{DIRS['staging']}/ready_for_training.csv")
df_test = pd.read_csv(f"{DIRS['test_lockbox']}/final_test_set.csv")

# -- Map labels to integers (normalize case first to prevent silent NaN drops) --
df_train['label'] = df_train['label'].str.lower().map(LABEL_MAP)
df_test['label'] = df_test['label'].str.lower().map(LABEL_MAP)

df_train = df_train.dropna(subset=['label', 'text'])
df_test = df_test.dropna(subset=['label', 'text'])

df_train['label'] = df_train['label'].astype(int)
df_test['label'] = df_test['label'].astype(int)

# -- Build HuggingFace datasets with ClassLabel type --
train_ds = Dataset.from_pandas(df_train, preserve_index=False)
test_ds = Dataset.from_pandas(df_test, preserve_index=False)

c_label = ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'])
train_ds = train_ds.cast_column("label", c_label)
test_ds = test_ds.cast_column("label", c_label)

# -- Tokenize text --
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=MAX_LENGTH)

tokenized_train = train_ds.map(tokenize, batched=True)
tokenized_test = test_ds.map(tokenize, batched=True)

tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

print(f"Tokenization complete. Train: {len(df_train)} rows, Test: {len(df_test)} rows")
print(f"Model will be saved to: {OUTPUT_DIR}")

Casting the dataset:   0%|          | 0/883 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/883 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Tokenization complete. Train: 883 rows, Test: 99 rows
Model will be saved to: ./06_models/v_20260218_132249


In [12]:
# ==========================================
# Phase 3: Model Training & Evaluation
#
# Fine-tunes DistilBERT on the prepared dataset, evaluates on the
# locked test set, saves model + model_meta.json with full metrics
# (accuracy, macro F1, per-class precision/recall/F1, confusion matrix),
# then cleans up intermediate checkpoints to prevent disk bloat.
# ==========================================

import json
import shutil
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(labels, preds)}


# -- Load pre-trained model --
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2},
)

# -- Training config --
TEMP_DIR = "./results_temp"

training_args = TrainingArguments(
    output_dir=TEMP_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# -- Train --
print(f"Training started... Target: {OUTPUT_DIR}")
trainer.train()

# -- Save final model + tokenizer --
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# -- Full evaluation on test lockbox --
label_names = ['negative', 'neutral', 'positive']
predictions = trainer.predict(tokenized_test)
pred_ids = predictions.predictions.argmax(-1)
true_ids = predictions.label_ids

# Per-class metrics
report = classification_report(true_ids, pred_ids, target_names=label_names, output_dict=True)
cm = confusion_matrix(true_ids, pred_ids).tolist()

# Print readable report
print("\n" + "=" * 50)
print("Evaluation Report")
print("=" * 50)
print(classification_report(true_ids, pred_ids, target_names=label_names, digits=3))

print("Confusion Matrix (rows=true, cols=predicted):")
import pandas as _pd
cm_df = _pd.DataFrame(cm,
    index=[f"TRUE:{l}" for l in label_names],
    columns=[f"PRED:{l}" for l in label_names])
print(cm_df)

# -- Save model_meta.json with full metrics --
meta = {
    "model_name": os.path.basename(OUTPUT_DIR),
    "accuracy": round(report["accuracy"], 4),
    "macro_f1": round(report["macro avg"]["f1-score"], 4),
    "weighted_f1": round(report["weighted avg"]["f1-score"], 4),
    "per_class": {
        name: {
            "precision": round(report[name]["precision"], 4),
            "recall": round(report[name]["recall"], 4),
            "f1": round(report[name]["f1-score"], 4),
            "support": report[name]["support"],
        }
        for name in label_names
    },
    "confusion_matrix": cm,
    "train_samples": len(df_train),
    "test_samples": len(df_test),
    "base_model": MODEL_NAME,
    "max_length": MAX_LENGTH,
    "created_at": datetime.datetime.now().isoformat(),
}
meta_path = f"{OUTPUT_DIR}/model_meta.json"
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)

# -- Clean up intermediate checkpoints to save disk space --
if os.path.exists(TEMP_DIR):
    shutil.rmtree(TEMP_DIR)
    print(f"\nCleaned up temp checkpoints: {TEMP_DIR}/")

print("-" * 50)
print(f"Training complete!")
print(f"  Accuracy:  {meta['accuracy']:.2%}")
print(f"  Macro F1:  {meta['macro_f1']:.2%}")
print(f"  Model:     {OUTPUT_DIR}")
print("-" * 50)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training started... Target: ./06_models/v_20260218_132249




Epoch,Training Loss,Validation Loss,Accuracy
1,0.7275,0.748451,0.747475
2,0.4971,0.692904,0.777778
3,0.574,0.676605,0.767677





Evaluation Report
              precision    recall  f1-score   support

    negative      0.744     0.800     0.771        40
     neutral      0.864     0.594     0.704        32
    positive      0.735     0.926     0.820        27

    accuracy                          0.768        99
   macro avg      0.781     0.773     0.765        99
weighted avg      0.780     0.768     0.763        99

Confusion Matrix (rows=true, cols=predicted):
               PRED:negative  PRED:neutral  PRED:positive
TRUE:negative             32             3              5
TRUE:neutral               9            19              4
TRUE:positive              2             0             25

Cleaned up temp checkpoints: ./results_temp/
--------------------------------------------------
Training complete!
  Accuracy:  76.77%
  Macro F1:  76.48%
  Model:     ./06_models/v_20260218_132249
--------------------------------------------------
