In [None]:
!pip install -U transformers datasets evaluate accelerate -q   

In [None]:
!pip install scikit-learn 
!pip install pandas
!pip install numpy
!pip install matplotlib

In [None]:
import os

# Set ahead of the heavy ML imports in the following cells.
# Presumably this quiets TensorFlow's native (C++) log output; the variable only
# has an effect on libraries that read it.
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "3"})

In [None]:
# Record which transformers release this run used (argument names of
# TrainingArguments/Trainer differ between releases).
import transformers
from transformers import TrainingArguments

print(transformers.__version__)
# The former `help(TrainingArguments)` call was exploratory and flooded the cell
# output with the full class docstring; run `TrainingArguments?` interactively
# when the reference is needed.

In [None]:
import os, json, logging, numpy as np, pandas as pd, matplotlib.pyplot as plt, torch
from datasets import load_dataset
from transformers import ( 
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix


In [None]:
# Paths & Reproducibility

# Every artifact of this run (log file, metrics, figures) is written below OUTPUT_DIR.
OUTPUT_DIR = "outputs/sst2"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fix the seed once, up front, through transformers' set_seed helper.
SEED = 42
set_seed(SEED)


In [None]:
# Logging setup

# Route run-level messages to <OUTPUT_DIR>/training.log, starting from an empty file.
# force=True matters in a notebook: basicConfig() silently does nothing when the
# root logger already has handlers (added by an earlier run of this cell or by an
# imported library), in which case nothing would ever reach training.log.
logging.basicConfig(
    filename=os.path.join(OUTPUT_DIR, "training.log"),
    filemode="w",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    force=True,
)
logging.info("Starting run")


In [None]:
# Load GLUE/SST-2 dataset

# Tokenizer for the same checkpoint name that the model cell loads.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# SST-2 configuration of the GLUE benchmark; the "train" and "validation"
# splits of the returned object are used further down.
dataset = load_dataset(path="glue", name="sst2")

def tokenize_fn(batch):
    """Tokenize the ``sentence`` entries of ``batch`` with the module-level ``tokenizer``.

    Each sequence is truncated to, and padded up to, 128 tokens, as requested by
    the ``padding``/``truncation``/``max_length`` arguments. The tokenizer's
    return value is passed through unchanged.
    """
    sentences = batch["sentence"]
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(sentences, **encode_options)

# Tokenize every split in batches and rename the target column to "labels".
dataset = dataset.map(tokenize_fn, batched=True).rename_column("label", "labels")

# Only these three columns are exposed, as torch tensors, when rows are read.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

train_ds, val_ds = dataset["train"], dataset["validation"]


In [None]:
# Model

# BERT checkpoint with a sequence-classification head configured for two labels.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)



In [None]:
# Metrics function

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over axis 1 is taken
            as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision/recall/F1 use ``average="binary"`` and report 0 instead of
        raising when a denominator is zero (``zero_division=0``).
    """
    y_ref = eval_pred.label_ids
    y_hat = np.argmax(eval_pred.predictions, axis=1)

    scores = {"accuracy": accuracy_score(y_ref, y_hat)}
    prf_support = precision_recall_fscore_support(
        y_ref, y_hat, average="binary", zero_division=0
    )
    # zip stops after three names, so the trailing support entry is ignored.
    scores.update(zip(("precision", "recall", "f1"), prf_support))
    return scores


In [None]:
# Training Args

results_dir = os.path.join(OUTPUT_DIR, "results")
logs_dir = os.path.join(OUTPUT_DIR, "logs")

training_args = TrainingArguments(
    # --- where outputs go / how often to log ---
    output_dir=results_dir,
    logging_dir=logs_dir,
    logging_steps=50,
    report_to="none",
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluate and checkpoint once per epoch, then keep the most accurate model ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)



In [None]:
# Trainer

# Wire the model, training arguments, data splits and metric function together.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword is
# deprecated in recent transformers releases and no longer accepted from v5 on,
# and this notebook installs transformers with `-U` (latest release).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)



In [None]:
# Train

train_out = trainer.train()
logging.info("Training finished")

# Save log history
# `log_history` is the trainer's own list object, not a copy; presumably later
# trainer calls can still append to it (verify before relying on its length).
log_history = trainer.state.log_history
history_csv_path = os.path.join(OUTPUT_DIR, "training_log_history.csv")
history_frame = pd.DataFrame(log_history)
history_frame.to_csv(history_csv_path, index=False)



In [None]:
# Evaluate

# Evaluate on the trainer's eval dataset and keep the metrics both on screen and on disk.
eval_results = trainer.evaluate()
print("Evaluation:", eval_results)

eval_json_path = os.path.join(OUTPUT_DIR, "eval_results.json")
with open(eval_json_path, "w") as json_file:
    json.dump(eval_results, json_file, indent=2)


In [None]:
# Predictions & Confusion Matrix

# Predicted class = arg-max over the per-class scores returned by the trainer.
preds_output = trainer.predict(val_ds)
y_true = preds_output.label_ids
y_pred = np.argmax(preds_output.predictions, axis=1)
cm = confusion_matrix(y_true, y_pred)

classes = ["Negative", "Positive"]
tick_marks = np.arange(2)
# Cells above half of the largest count get white text, the others black.
thresh = cm.max() / 2.0

fig = plt.figure(figsize=(6, 6))
plt.imshow(cm, interpolation="nearest")
plt.title("Confusion Matrix - SST-2 (BERT)")
plt.colorbar()
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Write the raw count into every cell (x = predicted column, y = true row).
for row, col in np.ndindex(*cm.shape):
    count = cm[row, col]
    if count > thresh:
        text_color = "white"
    else:
        text_color = "black"
    plt.text(col, row, f"{count:d}", ha="center", va="center", color=text_color)

plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=200)
# The figure is only saved to disk; close it so it does not linger in memory.
plt.close(fig)


In [None]:
import pandas as pd

# Convert validation split to pandas DataFrame
val_df = dataset["validation"].to_pandas()

# Save sample predictions (first 50 validation rows): sentence, reference label
# and the label predicted in the confusion-matrix cell (`y_pred`).
n_samples = 50
sample_df = pd.DataFrame({
    "sentence": val_df["sentence"].iloc[:n_samples],
    "true_label": val_df["labels"].iloc[:n_samples],
    "pred_label": y_pred[:n_samples],
})

# Save as CSV next to the other run artifacts. The previous hard-coded absolute
# Windows path only existed on one machine and did not match OUTPUT_DIR, where
# the final summary says this file is stored.
sample_df.to_csv(os.path.join(OUTPUT_DIR, "sample_predictions.csv"), index=False)



In [None]:
# Training Loss Plot

hist_df = pd.DataFrame(log_history)

# Keep only rows that carry a training loss, one row per step.
train_loss_df = (
    hist_df.dropna(subset=["loss"])[["step", "loss"]]
    .drop_duplicates(subset=["step"])
)
steps, losses = train_loss_df["step"], train_loss_df["loss"]

fig = plt.figure(figsize=(7, 4))
plt.plot(steps, losses)
plt.title("Training Loss vs Step")
plt.xlabel("Step")
plt.ylabel("Training Loss")
plt.grid(True, linestyle="--", linewidth=0.5)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "training_loss_curve.png"), dpi=200)
plt.close(fig)


In [None]:
# Eval Metrics over Epochs

metrics_cols = ["eval_accuracy", "eval_precision", "eval_recall", "eval_f1", "epoch"]
# Evaluation metrics that were actually logged, in their original order.
metric_names = [c for c in metrics_cols if c != "epoch" and c in hist_df.columns]

# hist_df mixes training-loss rows with evaluation rows, and the training rows
# carry an epoch value too. Filtering on "epoch" alone therefore keeps them:
# their NaN metrics break the plotted line, every logging step becomes an x tick,
# and a training row sharing an epoch value with an evaluation row would win
# drop_duplicates (it comes first) and discard the real metrics.
# Keep only rows holding at least one evaluation metric before de-duplicating.
eval_hist = hist_df.dropna(subset=["epoch"])
if metric_names:
    eval_hist = eval_hist.dropna(subset=metric_names, how="all")
# drop_duplicates keeps the first row per epoch, i.e. the earliest evaluation logged for it.
eval_hist = eval_hist[metric_names + ["epoch"]].drop_duplicates(subset=["epoch"])

for metric in metric_names:
    pretty_name = metric.replace("eval_", "")
    fig = plt.figure(figsize=(6, 4))
    plt.plot(eval_hist["epoch"], eval_hist[metric], marker="o")
    plt.xlabel("Epoch")
    plt.ylabel(pretty_name.title())
    plt.title(f"{metric.replace('eval_', '').upper()} vs Epoch")
    # One tick per evaluated epoch.
    plt.xticks(eval_hist["epoch"])
    plt.grid(True, linestyle="--", linewidth=0.5)
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, f"{metric}_vs_epoch.png"), dpi=200)
    plt.close(fig)



In [None]:
# Bar chart of final eval metrics

# Display label -> key in eval_results; a metric missing from the results is shown as 0.0.
metric_keys = (
    ("Accuracy", "eval_accuracy"),
    ("Precision", "eval_precision"),
    ("Recall", "eval_recall"),
    ("F1", "eval_f1"),
)
final_metrics = {
    label: float(eval_results.get(key, 0.0)) for label, key in metric_keys
}

fig = plt.figure(figsize=(6, 4))
plt.bar(list(final_metrics), list(final_metrics.values()))
plt.ylim(0, 1.0)
plt.title("Final Evaluation Metrics (Validation)")
plt.ylabel("Score")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "final_metrics_bar.png"), dpi=200)
plt.close(fig)

# Closing summary: list the artifact names this notebook reports under OUTPUT_DIR.
run_summary = (
    f" Done! All outputs saved in {OUTPUT_DIR}/\n"
    "- training.log\n"
    "- training_log_history.csv\n"
    "- eval_results.json\n"
    "- confusion_matrix.png\n"
    "- sample_predictions.csv\n"
    "- training_loss_curve.png\n"
    "- eval_accuracy_vs_epoch.png (and precision/recall/f1 variants)\n"
    "- final_metrics_bar.png\n"
)
print(run_summary)