In [21]:

import os
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sys
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, classification_report,
    confusion_matrix, ConfusionMatrixDisplay
    )
from torch.utils.data import DataLoader, Dataset
from torch.nn.functional import softmax
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from matplotlib.backends.backend_pdf import PdfPages

In [22]:
# Project root. Overridable through the GITHUB_SA_ROOT environment variable so the
# notebook is not tied to a single machine; the original path remains the default.
project_root = os.environ.get("GITHUB_SA_ROOT", "/Users/tayebekavousi/Desktop/github_sa")
# Save the original directory to go back to it later if needed.
# Captured only on the first run: re-running this cell after the chdir below
# would otherwise overwrite it with project_root.
if "original_dir" not in globals():
    original_dir = os.getcwd()
# Change to the project root directory (model and dataset paths below are relative)
os.chdir(project_root)
# Ensure the project root is in the Python path
if project_root not in sys.path:
    sys.path.insert(0, project_root)
# Add current directory (empty string) to path, but only once so that
# re-running the cell does not keep growing sys.path
if '' not in sys.path:
    sys.path.insert(0, '')

print("Environment setup complete!")

Environment setup complete!


In [23]:
# Global settings
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Directory of the saved model for each supported architecture key.
MODEL_PATHS = dict(
    deberta="saved_models/deberta_3class/saved_full_model",
    codebert="saved_models/codebert_3class/saved_full_model",
    distilbert="saved_models/distilbert_3class/saved_full_model",
)

# Which entry of MODEL_PATHS to evaluate: 'codebert', 'deberta' or 'distilbert'.
model_name = "deberta"


In [24]:

class ReviewDataset(Dataset):
    """Torch dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Sequence of texts; each item is converted with ``str()``.
        labels: Sequence of labels; each item is converted with ``int()``.
        tokenizer: Callable invoked with the text and the keyword arguments
            shown in ``__getitem__``; it must return a mapping that contains
            ``"input_ids"`` and ``"attention_mask"`` tensors whose first
            dimension is the (size-1) batch dimension.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise as lists so that items are always looked up by position.
        # A pandas Series would otherwise be indexed by label, which breaks (or
        # silently picks the wrong row) once its index is not 0..n-1.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )
        return {
            # squeeze(0) drops only the batch dimension; a bare squeeze() would
            # also collapse the sequence dimension when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


def _checkpoint_f1(filename):
    """Return the F1 score embedded in a ``model_best_f1_<score>.pt`` filename.

    Returns ``-inf`` when the part between the prefix and the extension is not
    a number, so such files rank below every properly named checkpoint.
    """
    score_text = filename[len("model_best_f1"):-len(".pt")].lstrip("_")
    try:
        return float(score_text)
    except ValueError:
        return float("-inf")


def load_model_and_tokenizer(model_name):
    """
    Load best model checkpoint from .pt file with training metadata included.

    Args:
        model_name: Key into ``MODEL_PATHS`` selecting the model directory.

    Returns:
        ``(model, tokenizer)``. The model has the checkpoint weights loaded,
        is moved to ``DEVICE`` and is switched to eval mode.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODEL_PATHS``.
        FileNotFoundError: If the model directory contains no
            ``model_best_f1*.pt`` checkpoint.
    """
    if model_name not in MODEL_PATHS:
        raise KeyError(
            f"Unknown model '{model_name}'; expected one of {sorted(MODEL_PATHS)}"
        )
    model_dir = MODEL_PATHS[model_name]

    # Load config and tokenizer
    config = AutoConfig.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # Create model architecture (weights come from the checkpoint below)
    model = AutoModelForSequenceClassification.from_config(config)

    # Load best checkpoint file
    best_model_files = [f for f in os.listdir(model_dir) if f.startswith("model_best_f1") and f.endswith(".pt")]
    if not best_model_files:
        raise FileNotFoundError(f"No best model checkpoint found in {model_dir}")

    # os.listdir returns entries in arbitrary order, so taking element [0] is
    # not reproducible when several checkpoints exist. Pick the highest F1
    # encoded in the filename instead; the name itself breaks ties.
    best_model_file = max(best_model_files, key=lambda f: (_checkpoint_f1(f), f))
    best_model_path = os.path.join(model_dir, best_model_file)
    print(f"[INFO] 🔥 Loading best model from: {best_model_path}")

    # NOTE(review): torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    checkpoint = torch.load(best_model_path, map_location=DEVICE)

    # Extract and load only model weights
    model.load_state_dict(checkpoint["model_state_dict"])

    model.to(DEVICE)
    model.eval()

    return model, tokenizer


In [25]:

# Load and clean dataset
df = pd.read_csv("datasets/preprocessed/combined_DeepSentimentSECrossPlatform.csv")
df = df.dropna(subset=["text"]).drop_duplicates(subset=["text"])

# Map sentiment [-1, 0, 1] → [0, 1, 2]
df["label"] = df["sentiment"].map({-1: 0, 0: 1, 1: 2})

# Any sentiment outside {-1, 0, 1} maps to NaN. Fail here with a clear message
# instead of letting NaN labels flow into the split and the dataset.
unmapped_mask = df["label"].isna()
if unmapped_mask.any():
    raise ValueError(
        f"{int(unmapped_mask.sum())} rows have a sentiment outside {{-1, 0, 1}}: "
        f"{df.loc[unmapped_mask, 'sentiment'].unique().tolist()}"
    )

# Extract texts and labels
texts = df["text"].astype(str).tolist()
labels = df["label"].tolist()

# NOTE(review): presumably these splits must mirror the ones used at training
# time (same ratios and random_state) so the test set stays unseen — confirm.
# First: Train (75%) vs Temp (25%)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.25, stratify=labels, random_state=42
)

# Then: Validation (15%) and Test (10%) from Temp
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.4,  # 40% of 25% = 10% total
    stratify=temp_labels,
    random_state=42
)


In [26]:

model, tokenizer = load_model_and_tokenizer(model_name)

# No shuffling: predictions must stay in the same order as test_texts
test_dataset = ReviewDataset(test_texts, test_labels, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32)

all_preds, all_labels, all_probs = [], [], []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = softmax(outputs.logits, dim=1)
        preds = torch.argmax(probs, dim=1)

        all_preds.extend(preds.cpu().tolist())
        # The labels are only recorded, never fed to the model, so they are
        # read straight from the batch: no device transfer, and the
        # notebook-level `labels` list is not overwritten by a batch tensor.
        all_labels.extend(batch["labels"].tolist())
        all_probs.extend(probs.cpu().tolist())

# Every test example must have exactly one prediction
assert len(all_preds) == len(all_labels) == len(test_texts)


[INFO] 🔥 Loading best model from: saved_models/deberta_3class/saved_full_model/model_best_f1_0.8954.pt


In [27]:
# Save model predictions to CSV for evaluation
test_df = pd.DataFrame({
    "text": test_texts,
    "true_label": all_labels,
    "predicted_label": all_preds,
    # Confidence = probability assigned to the predicted (most likely) class
    "confidence": [max(p) for p in all_probs],
})
test_df["correct"] = test_df["true_label"] == test_df["predicted_label"]

output_path = f"evaluation_result/{model_name}_predictions.csv"
# The output folder may not exist yet on a fresh checkout; create it here so
# this cell does not depend on a later cell having been run first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
test_df.to_csv(output_path, index=False)
print(f"✅ Saved predictions to {output_path}")


✅ Saved predictions to evaluation_result/deberta_predictions.csv


# ✅ Cell 1: Run Inference on Validation Set
## This cell generates val_preds and val_probs using the same tokenizer and model already used for test inference; val_labels comes from the earlier train/validation/test split.

In [28]:

# 📦 Inference on Validation Set
# No shuffling: predictions must stay aligned with val_texts / val_labels
val_dataset = ReviewDataset(val_texts, val_labels, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=32)

val_preds, val_probs = [], []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        # The batch labels are intentionally not read: the ground truth used
        # downstream is val_labels from the split, so moving them to the device
        # (and overwriting the notebook-level `labels`) served no purpose.

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = softmax(outputs.logits, dim=1)
        preds = torch.argmax(probs, dim=1)

        val_preds.extend(preds.cpu().tolist())
        val_probs.extend(probs.cpu().tolist())

# Every validation example must have exactly one prediction
assert len(val_preds) == len(val_labels)


# ✅ Cell 2: Generate Separate PDF Reports
This cell creates:

test_report_{model_name}.pdf

val_report_{model_name}.pdf

Each report includes:

Class distribution plot

Confusion matrix

Normalized matrix

Overall metrics

Class-wise metrics

In [29]:
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Make sure both report folders exist before anything is written into them
for _report_dir in ("evaluation_result", "error_analysis"):
    os.makedirs(_report_dir, exist_ok=True)

# Display names used for the three classes in plots and tables
class_names = ["Negative", "Neutral", "Positive"]

def _table_figure(rows, col_labels, title):
    """Render `rows` as a centered table on an otherwise blank figure and return it."""
    fig, ax = plt.subplots()
    ax.axis("off")
    ax.table(cellText=rows, colLabels=col_labels, loc="center", cellLoc="center").set_fontsize(10)
    ax.set_title(title, fontweight="bold", pad=20)
    return fig


def generate_report(y_true, y_pred, title_prefix):
    """Build the evaluation figures for one data split.

    Args:
        y_true: Ground-truth label ids; id ``i`` is displayed as ``class_names[i]``.
        y_pred: Predicted label ids, aligned with ``y_true``.
        title_prefix: Split name inserted into every figure title (e.g. "Test").

    Returns:
        A list of five matplotlib figures, in this order: class distribution,
        confusion matrix, row-normalized confusion matrix, overall metrics
        table, class-wise metrics table. The figures are left open; the caller
        is expected to save and close them.
    """
    # Pin the label set so every matrix/metric has one entry per class name,
    # even when a class is absent from this split or never predicted.
    label_ids = list(range(len(class_names)))

    # Distribution plot
    fig1, ax1 = plt.subplots()
    # reindex keeps one bar per class (0 for a missing class) so the bars
    # always line up with the tick labels set below.
    class_counts = pd.Series(y_true).value_counts().reindex(label_ids, fill_value=0)
    class_counts.plot(kind="bar", ax=ax1, color=["#1f77b4", "#aec7e8", "#c6dbef"])
    ax1.set_title(f"Class Distribution in {title_prefix} Set")
    ax1.set_xticklabels(class_names, rotation=0)

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    fig2, ax2 = plt.subplots()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax2)
    ax2.set_title(f"{title_prefix} Confusion Matrix")
    ax2.set_xlabel("Predicted")
    ax2.set_ylabel("True")

    # Normalized Confusion Matrix (each row sums to 1; a class with no true
    # samples stays all-zero instead of producing NaN from a division by zero)
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(
        cm.astype("float"),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )
    fig3, ax3 = plt.subplots()
    sns.heatmap(cm_norm, annot=True, fmt=".2f", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax3)
    ax3.set_title(f"{title_prefix} Normalized Confusion Matrix")
    ax3.set_xlabel("Predicted")
    ax3.set_ylabel("True")

    # Overall Metrics Table
    acc = accuracy_score(y_true, y_pred)
    prec_macro = precision_score(y_true, y_pred, labels=label_ids, average="macro", zero_division=0)
    rec_macro = recall_score(y_true, y_pred, labels=label_ids, average="macro", zero_division=0)
    f1_macro = f1_score(y_true, y_pred, labels=label_ids, average="macro", zero_division=0)

    overall = [
        ["Accuracy", f"{acc:.2f}"],
        ["Macro Precision", f"{prec_macro:.2f}"],
        ["Macro Recall", f"{rec_macro:.2f}"],
        ["Macro F1-Score", f"{f1_macro:.2f}"],
    ]
    fig4 = _table_figure(overall, ["Metric", "Score"], f"{title_prefix} Set Overall Metrics")

    # Class-wise Metrics Table
    prec = precision_score(y_true, y_pred, labels=label_ids, average=None, zero_division=0)
    rec = recall_score(y_true, y_pred, labels=label_ids, average=None, zero_division=0)
    f1s = f1_score(y_true, y_pred, labels=label_ids, average=None, zero_division=0)

    classwise = [
        [class_names[i], f"{prec[i]:.2f}", f"{rec[i]:.2f}", f"{f1s[i]:.2f}"]
        for i in label_ids
    ]
    fig5 = _table_figure(
        classwise,
        ["Class", "Precision", "Recall", "F1-Score"],
        f"{title_prefix} Set Class-wise Metrics",
    )

    return [fig1, fig2, fig3, fig4, fig5]

def _write_figures_to_pdf(figures, pdf_path):
    """Save each figure as one page of the PDF at `pdf_path`, closing it afterwards."""
    with PdfPages(pdf_path) as pdf_file:
        for figure in figures:
            pdf_file.savefig(figure)
            plt.close(figure)


# ────────────────────────────────────────
# Save Test Report
test_pdf_path = f"evaluation_result/test_report_{model_name}.pdf"
test_figs = generate_report(all_labels, all_preds, "Test")
_write_figures_to_pdf(test_figs, test_pdf_path)
print(f"[✅] Test report saved to: {test_pdf_path}")

# ────────────────────────────────────────
# Save Validation Report
val_pdf_path = f"evaluation_result/val_report_{model_name}.pdf"
val_figs = generate_report(val_labels, val_preds, "Validation")
_write_figures_to_pdf(val_figs, val_pdf_path)
print(f"[✅] Validation report saved to: {val_pdf_path}")


[✅] Test report saved to: evaluation_result/test_report_deberta.pdf
[✅] Validation report saved to: evaluation_result/val_report_deberta.pdf


In [30]:

# Reload the saved test predictions so this cell works from the CSV alone
test_df = pd.read_csv(f"evaluation_result/{model_name}_predictions.csv")
all_labels = test_df["true_label"]
all_preds = test_df["predicted_label"]
class_names = ["Negative", "Neutral", "Positive"]


# Pin the label ids so the matrix always has one row/column per class name,
# even if a class never occurs in the true or predicted labels.
cm = confusion_matrix(all_labels, all_preds, labels=list(range(len(class_names))))

# Define improved error report path
error_pdf_path = f"evaluation_result/error_analysis_{model_name}.pdf"


# Confusion matrix
fig_err1, ax_err1 = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax_err1)
ax_err1.set_title("Confusion Matrix")
ax_err1.set_xlabel("Predicted Label")
ax_err1.set_ylabel("True Label")

# ─────────────────────────────
# Classification Metrics Table
from sklearn.metrics import classification_report

class_names = ["Negative", "Neutral", "Positive"]
# Pin the label ids so the per-class arrays always have one entry per class name
label_ids = list(range(len(class_names)))

# Compute metrics (each one exactly once)
accuracy = accuracy_score(all_labels, all_preds)
precision_macro = precision_score(all_labels, all_preds, labels=label_ids, average="macro", zero_division=0)
recall_macro = recall_score(all_labels, all_preds, labels=label_ids, average="macro", zero_division=0)
f1_macro = f1_score(all_labels, all_preds, labels=label_ids, average="macro", zero_division=0)

precision_per_class = precision_score(all_labels, all_preds, labels=label_ids, average=None, zero_division=0)
recall_per_class = recall_score(all_labels, all_preds, labels=label_ids, average=None, zero_division=0)
f1_per_class = f1_score(all_labels, all_preds, labels=label_ids, average=None, zero_division=0)

# Older names for the per-class arrays, kept as aliases instead of recomputing
precision, recall, f1 = precision_per_class, recall_per_class, f1_per_class

# ─────────────────────────────
# Table 1: Overall Metrics
fig_overall, ax_overall = plt.subplots()
overall_metrics = [
    ["Accuracy", f"{accuracy:.2f}"],
    ["Macro Precision", f"{precision_macro:.2f}"],
    ["Macro Recall", f"{recall_macro:.2f}"],
    ["Macro F1-Score", f"{f1_macro:.2f}"],
]
ax_overall.axis("off")
table1 = ax_overall.table(
    cellText=overall_metrics,
    colLabels=["Metric", "Score"],
    loc="center",
    cellLoc="center"
)
# Disable auto-sizing so the explicit font size below is applied as given
table1.auto_set_font_size(False)
table1.set_fontsize(10)
ax_overall.set_title("Overall Classification Metrics", fontweight="bold", pad=20)

# ─────────────────────────────
# Table 2: Class-wise Metrics
fig_classwise, ax_classwise = plt.subplots()
class_metrics = [
    [class_names[i], f"{precision_per_class[i]:.2f}", f"{recall_per_class[i]:.2f}", f"{f1_per_class[i]:.2f}"]
    for i in label_ids
]
ax_classwise.axis("off")
table2 = ax_classwise.table(
    cellText=class_metrics,
    colLabels=["Class", "Precision", "Recall", "F1-Score"],
    loc="center",
    cellLoc="center"
)
table2.auto_set_font_size(False)
table2.set_fontsize(10)
ax_classwise.set_title("Per-Class Metrics", fontweight="bold", pad=20)



# Correct vs Incorrect bar chart
fig_err2, ax_err2 = plt.subplots()
# Reindex to [False, True] so there is always exactly one count per bar, in the
# order of correct_labels. Without it, a run where every prediction is correct
# (or every one is wrong) yields a single count that is drawn on both bars.
correct_counts = (
    test_df["correct"].astype(bool).value_counts().reindex([False, True], fill_value=0)
)
correct_labels = ["Incorrect", "Correct"]
bars = ax_err2.bar(correct_labels, correct_counts, color=["#d62728", "#2ca02c"])
ax_err2.set_title("Correct vs Incorrect Predictions")
total_predictions = len(test_df)
for bar in bars:
    # Bar heights are counts; cast so the annotation never shows e.g. "12.0"
    height = int(bar.get_height())
    # Guard the percentage against an empty predictions table
    share = (height / total_predictions * 100) if total_predictions else 0.0
    ax_err2.annotate(f"{height} ({share:.1f}%)",
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom')

# Sample misclassified entries (with ID)
misclassified_df = test_df[~test_df["correct"].astype(bool)].copy()
# "id" is the row position of the example in the predictions table
misclassified_df = misclassified_df.reset_index().rename(columns={"index": "id"})
label_to_class = {0: "Negative", 1: "Neutral", 2: "Positive"}
misclassified_df["true_class"] = misclassified_df["true_label"].map(label_to_class)
misclassified_df["pred_class"] = misclassified_df["predicted_label"].map(label_to_class)
# Fixed random_state keeps the sampled examples the same across runs
if misclassified_df.empty:
    top_errors = misclassified_df
else:
    top_errors = misclassified_df.sample(n=min(10, len(misclassified_df)), random_state=42)

fig_err3, ax_err3 = plt.subplots(figsize=(10, 6))
ax_err3.axis("off")
error_table = None
if top_errors.empty:
    # ax.table cannot be built from zero rows, so show a note on the page instead
    ax_err3.text(
        0.5, 0.5, "No misclassified examples",
        ha="center", va="center", transform=ax_err3.transAxes,
    )
else:
    error_table = ax_err3.table(
        cellText=top_errors[["id", "text", "true_class", "pred_class", "confidence"]].values,
        colLabels=["ID", "Text", "True", "Predicted", "Confidence"],
        cellLoc='left',
        loc='center'
    )
    error_table.auto_set_font_size(False)
    error_table.set_fontsize(8)
    error_table.scale(1, 1.4)
ax_err3.set_title("Sample Misclassified Examples")

# Add confidence bands
bins = np.linspace(0, 1, 6)  # e.g., [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
# Named band_labels (not `labels`) so the dataset-level `labels` is left alone
band_labels = ["0–0.2", "0.2–0.4", "0.4–0.6", "0.6–0.8", "0.8–1.0"]
test_df["confidence_band"] = pd.cut(test_df["confidence"], bins=bins, labels=band_labels, include_lowest=True)

# Group by band and correctness.
# observed=False is passed explicitly: it keeps empty bands on the x-axis and
# avoids relying on the default for categorical group keys.
# Reindexing the columns to [False, True] guarantees both series exist and are
# in the order the colors and legend below assume.
band_stats = (
    test_df.groupby(["confidence_band", "correct"], observed=False)
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[False, True], fill_value=0)
)

# Plot confidence band breakdown
fig_err4, ax_err4 = plt.subplots()
band_stats.plot(kind="bar", stacked=True, color=["#d62728", "#2ca02c"], ax=ax_err4)
ax_err4.set_title("Correct vs Incorrect Predictions by Confidence Band")
ax_err4.set_xlabel("Confidence Band")
ax_err4.set_ylabel("Number of Predictions")
ax_err4.legend(["Incorrect", "Correct"])

# Save all to PDF: one page per figure, in reading order; each figure is
# closed right after it has been written.
report_figures = (
    fig_err1,       # confusion matrix
    fig_overall,    # overall metrics table
    fig_classwise,  # per-class metrics table
    fig_err2,       # correct vs incorrect counts
    fig_err3,       # sampled misclassified examples
    fig_err4,       # correctness by confidence band
)
with PdfPages(error_pdf_path) as pdf:
    for report_fig in report_figures:
        pdf.savefig(report_fig)
        plt.close(report_fig)


print(f"[🛠️] Improved error analysis report saved to {error_pdf_path}")


  band_stats = test_df.groupby(["confidence_band", "correct"]).size().unstack(fill_value=0)


[🛠️] Improved error analysis report saved to evaluation_result/error_analysis_deberta.pdf
