# In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import torch
import random
import joblib
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier

# 📥 Load the per-answer rows, discarding any row without an answer or a score
df = pd.read_csv("/content/sample_data/updated_data_with_points.csv").dropna(
    subset=["Answer", "Assigned Points"]
)

# 📝 Build one "question [SEP] choices [SEP] answer" string per row;
#    missing fields are replaced by placeholder tokens so concatenation never yields NaN
df["input_text"] = (
    df["Question"].fillna("NO_QUESTION")
    + " [SEP] "
    + df["Answer Choices"].fillna("NO_CHOICES")
    + " [SEP] "
    + df["Answer"].fillna("NO_ANSWER")
)

# 📊 Collapse to one row per student: all texts joined by spaces, all points summed
student_texts = (
    df.groupby("Student_Id")["input_text"].apply(" ".join).reset_index()
)
student_scores = (
    df.groupby("Student_Id")["Assigned Points"].sum().reset_index(name="Total_Score")
)

# 🎓 Convert scores to grades
def score_to_grade(p, max_score=25.45):
    """Convert a raw point total into a letter grade.

    The score is expressed as a percentage of ``max_score`` and compared
    against fixed cutoffs: >=90 is "A", >=80 "B", >=70 "C", >=60 "D",
    anything lower is "F".

    Args:
        p: Points earned.
        max_score: Points that correspond to 100 percent.

    Returns:
        One of "A", "B", "C", "D" or "F".
    """
    percent = (p / max_score) * 100
    # Cutoffs are listed from highest to lowest so the first match wins.
    for cutoff, letter in ((90, "A"), (80, "B"), (70, "C"), (60, "D")):
        if percent >= cutoff:
            return letter
    return "F"

# Derive each student's letter grade from their total, then attach it to their text
student_scores["Grade"] = student_scores["Total_Score"].map(score_to_grade)
data = student_texts.merge(student_scores, on="Student_Id")

# 🔁 Balance the dataset with duplication and variation
def slightly_modify_text(text):
    """Return a whitespace-normalized variant of ``text`` for augmentation.

    The text is split on whitespace. Texts of five words or fewer are
    re-joined with single spaces in their original order; longer texts have
    their words shuffled in place with the global ``random`` generator
    before being re-joined.

    Args:
        text: Input string.

    Returns:
        The words of ``text`` joined by single spaces, possibly reordered.
    """
    tokens = text.split()
    if len(tokens) <= 5:
        # Too short to shuffle; only whitespace is normalized.
        return " ".join(tokens)
    random.shuffle(tokens)
    return " ".join(tokens)

def balance_classes(data, label_col="Grade", text_col="input_text"):
    """Oversample minority classes until every class matches the largest one.

    Each class smaller than the largest is grown by repeatedly sampling rows
    (with replacement) from that class and passing the sampled rows' text
    through ``slightly_modify_text``. Rows of the returned frame are grouped
    by label in sorted label order and re-indexed from zero.

    Args:
        data: DataFrame containing ``label_col`` and ``text_col``.
        label_col: Name of the class-label column.
        text_col: Name of the text column that is perturbed in the
            duplicated rows.

    Returns:
        A new DataFrame in which every label occurs equally often. An empty
        input, or one whose labels are all missing, yields an empty frame
        with the same columns instead of raising.
    """
    if data.empty:
        return data.copy()

    target_size = data[label_col].value_counts().max()
    balanced_groups = []
    for _, group in data.groupby(label_col):
        group = group.copy()
        while len(group) < target_size:
            shortfall = target_size - len(group)
            # Never draw more rows than the group currently holds per round,
            # so each round at most doubles the group.
            extra = group.sample(n=min(len(group), shortfall), replace=True)
            extra[text_col] = extra[text_col].apply(slightly_modify_text)
            group = pd.concat([group, extra], ignore_index=True)
        balanced_groups.append(group)

    if not balanced_groups:
        # groupby drops rows whose label is missing; with no groups left
        # pd.concat would raise, so return an empty frame instead.
        return data.iloc[0:0].copy()
    return pd.concat(balanced_groups, ignore_index=True)

# 🔀 Stratified split FIRST, then balance only the training portion.
# Oversampling before the split would put shuffled copies of the same student
# on both sides of the split, leaking test examples into training and
# inflating every metric reported below.
y = data["Grade"]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Stratification needs at least two members per class; grades held by a
# single student are kept out of the split and used for training only.
grade_sizes = data.groupby("Grade")["Grade"].transform("size")
splittable = data[grade_sizes >= 2]
singletons = data[grade_sizes < 2]

train_df, test_df = train_test_split(
    splittable, test_size=0.2, random_state=42, stratify=splittable["Grade"]
)

# 🔁 Balance the training rows only; the test rows stay untouched originals.
train_df = balance_classes(
    pd.concat([train_df, singletons], ignore_index=True), label_col="Grade"
)

X_train_texts, X_test_texts = train_df["input_text"], test_df["input_text"]
y_train_enc = label_encoder.transform(train_df["Grade"])
y_test_enc = label_encoder.transform(test_df["Grade"])

# 🤖 Load the CodeBERT tokenizer and encoder; the encoder is placed on the GPU
#    when one is available and switched to inference mode
model_name = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = (
    RobertaModel.from_pretrained(model_name)
    .to("cuda" if torch.cuda.is_available() else "cpu")
    .eval()
)

# 🔢 Embedding function
def get_embeddings(texts, batch_size=8):
    """Embed texts with the module-level ``tokenizer`` and ``model``.

    Texts are tokenized in batches (padded to the longest item in the batch
    and truncated to 512 tokens), run through the model without gradient
    tracking, and the model's ``pooler_output`` is collected for each batch.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        A 2-D numpy array with one row per input text. Empty input yields an
        array of shape ``(0, hidden_size)`` instead of raising.
    """
    texts = list(texts)
    if not texts:
        # np.vstack refuses an empty list, so return a correctly shaped empty result.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Use the device the model actually lives on rather than re-deriving it
    # from CUDA availability, so inputs always match the model.
    device = next(model.parameters()).device

    batch_embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)
            outputs = model(**inputs)
            batch_embeddings.append(outputs.pooler_output.cpu().numpy())
    return np.vstack(batch_embeddings)

# 💡 Turn the train and test texts into fixed-size embedding matrices
X_train_emb = get_embeddings(X_train_texts.tolist())
X_test_emb = get_embeddings(X_test_texts.tolist())

# 🚀 Fit a gradient-boosted classifier on the training embeddings
clf = XGBClassifier(
    n_estimators=300,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
).fit(X_train_emb, y_train_enc)

# 💾 Persist the classifier together with the encoder needed to decode its predictions
joblib.dump(clf, "/content/sample_data/xgboost_grade_model.pkl")
joblib.dump(label_encoder, "/content/sample_data/label_encoder.pkl")

# 🎯 Predict on the held-out embeddings and map integer codes back to letter grades
y_pred_enc = clf.predict(X_test_emb)
y_test, y_pred = (
    label_encoder.inverse_transform(encoded) for encoded in (y_test_enc, y_pred_enc)
)

print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# 📉 Confusion matrix, with rows and columns in the encoder's class order
labels = label_encoder.classes_
cm = confusion_matrix(y_test, y_pred, labels=labels)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=labels,
    yticklabels=labels,
).set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix (Grades)")
plt.tight_layout()
plt.show()
