In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# 📥 Load dataset
df = pd.read_csv("../data/processed/updated_data_with_points.csv")
df = df.dropna(subset=["Answer", "Assigned Points"])

# 🎓 Grading function
def score_to_grade(p):
    if p >= 90: return "A"
    elif p >= 80: return "B"
    elif p >= 70: return "C"
    elif p >= 60: return "D"
    else: return "F"

# 🔀 Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df, df["Assigned Points"], test_size=0.2, random_state=42
)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# ⚙️ Device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

# 🤖 Load CodeBERT
model_name = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name).to(device)
model.eval()

Using device: mps


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [18]:
# 🧠 Embedding function
def get_embeddings(texts, batch_size=32):
    embeddings = []
    model.eval()

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True,
                           padding=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        # CLS token: [batch_size, hidden_dim]
        cls_batch = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
        embeddings.extend(cls_batch)

    return np.array(embeddings)

# ✨ Generate embeddings
X_train_emb = get_embeddings(X_train["Answer"].tolist())
X_test_emb = get_embeddings(X_test["Answer"].tolist())

KeyboardInterrupt: 

In [None]:
# 🧠 Train model
regressor = Ridge()
regressor.fit(X_train_emb, X_train["Assigned Points"])

# Save the model
import joblib
joblib.dump(regressor, "codebert_ridge_model.pkl")

# 🔮 Predict 
y_pred = regressor.predict(X_test_emb)
X_test["Predicted Score"] = y_pred

# 📊 Aggregate per student
results_df = X_test.groupby("Student_Id", as_index=False).agg({
    "Predicted Score": "sum",
    "Answer": "count"  # count = number of answers = max score
})
results_df.rename(columns={"Answer": "Answer Count"}, inplace=True)
results_df["Percent Score"] = (results_df["Predicted Score"] / results_df["Answer Count"]) * 100
results_df["Predicted Grade"] = results_df["Percent Score"].apply(score_to_grade)

# 🎯 Actual scores per student
actual_df = X_test.groupby("Student_Id", as_index=False).agg({
    "Assigned Points": "sum",
    "Answer": "count"
})
actual_df.rename(columns={"Answer": "Answer Count"}, inplace=True)
actual_df["Percent"] = (actual_df["Assigned Points"] / actual_df["Answer Count"]) * 100
actual_df["True Grade"] = actual_df["Percent"].apply(score_to_grade)

# 🔁 Merge & report
final_df = results_df.merge(actual_df[["Student_Id", "True Grade"]], on="Student_Id")
print("📋 Classification Report (Predicted vs Actual Grades):")
print(classification_report(final_df["True Grade"], final_df["Predicted Grade"]))

Using device: mps


KeyboardInterrupt: 