In [None]:
!pip install transformers



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Read the grading CSV; each row holds a question, a reference answer,
# a student answer and an averaged score.
df = pd.read_csv("mohler_dataset_edited.csv")

# Text columns are the model inputs, "score_avg" is the regression target.
feature_columns = ["question", "desired_answer", "student_answer"]
X = df[feature_columns]
y = df["score_avg"]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Re-attach the target to each feature split so every split is one frame.
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Sanity check: report the size of every piece of the split.
for message, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(message, part.shape)


X_train shape: (1818, 3)
X_test shape: (455, 3)
y_train shape: (1818,)
y_test shape: (455,)


In [None]:
# Write both splits to disk; index=False leaves the DataFrame index out of
# the files so only the four data columns are stored.
for csv_name, split_frame in (("train.csv", train_data), ("test.csv", test_data)):
    split_frame.to_csv(csv_name, index=False)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the "bert-base-uncased" checkpoint (the same
# checkpoint name BERTRegressor uses by default). This module-level
# `tokenizer` is shared by preprocess(), the datasets and the inference cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess(row):
    """Tokenize a single example with the module-level ``tokenizer``.

    Args:
        row: Any mapping-like object supporting ``row['question']``,
            ``row['desired_answer']`` and ``row['student_answer']``.

    Returns:
        Whatever ``tokenizer`` returns for the combined prompt when asked
        for PyTorch tensors padded/truncated to 512 tokens.
    """
    # All three fields go into one labelled prompt string.
    prompt = (
        f"question: {row['question']} reference: {row['desired_answer']} answer: {row['student_answer']}"
    )
    encoding_options = dict(
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    return tokenizer(prompt, **encoding_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset

class MohlerDataset(Dataset):
    """Map-style dataset that tokenizes one grading example per item.

    Args:
        dataframe: pandas DataFrame providing the columns ``question``,
            ``desired_answer``, ``student_answer`` and ``score_avg``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask``
            tensors carrying a leading batch dimension of 1.
        max_length: Sequence length every example is padded/truncated to.
            Defaults to 512, the value that was previously hard-coded.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized inputs and float label for row ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch dimension
            removed) and ``label`` (0-d float32 tensor from ``score_avg``).
        """
        # Positional lookup, so the frame's index labels do not matter.
        row = self.data.iloc[idx]
        text = (
            f"question: {row['question']} reference: {row['desired_answer']} answer: {row['student_answer']}"
        )
        tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # squeeze(0) drops only the batch dimension added by
            # return_tensors="pt"; a bare squeeze() would also collapse the
            # sequence dimension whenever it happens to have length 1.
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'label': torch.tensor(float(row['score_avg']), dtype=torch.float)
        }


In [None]:
from transformers import AutoModel
import torch.nn as nn

class BERTRegressor(nn.Module):
    """Pretrained encoder followed by a single linear unit for regression.

    Args:
        model_name: Checkpoint identifier passed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name="bert-base-uncased"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        # The head maps the encoder's hidden size to one scalar per example.
        hidden_dim = self.bert.config.hidden_size
        self.regressor = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return one predicted score per example in the batch.

        The encoder's ``pooler_output`` is passed through the linear head
        and the trailing size-1 dimension is removed.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        scores = self.regressor(encoded.pooler_output)
        return scores.squeeze(-1)

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.metrics import mean_squared_error
from tqdm import tqdm


In [None]:
# Training pipeline: CSV -> tokenizing dataset -> shuffled mini-batches of 8.
train_df = pd.read_csv("train.csv")
train_dataset = MohlerDataset(train_df, tokenizer)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Test pipeline: same steps, but batches keep the file order (no shuffle).
test_df = pd.read_csv("test.csv")
test_dataset = MohlerDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8)


In [None]:
# Seed torch before any random work happens (regression-head initialisation
# here, DataLoader shuffling in the training loop) so repeated runs of the
# notebook are comparable. 42 matches the random_state of the data split.
torch.manual_seed(42)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BERTRegressor().to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

# One scheduler step is taken per batch, so the schedule spans every batch
# of every epoch.
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

# Linear schedule with no warm-up steps over the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Mean squared error between predicted and reference scores.
loss_fn = torch.nn.MSELoss()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch.
    model.train()
    train_loss = 0.0

    progress = tqdm(train_loader, desc=f"Training Epoch {epoch+1}")
    for batch in progress:
        # Move the three tensors of this batch to the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        # Backprop, update weights, advance the LR schedule, then clear the
        # gradients so they do not accumulate into the next batch.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accumulate the per-batch loss for the epoch average.
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} - Avg Train Loss: {avg_train_loss:.4f}")


  return forward_call(*args, **kwargs)
Training Epoch 1: 100%|██████████| 228/228 [02:41<00:00,  1.41it/s]



Epoch 1 - Avg Train Loss: 1.9024


Training Epoch 2: 100%|██████████| 228/228 [02:50<00:00,  1.33it/s]



Epoch 2 - Avg Train Loss: 1.0324


Training Epoch 3: 100%|██████████| 228/228 [02:50<00:00,  1.34it/s]


Epoch 3 - Avg Train Loss: 0.7278





In [None]:
# Switch to evaluation mode and collect predictions for the whole test set.
model.eval()
preds, targets = [], []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        labels = batch['label'].to(device)
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

        # Bring results back to the CPU and append them example by example.
        preds.extend(outputs.cpu().numpy())
        targets.extend(labels.cpu().numpy())

# Mean squared error over every test example.
mse = mean_squared_error(targets, preds)
print(f"Test MSE: {mse:.4f}")


Evaluating: 100%|██████████| 57/57 [00:13<00:00,  4.16it/s]

Test MSE: 0.8657





In [None]:
# Persist only the learned parameters (state_dict), not the Python object;
# loading them later requires building a BERTRegressor first and then
# calling load_state_dict on it.
torch.save(model.state_dict(), "bert_asas_mohler.pt")

In [None]:
# Hand-written example (question, reference answer, student answer) that the
# following cells feed to the trained model as a manual check.
question = "What is the answer of capital doom?"
desired_answer = "The capital of doom is cat."
student_answer = "I dont give a fuck"

In [None]:
# Encode the example with the same prompt template and tokenizer settings
# the model was trained on ("question: ... reference: ... answer: ...",
# padded/truncated to 512 tokens) by reusing preprocess(). Feeding a
# differently formatted string (e.g. fields joined by " [SEP] ") gives the
# model an input layout it never saw during fine-tuning.
inputs = preprocess({
    "question": question,
    "desired_answer": desired_answer,
    "student_answer": student_answer,
}).to(device)


In [None]:
# Set evaluation mode here as well: if this cell is run right after the
# training cell (which leaves the model in train mode), train-only layers
# such as dropout would otherwise make the prediction noisy.
model.eval()

with torch.no_grad():  # disable gradient tracking
    output = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

# The batch holds a single example, so the output has exactly one element.
predicted_score = output.item()
print(f"Predicted Score: {predicted_score:.2f}")

Predicted Score: 2.45
