In [None]:
import numpy as np
import os
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import BertTokenizer, BertModel, get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader,  WeightedRandomSampler
import torch
import torch.nn as nn
from tqdm import tqdm
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
import preprocessing_pipeline as prep

# pipeline

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Path to the dataset JSON file. Nothing is loaded here; the path is handed to
# `prep.preprocess_dataset` in the cells below.
file_path = 'assignment_3_ai_tutors_dataset.json'

In [None]:
# Tokenizer used for all BERT experiments.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Build the train/test datasets. The pipeline hands the tokenizer back, so the
# name is rebound to whatever it returns.
# NOTE(review): `is_leniant=True` presumably selects the merged (2-class) label
# scheme, since later cells pair it with num_labels=2 -- confirm in
# preprocessing_pipeline.
(train_dataset, test_dataset, tokenizer, df) = prep.preprocess_dataset(
    file_path,
    tokenizer,
    is_leniant=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def get_class_weights(dataset, task_name):
    """Compute "balanced" class weights for one task.

    Args:
        dataset: Iterable of samples that can be indexed with the key
            ``f"{task_name}_label"``; each label must be convertible to ``int``.
        task_name: Task prefix used to build the label key.

    Returns:
        A 1-D ``torch.float`` tensor holding one weight per class that occurs
        in ``dataset``, in ascending order of class id.
    """
    label_key = f"{task_name}_label"
    labels = []
    for sample in dataset:
        labels.append(int(sample[label_key]))

    # Only classes that are actually present receive a weight.
    present_classes = np.unique(labels)
    weights = compute_class_weight(
        class_weight="balanced",
        classes=present_classes,
        y=labels,
    )
    return torch.tensor(weights, dtype=torch.float)

# class FocalLoss(nn.Module):
#     def __init__(self, alpha=None, gamma=2.0, reduction ='mean'):
#         super(FocalLoss, self).__init__()
#         self.alpha = alpha
#         self.gamma = gamma
#         self.reduction = reduction
#         if alpha is not None:
#             self.alpha = torch.tensor(alpha, dtype=torch.float).to(device)
#         else:
#             self.alpha = None

#     def forward(self, logits, targets):
#         F = nn.functional
#         ce_loss = F.cross_entropy(logits, targets, reduction=self.reduction, weight=self.alpha)
#         pt = torch.exp(-ce_loss)
#         loss = ((1 - pt) ** self.gamma) * ce_loss
#         return loss.mean()

# def compute_metrics(preds, labels):
#     acc = accuracy_score(labels, preds)
#     f1 = f1_score(labels, preds, average='macro')
#     return acc, f1

class LabelSmoothingCrossEntropy(nn.Module):
    """Cross-entropy with label smoothing and optional per-class scaling.

    With ``lp`` the log-softmax of the logits (multiplied element-wise by
    ``weight`` when one is given), the per-sample loss is

        (1 - smoothing) * (-lp[target]) + smoothing * mean_over_classes(-lp)

    and the returned value is the mean of that quantity over the batch.

    Args:
        smoothing: Share of the loss assigned to the uniform-over-classes term.
        weight: Optional tensor with one scaling factor per class.
    """

    def __init__(self, smoothing=0.1, weight=None):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.weight = weight

    def forward(self, logits, target):
        """Return the scalar smoothed loss for ``logits`` (N, C) and ``target`` (N,)."""
        log_probs = nn.functional.log_softmax(logits, dim=-1)

        class_weight = self.weight
        if class_weight is not None:
            # Scale every class column; the weight follows the logits' device.
            log_probs = log_probs * class_weight.to(log_probs.device)

        # Log-probability assigned to the true class of each sample.
        picked = log_probs.gather(dim=-1, index=target[:, None]).squeeze(1)
        nll_loss = -picked
        smooth_loss = -log_probs.mean(dim=-1)

        per_sample = self.confidence * nll_loss + self.smoothing * smooth_loss
        return per_sample.mean()

# BERT

In [None]:
class MultiTaskBERT(nn.Module):
    """BERT encoder shared by four independent classification heads.

    Each task owns a ``Linear -> ReLU -> Linear`` head that reads the
    dropout-regularised pooled encoder output.

    Args:
        model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        hidden_size: Width of the intermediate layer in every head.
        num_labels: Number of output classes of every head.
    """

    def __init__(self, model_name='bert-base-uncased', hidden_size = 512, num_labels=3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        encoder_dim = self.bert.config.hidden_size

        # Intermediate projections h1..h4, one per task (created in this order).
        for idx in range(1, 5):
            setattr(self, f"h{idx}", nn.Linear(encoder_dim, hidden_size))

        # Each task gets its own classifier head.
        for suffix in ("mi", "ml", "pg", "act"):
            setattr(self, f"classifier_{suffix}", nn.Linear(hidden_size, num_labels))

    def forward(self, input_ids, attention_mask):
        """Return a dict mapping each task name to its raw (unnormalised) logits."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoded.pooler_output)

        heads = (
            ("Mistake_Identification", self.h1, self.classifier_mi),
            ("Mistake_Location", self.h2, self.classifier_ml),
            ("Pedagogical_Guidance", self.h3, self.classifier_pg),
            ("Actionability", self.h4, self.classifier_act),
        )
        return {
            task: classifier(nn.functional.relu(hidden(features)))
            for task, hidden, classifier in heads
        }

In [None]:
def train(model, train_loader, optimizer, loss_fun_dict, epochs=10):
    """Fine-tune a multi-task model and print the mean loss of every epoch.

    Args:
        model: Module returning a dict ``{task_name: logits}`` when called with
            ``input_ids`` and ``attention_mask``.
        train_loader: Iterable of batches providing ``"input_ids"``,
            ``"attention_mask"`` and one ``"<task>_label"`` tensor per task.
        optimizer: Optimizer over ``model``'s parameters.
        loss_fun_dict: Maps each task name to a callable ``loss(logits, labels)``.
        epochs: Number of passes over ``train_loader``.

    The model is moved to the notebook-level ``device``. The learning rate
    decays linearly over all steps with no warm-up.
    """
    task_names = (
        "Mistake_Identification",
        "Mistake_Location",
        "Pedagogical_Guidance",
        "Actionability",
    )
    model.to(device)

    batches_per_epoch = len(train_loader)
    lr_scheduler = get_scheduler(
        name='linear',
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=epochs * batches_per_epoch,
    )

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        progress = tqdm(train_loader, desc=f"Training Epoch {epoch+1}")
        for batch in progress:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            labels = {}
            for task in task_names:
                labels[task] = batch[f"{task}_label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # The training objective is the unweighted sum of the task losses.
            loss = sum(
                loss_fun_dict[task](task_logits, labels[task])
                for task, task_logits in outputs.items()
            )

            running_loss += loss.item()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        print(f"Epoch {epoch+1} Loss: {running_loss / len(train_loader):.4f}")


def evaluate(model, val_loader, target_names=None):
    """Print a per-task ``classification_report`` for ``model`` on ``val_loader``.

    The number of classes of each task is read from the width of the model's
    logits, so both 3-class and 2-class models can be evaluated. (The previous
    version always passed three class names and therefore failed for models
    built with ``num_labels=2``.)

    Args:
        model: Module returning a dict ``{task_name: logits}`` when called with
            ``input_ids`` and ``attention_mask``.
        val_loader: Iterable of batches providing ``"input_ids"``,
            ``"attention_mask"`` and one ``"<task>_label"`` tensor per task.
        target_names: Optional display names, one per class id in ascending
            order. When omitted, 3-class tasks use
            ``["Yes", "To some extent", "No"]`` and any other class count is
            reported with the numeric class ids.

    Raises:
        ValueError: If ``target_names`` is given but its length differs from
            the number of classes produced by the model for a task.
    """
    tasks = ("Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability")
    default_three_class_names = ["Yes", "To some extent", "No"]

    model.eval()
    preds = {task: [] for task in tasks}
    labels = {task: [] for task in tasks}
    num_classes = {}

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            for task in tasks:
                logits = outputs[task].detach().cpu()
                num_classes[task] = logits.shape[1]
                preds[task].extend(torch.argmax(logits, dim=1).tolist())
                labels[task].extend(batch[f"{task}_label"].cpu().tolist())

    print("\nEvaluation Metrics (Exact):")
    for task in tasks:
        print(f"\n{task}:")
        if task not in num_classes:
            # The loader yielded no batches, so there is nothing to report.
            print("No samples to evaluate.")
            continue

        class_ids = list(range(num_classes[task]))
        names = target_names
        if names is None and len(class_ids) == len(default_three_class_names):
            names = default_three_class_names
        if names is not None and len(names) != len(class_ids):
            raise ValueError(
                f"target_names has {len(names)} entries but task {task} has "
                f"{len(class_ids)} classes"
            )

        # Passing `labels` explicitly keeps the report well-defined even when a
        # class is absent from both the references and the predictions.
        print(classification_report(labels[task], preds[task], labels=class_ids, target_names=names))

In [None]:
# BERT with 2-class heads, trained with unweighted label-smoothing losses.
model1 = MultiTaskBERT(num_labels = 2)
optimizer = AdamW(model1.parameters(), lr=2e-5)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Class weights from get_class_weights are not used in this run; every task
# gets its own, independently constructed loss object.
loss_fn_dict = {
    task: LabelSmoothingCrossEntropy()
    for task in (
        "Mistake_Identification",
        "Mistake_Location",
        "Pedagogical_Guidance",
        "Actionability",
    )
}

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Fine-tune model1 for 10 epochs using `optimizer` and the per-task losses in `loss_fn_dict`.
train(model1, train_loader, epochs=10, optimizer=optimizer, loss_fun_dict=loss_fn_dict)

Training Epoch 1: 100%|██████████| 140/140 [01:27<00:00,  1.59it/s]


Epoch 1 Loss: 2.3127


Training Epoch 2: 100%|██████████| 140/140 [01:32<00:00,  1.51it/s]


Epoch 2 Loss: 2.2758


Training Epoch 3: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 3 Loss: 2.2359


Training Epoch 4: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 4 Loss: 2.1803


Training Epoch 5: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 5 Loss: 2.1282


Training Epoch 6: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 6 Loss: 2.0757


Training Epoch 7: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 7 Loss: 2.0244


Training Epoch 8: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 8 Loss: 1.9719


Training Epoch 9: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 9 Loss: 1.9548


Training Epoch 10: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]

Epoch 10 Loss: 1.9263





In [None]:
def compute_metrics(model, test_loader, device="cuda"):
    """Compute accuracy and macro-F1 per task over ``test_loader``.

    Args:
        model: Module returning a dict ``{task_name: logits}`` when called with
            ``input_ids`` and ``attention_mask``.
        test_loader: Iterable of batches providing ``"input_ids"``,
            ``"attention_mask"`` and one ``"<task>_label"`` tensor per task.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        ``{task_name: {"accuracy": float, "macro_f1": float}}`` with both
        values rounded to four decimals.
    """
    tasks = ["Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability"]
    model.eval()

    predicted = {name: [] for name in tasks}
    reference = {name: [] for name in tasks}

    with torch.no_grad():
        for batch in test_loader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            for name in tasks:
                # Metrics are computed on the CPU from numpy values.
                reference[name].extend(batch[f"{name}_label"].cpu().numpy())
                task_logits = outputs[name].detach().cpu()
                predicted[name].extend(torch.argmax(task_logits, dim=1).numpy())

    # Accuracy and macro F1 for each task.
    return {
        name: {
            "accuracy": round(accuracy_score(reference[name], predicted[name]), 4),
            "macro_f1": round(f1_score(reference[name], predicted[name], average="macro"), 4),
        }
        for name in tasks
    }

In [None]:
# Score model1 on `test_dataset`. Despite its name, `val_loader` wraps the test split.
val_loader = DataLoader(test_dataset, batch_size=8)
results = compute_metrics(model1, val_loader, device)
# One summary line per task.
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.871, Macro F1 = 0.601
Mistake_Location: Accuracy = 0.7419, Macro F1 = 0.631
Pedagogical_Guidance: Accuracy = 0.7903, Macro F1 = 0.596
Actionability: Accuracy = 0.7218, Macro F1 = 0.6141


In [None]:
# BERT with the default 3-class heads, trained with class-weighted cross-entropy.
# NOTE(review): nn.CrossEntropyLoss needs one weight per logit class, so
# `train_dataset` has to contain all three classes here. The last preprocessing
# call visible above used is_leniant=True -- confirm the intended cell order.
model2 = MultiTaskBERT()
optimizer1 = AdamW(model2.parameters(), lr=2e-5)

# "Balanced" class weights, computed separately for every task.
alphas_mi = get_class_weights(train_dataset, "Mistake_Identification")
alphas_ml = get_class_weights(train_dataset, "Mistake_Location")
alphas_pg = get_class_weights(train_dataset, "Pedagogical_Guidance")
alphas_act = get_class_weights(train_dataset, "Actionability")

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
loss_fn_dict1 = {
    task: nn.CrossEntropyLoss(weight=alphas.to(device), reduction="mean")
    for task, alphas in (
        ("Mistake_Identification", alphas_mi),
        ("Mistake_Location", alphas_ml),
        ("Pedagogical_Guidance", alphas_pg),
        ("Actionability", alphas_act),
    )
}

In [None]:
# Fine-tune model2 for 10 epochs using `optimizer1` and the class-weighted losses in `loss_fn_dict1`.
train(model2, train_loader, epochs=10, optimizer=optimizer1, loss_fun_dict=loss_fn_dict1)

Training Epoch 1: 100%|██████████| 140/140 [01:26<00:00,  1.61it/s]


Epoch 1 Loss: 4.3577


Training Epoch 2: 100%|██████████| 140/140 [01:29<00:00,  1.56it/s]


Epoch 2 Loss: 4.2916


Training Epoch 3: 100%|██████████| 140/140 [01:31<00:00,  1.52it/s]


Epoch 3 Loss: 4.1765


Training Epoch 4: 100%|██████████| 140/140 [01:32<00:00,  1.52it/s]


Epoch 4 Loss: 4.0856


Training Epoch 5: 100%|██████████| 140/140 [01:32<00:00,  1.52it/s]


Epoch 5 Loss: 4.0033


Training Epoch 6: 100%|██████████| 140/140 [01:32<00:00,  1.52it/s]


Epoch 6 Loss: 3.9179


Training Epoch 7: 100%|██████████| 140/140 [01:32<00:00,  1.52it/s]


Epoch 7 Loss: 3.8615


Training Epoch 8: 100%|██████████| 140/140 [01:32<00:00,  1.52it/s]


Epoch 8 Loss: 3.7718


Training Epoch 9: 100%|██████████| 140/140 [01:32<00:00,  1.52it/s]


Epoch 9 Loss: 3.7021


Training Epoch 10: 100%|██████████| 140/140 [01:32<00:00,  1.52it/s]

Epoch 10 Loss: 3.6406





In [None]:
# Score model2 on `test_dataset`. Despite its name, `val_loader` wraps the test split.
val_loader = DataLoader(test_dataset, batch_size=16)
results = compute_metrics(model2, val_loader, device)
# One summary line per task.
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.5565, Macro F1 = 0.4282
Mistake_Location: Accuracy = 0.504, Macro F1 = 0.3912
Pedagogical_Guidance: Accuracy = 0.5, Macro F1 = 0.4545
Actionability: Accuracy = 0.504, Macro F1 = 0.4309


In [None]:
# BERT with 2-class heads, trained with class-weighted cross-entropy.
# `optimizer1` and `loss_fn_dict1` are rebound here for model3.
model3 = MultiTaskBERT(num_labels = 2)
optimizer1 = AdamW(model3.parameters(), lr=2e-5)

# "Balanced" class weights, computed separately for every task.
alphas_mi = get_class_weights(train_dataset, "Mistake_Identification")
alphas_ml = get_class_weights(train_dataset, "Mistake_Location")
alphas_pg = get_class_weights(train_dataset, "Pedagogical_Guidance")
alphas_act = get_class_weights(train_dataset, "Actionability")

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
loss_fn_dict1 = {
    task: nn.CrossEntropyLoss(weight=alphas.to(device), reduction="mean")
    for task, alphas in (
        ("Mistake_Identification", alphas_mi),
        ("Mistake_Location", alphas_ml),
        ("Pedagogical_Guidance", alphas_pg),
        ("Actionability", alphas_act),
    )
}

In [None]:
# Fine-tune model3 for 10 epochs using `optimizer1` and the class-weighted losses in `loss_fn_dict1`.
train(model3, train_loader, epochs=10, optimizer=optimizer1, loss_fun_dict=loss_fn_dict1)

Training Epoch 1: 100%|██████████| 140/140 [01:37<00:00,  1.43it/s]


Epoch 1 Loss: 2.7326


Training Epoch 2: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]


Epoch 2 Loss: 2.6116


Training Epoch 3: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]


Epoch 3 Loss: 2.5314


Training Epoch 4: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]


Epoch 4 Loss: 2.4328


Training Epoch 5: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]


Epoch 5 Loss: 2.3630


Training Epoch 6: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]


Epoch 6 Loss: 2.2355


Training Epoch 7: 100%|██████████| 140/140 [01:37<00:00,  1.44it/s]


Epoch 7 Loss: 2.1494


Training Epoch 8: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]


Epoch 8 Loss: 2.0715


Training Epoch 9: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]


Epoch 9 Loss: 1.9910


Training Epoch 10: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]

Epoch 10 Loss: 1.9650





In [None]:
# Persist model3's parameters (state_dict only, not the full module).
# NOTE(review): the file name says "BCE", but model3 was set up with
# class-weighted nn.CrossEntropyLoss.
torch.save(model3.state_dict(), "bert_with_BCE_weights_lenient.pt")

In [None]:
# Score model3 on `test_dataset`. Despite its name, `val_loader` wraps the test split.
val_loader = DataLoader(test_dataset, batch_size=16)
results = compute_metrics(model3, val_loader, device)
# One summary line per task.
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.7339, Macro F1 = 0.5985
Mistake_Location: Accuracy = 0.6653, Macro F1 = 0.6372
Pedagogical_Guidance: Accuracy = 0.7016, Macro F1 = 0.6022
Actionability: Accuracy = 0.6815, Macro F1 = 0.653


# RoBERTa

In [None]:
from transformers import RobertaModel, RobertaTokenizer

# Switch to the RoBERTa tokenizer; this rebinds the notebook-level `tokenizer`.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Rebuild the datasets with the new tokenizer (overwrites the BERT-tokenized ones).
# `is_leniant` is not passed, so the pipeline's default label scheme applies --
# presumably the exact 3-class labels, since the next model uses num_labels=3; TODO confirm.
train_dataset, test_dataset, tokenizer, df = prep.preprocess_dataset(file_path, tokenizer)

In [None]:
class MultiTaskRoBERTa(nn.Module):
    """RoBERTa encoder shared by four independent classification heads.

    Each task owns a ``Linear -> ReLU -> Linear`` head that reads the
    dropout-regularised hidden state of the first token.

    Args:
        model_name: Checkpoint name passed to ``RobertaModel.from_pretrained``.
        hidden_size: Width of the intermediate layer in every head.
        num_labels: Number of output classes of every head.
    """

    def __init__(self, model_name='roberta-base', hidden_size = 512 ,num_labels=3):
        super(MultiTaskRoBERTa, self).__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        h = self.roberta.config.hidden_size
        self.h1 = nn.Linear(h, hidden_size)
        self.h2 = nn.Linear(h, hidden_size)
        self.h3 = nn.Linear(h, hidden_size)
        self.h4 = nn.Linear(h, hidden_size)

        # Each task gets its own classifier head
        self.classifier_mi = nn.Linear(hidden_size, num_labels)
        self.classifier_ml = nn.Linear(hidden_size, num_labels)
        self.classifier_pg = nn.Linear(hidden_size, num_labels)
        self.classifier_act = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return a dict mapping each task name to its raw (unnormalised) logits.

        The heads output plain logits. The losses used with this model apply
        (log-)softmax themselves, so squashing the outputs through a sigmoid
        first would confine them to (0, 1) and cap how confident the softmax
        can become. ``argmax`` predictions are unaffected by this choice.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 serves as the sentence representation.
        pooled_output = self.dropout(outputs.last_hidden_state[:, 0, :])
        F = nn.functional
        im1 = F.relu(self.h1(pooled_output))
        im2 = F.relu(self.h2(pooled_output))
        im3 = F.relu(self.h3(pooled_output))
        im4 = F.relu(self.h4(pooled_output))

        mi_logits = self.classifier_mi(im1)
        ml_logits = self.classifier_ml(im2)
        pg_logits = self.classifier_pg(im3)
        # Actionability reads its own projection (im4); it previously reused im3,
        # which left h4 without any gradient.
        act_logits = self.classifier_act(im4)

        return {
            "Mistake_Identification": mi_logits,
            "Mistake_Location": ml_logits,
            "Pedagogical_Guidance": pg_logits,
            "Actionability": act_logits
        }

In [None]:
# RoBERTa with 3-class heads, trained with unweighted label-smoothing losses.
model4 = MultiTaskRoBERTa(num_labels = 3)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optimizer1 = AdamW(model4.parameters(), lr=2e-5)

# Every task gets its own, independently constructed loss object.
loss_fn_dict = {
    task: LabelSmoothingCrossEntropy()
    for task in (
        "Mistake_Identification",
        "Mistake_Location",
        "Pedagogical_Guidance",
        "Actionability",
    )
}

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune model4 for 10 epochs using `optimizer1` and the per-task losses in `loss_fn_dict`.
train(model4, train_loader, epochs=10, optimizer=optimizer1, loss_fun_dict=loss_fn_dict)

Training Epoch 1: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]


Epoch 1 Loss: 3.8983


Training Epoch 2: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]


Epoch 2 Loss: 3.7483


Training Epoch 3: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]


Epoch 3 Loss: 3.6960


Training Epoch 4: 100%|██████████| 140/140 [01:40<00:00,  1.39it/s]


Epoch 4 Loss: 3.6620


Training Epoch 5: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]


Epoch 5 Loss: 3.6339


Training Epoch 6: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]


Epoch 6 Loss: 3.6183


Training Epoch 7: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]


Epoch 7 Loss: 3.5891


Training Epoch 8: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]


Epoch 8 Loss: 3.5712


Training Epoch 9: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]


Epoch 9 Loss: 3.5558


Training Epoch 10: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]

Epoch 10 Loss: 3.5505





In [None]:
## RoBERTa, exact labels, label-smoothing loss: metrics on `test_dataset`
# Despite its name, `val_loader` wraps the test split.
val_loader = DataLoader(test_dataset, batch_size=16)
results = compute_metrics(model4, val_loader, device)
# One summary line per task.
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.7863, Macro F1 = 0.3805
Mistake_Location: Accuracy = 0.6331, Macro F1 = 0.3643
Pedagogical_Guidance: Accuracy = 0.5726, Macro F1 = 0.3986
Actionability: Accuracy = 0.5847, Macro F1 = 0.4012


In [None]:
# Rebuild the datasets with the current (RoBERTa) tokenizer and is_leniant=True;
# this overwrites the `train_dataset` / `test_dataset` used by the previous run.
train_dataset, test_dataset, tokenizer, df = prep.preprocess_dataset(file_path, tokenizer, is_leniant=True)

In [None]:
# RoBERTa with 2-class heads, trained with unweighted label-smoothing losses.
model5 = MultiTaskRoBERTa(num_labels = 2)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Bug fix: the optimizer has to hold model5's parameters. It was previously
# built from model4.parameters(), so training never updated model5 -- which is
# why the loss recorded for this run stays flat at ~2.76. Re-run the training
# and evaluation cells below to refresh their outputs.
optimizer1 = AdamW(model5.parameters(), lr=2e-5)

# Every task gets its own, independently constructed loss object.
loss_fn_dict = {
    "Mistake_Identification": LabelSmoothingCrossEntropy(),
    "Mistake_Location": LabelSmoothingCrossEntropy(),
    "Pedagogical_Guidance": LabelSmoothingCrossEntropy(),
    "Actionability": LabelSmoothingCrossEntropy()
}

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune model5 for 10 epochs. `optimizer1` must wrap model5's parameters,
# otherwise this loop computes gradients but never updates model5.
train(model5, train_loader, epochs=10, optimizer=optimizer1, loss_fun_dict=loss_fn_dict)

Training Epoch 1: 100%|██████████| 140/140 [01:34<00:00,  1.49it/s]


Epoch 1 Loss: 2.7600


Training Epoch 2: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 2 Loss: 2.7605


Training Epoch 3: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 3 Loss: 2.7609


Training Epoch 4: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 4 Loss: 2.7606


Training Epoch 5: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 5 Loss: 2.7602


Training Epoch 6: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 6 Loss: 2.7602


Training Epoch 7: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 7 Loss: 2.7610


Training Epoch 8: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 8 Loss: 2.7597


Training Epoch 9: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 9 Loss: 2.7599


Training Epoch 10: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]

Epoch 10 Loss: 2.7602





In [None]:
## RoBERTa, lenient labels, label-smoothing loss: metrics on `test_dataset`
# Despite its name, `val_loader` wraps the test split.
val_loader = DataLoader(test_dataset, batch_size=16)
results = compute_metrics(model5, val_loader, device)
# One summary line per task.
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.8508, Macro F1 = 0.4597
Mistake_Location: Accuracy = 0.7016, Macro F1 = 0.4123
Pedagogical_Guidance: Accuracy = 0.4113, Macro F1 = 0.4013
Actionability: Accuracy = 0.6492, Macro F1 = 0.3936
