In [None]:
import numpy as np
import os
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import BertTokenizer, BertModel, get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader,  WeightedRandomSampler
import torch
import torch.nn as nn
from tqdm import tqdm
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
from preprocessing_pipeline import preprocess_dataset
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# pipeline

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
# Load into a pandas dataframe
file_path = 'assignment_3_ai_tutors_dataset.json'

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset, test_dataset, tokenizer, df = preprocess_dataset(file_path,tokenizer, is_leniant=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def get_class_weights(dataset, task_name):
    # Extract labels as a flat list of ints
    labels = [int(sample[f"{task_name}_label"]) for sample in dataset]

    # Get unique classes from the dataset
    unique_classes = np.unique(labels)

    # Compute class weights
    class_weights = compute_class_weight(class_weight="balanced", classes=unique_classes, y=labels)

    return torch.tensor(class_weights, dtype=torch.float)

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction ='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        if alpha is not None:
            self.alpha = torch.tensor(alpha, dtype=torch.float).to(device)
        else:
            self.alpha = None

    def forward(self, logits, targets):
        F = nn.functional
        ce_loss = F.cross_entropy(logits, targets, reduction=self.reduction, weight=self.alpha)
        pt = torch.exp(-ce_loss)
        loss = ((1 - pt) ** self.gamma) * ce_loss
        return loss.mean()

# def compute_metrics(preds, labels):
#     acc = accuracy_score(labels, preds)
#     f1 = f1_score(labels, preds, average='macro')
#     return acc, f1

# Bert

In [None]:
class MultiTaskBERT(nn.Module):
    def __init__(self, model_name='bert-base-uncased', hidden_size = 512, num_labels=3):
        super(MultiTaskBERT, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        h = self.bert.config.hidden_size
        self.h1 = nn.Linear(h, hidden_size)
        self.h2 = nn.Linear(h, hidden_size)
        self.h3 = nn.Linear(h, hidden_size)
        self.h4 = nn.Linear(h, hidden_size)

        # Each task gets its own classifier head
        self.classifier_mi = nn.Linear(hidden_size, num_labels)
        self.classifier_ml = nn.Linear(hidden_size, num_labels)
        self.classifier_pg = nn.Linear(hidden_size, num_labels)
        self.classifier_act = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        F = nn.functional
        im1 = F.relu(self.h1(pooled_output))
        im2 = F.relu(self.h2(pooled_output))
        im3 = F.relu(self.h3(pooled_output))
        im4 = F.relu(self.h4(pooled_output))

        mi_logits = F.sigmoid(self.classifier_mi(im1))
        ml_logits = F.sigmoid(self.classifier_ml(im2))
        pg_logits = F.sigmoid(self.classifier_pg(im3))
        act_logits = F.sigmoid(self.classifier_act(im4))

        return {
            "Mistake_Identification": mi_logits,
            "Mistake_Location": ml_logits,
            "Pedagogical_Guidance": pg_logits,
            "Actionability": act_logits
        }

In [None]:
def train(model, train_loader,optimizer,loss_fun_dict,epochs=10):
    model.to(device)

    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        name = 'linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            labels = {
                task: batch[f"{task}_label"].to(device) for task in [
                    "Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability"]
            }

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = 0
            for task in outputs:
                logits = outputs[task]
                task_labels = labels[task]
                loss += loss_fun_dict[task](logits, task_labels)

            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")


def evaluate(model, val_loader):
    model.eval()
    preds = {task: [] for task in ["Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability"]}
    labels = {task: [] for task in ["Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability"]}

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            batch_labels = {
                task: batch[f"{task}_label"].cpu().numpy() for task in preds
            }

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            for task in preds:
                logits = outputs[task].detach().cpu()
                task_preds = torch.argmax(logits, dim=1).numpy()
                preds[task].extend(task_preds)
                labels[task].extend(batch_labels[task])

    print("\nEvaluation Metrics (Exact):")
    for task in preds:
        print(f"\n{task}:")
        print(classification_report(labels[task], preds[task], target_names=["Yes", "To some extent", "No"]))

In [None]:
model1 = MultiTaskBERT()
optimizer = AdamW(model1.parameters(), lr=2e-5)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# alphas_mi = get_class_weights(train_dataset, "Mistake_Identification")
# alphas_ml = get_class_weights(train_dataset, "Mistake_Location")
# alphas_pg = get_class_weights(train_dataset, "Pedagogical_Guidance")
# alphas_act = get_class_weights(train_dataset, "Actionability")

loss_fn_dict = {
    "Mistake_Identification": FocalLoss(reduction="mean", gamma=1.7),
    "Mistake_Location": FocalLoss(reduction="mean", gamma = 1.7),
    "Pedagogical_Guidance": FocalLoss(reduction="mean", gamma = 1.7),
    "Actionability": FocalLoss(reduction="mean", gamma = 1.7)
}

In [None]:
train(model1, train_loader, epochs=10, optimizer=optimizer, loss_fun_dict=loss_fn_dict)

Training Epoch 1: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]


Epoch 1 Loss: 1.6787


Training Epoch 2: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 2 Loss: 1.5558


Training Epoch 3: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 3 Loss: 1.5109


Training Epoch 4: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 4 Loss: 1.4742


Training Epoch 5: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 5 Loss: 1.4364


Training Epoch 6: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 6 Loss: 1.4177


Training Epoch 7: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 7 Loss: 1.3960


Training Epoch 8: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 8 Loss: 1.3775


Training Epoch 9: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]


Epoch 9 Loss: 1.3664


Training Epoch 10: 100%|██████████| 140/140 [01:33<00:00,  1.50it/s]

Epoch 10 Loss: 1.3611





In [None]:
## Focal loss without alphas on exact
val_loader = DataLoader(test_dataset, batch_size=8)
results = compute_metrics(model1, val_loader, device)
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.7863, Macro F1 = 0.3783
Mistake_Location: Accuracy = 0.625, Macro F1 = 0.3992
Pedagogical_Guidance: Accuracy = 0.5847, Macro F1 = 0.447
Actionability: Accuracy = 0.5887, Macro F1 = 0.3939


In [None]:
torch.save(model1.state_dict(), 'bert_with_focal_loss_noalpha_lenient.pt')

In [None]:
def compute_metrics(model, test_loader, device="cuda"):
    model.eval()
    tasks = ["Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability"]
    preds = {task: [] for task in tasks}
    labels = {task: [] for task in tasks}

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Move labels to CPU for metric calculation
            batch_labels = {task: batch[f"{task}_label"].cpu().numpy() for task in tasks}
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            for task in tasks:
                logits = outputs[task].detach().cpu()
                task_preds = torch.argmax(logits, dim=1).numpy()
                preds[task].extend(task_preds)
                labels[task].extend(batch_labels[task])

    # Compute accuracy and macro F1 for each task
    results = {}
    for task in tasks:
        y_true = labels[task]
        y_pred = preds[task]
        accuracy = accuracy_score(y_true, y_pred)
        macro_f1 = f1_score(y_true, y_pred, average="macro")
        results[task] = {
            "accuracy": round(accuracy, 4),
            "macro_f1": round(macro_f1, 4)
        }

    return results

In [None]:
## Focal loss with alphas on lenient
val_loader = DataLoader(test_dataset, batch_size=8)
results = compute_metrics(model1, val_loader, device)
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.625, Macro F1 = 0.5267
Mistake_Location: Accuracy = 0.6411, Macro F1 = 0.6164
Pedagogical_Guidance: Accuracy = 0.6613, Macro F1 = 0.5856
Actionability: Accuracy = 0.6653, Macro F1 = 0.6336


In [None]:
## Focal loss without alphas on lenient
val_loader = DataLoader(test_dataset, batch_size=8)
results = compute_metrics(model1, val_loader, device)
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.8508, Macro F1 = 0.5289
Mistake_Location: Accuracy = 0.7298, Macro F1 = 0.6253
Pedagogical_Guidance: Accuracy = 0.7984, Macro F1 = 0.6029
Actionability: Accuracy = 0.7056, Macro F1 = 0.6096


In [None]:
## Focal loss without alphas on extreme
val_loader = DataLoader(test_dataset, batch_size=8)
results = compute_metrics(model1, val_loader, device)
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.7863, Macro F1 = 0.3783
Mistake_Location: Accuracy = 0.6371, Macro F1 = 0.4025
Pedagogical_Guidance: Accuracy = 0.5565, Macro F1 = 0.3584
Actionability: Accuracy = 0.5, Macro F1 = 0.2762


In [None]:
model2 = MultiTaskBERT()
# train_loader1 = DataLoader(train_dataset, batch_size=16, shuffle=True)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optimizer1 = AdamW(model2.parameters(), lr=2e-5)
loss_fn_dict1 = {
    "Mistake_Identification": nn.CrossEntropyLoss(reduction="mean"),
    "Mistake_Location": nn.CrossEntropyLoss(reduction="mean"),
    "Pedagogical_Guidance": nn.CrossEntropyLoss(reduction="mean"),
    "Actionability": nn.CrossEntropyLoss(reduction="mean")
}

In [None]:
train(model2, train_loader, epochs=10, optimizer=optimizer1, loss_fun_dict=loss_fn_dict1)

Training Epoch 1: 100%|██████████| 140/140 [01:30<00:00,  1.56it/s]


Epoch 1 Loss: 3.5487


Training Epoch 2: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]


Epoch 2 Loss: 3.4101


Training Epoch 3: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]


Epoch 3 Loss: 3.3107


Training Epoch 4: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]


Epoch 4 Loss: 3.2372


Training Epoch 5: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]


Epoch 5 Loss: 3.1410


Training Epoch 6: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]


Epoch 6 Loss: 3.0383


Training Epoch 7: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]


Epoch 7 Loss: 2.9573


Training Epoch 8: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]


Epoch 8 Loss: 2.8805


Training Epoch 9: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]


Epoch 9 Loss: 2.8330


Training Epoch 10: 100%|██████████| 140/140 [01:31<00:00,  1.53it/s]

Epoch 10 Loss: 2.7667





In [None]:
torch.save(model2.state_dict(), "bert_with_BCE.pt")

In [None]:
val_loader = DataLoader(test_dataset, batch_size=16)
results = compute_metrics(model2, val_loader, device)
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.8024, Macro F1 = 0.4913
Mistake_Location: Accuracy = 0.621, Macro F1 = 0.4056
Pedagogical_Guidance: Accuracy = 0.5605, Macro F1 = 0.4409
Actionability: Accuracy = 0.5726, Macro F1 = 0.3977


In [None]:
model3 = MultiTaskBERT(num_labels = 2)
# train_loader1 = DataLoader(train_dataset, batch_size=16, shuffle=True)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optimizer1 = AdamW(model3.parameters(), lr=2e-5)
loss_fn_dict1 = {
    "Mistake_Identification": nn.CrossEntropyLoss(reduction="mean"),
    "Mistake_Location": nn.CrossEntropyLoss(reduction="mean"),
    "Pedagogical_Guidance": nn.CrossEntropyLoss(reduction="mean"),
    "Actionability": nn.CrossEntropyLoss(reduction="mean")
}

In [None]:
train(model3, train_loader, epochs=10, optimizer=optimizer1, loss_fun_dict=loss_fn_dict1)

Training Epoch 1: 100%|██████████| 140/140 [01:39<00:00,  1.40it/s]


Epoch 1 Loss: 2.2220


Training Epoch 2: 100%|██████████| 140/140 [01:40<00:00,  1.39it/s]


Epoch 2 Loss: 2.1453


Training Epoch 3: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]


Epoch 3 Loss: 2.0593


Training Epoch 4: 100%|██████████| 140/140 [01:40<00:00,  1.39it/s]


Epoch 4 Loss: 1.9873


Training Epoch 5: 100%|██████████| 140/140 [01:40<00:00,  1.39it/s]


Epoch 5 Loss: 1.9371


Training Epoch 6: 100%|██████████| 140/140 [01:40<00:00,  1.39it/s]


Epoch 6 Loss: 1.8907


Training Epoch 7: 100%|██████████| 140/140 [01:40<00:00,  1.40it/s]


Epoch 7 Loss: 1.8599


Training Epoch 8: 100%|██████████| 140/140 [01:40<00:00,  1.39it/s]


Epoch 8 Loss: 1.8358


Training Epoch 9: 100%|██████████| 140/140 [01:40<00:00,  1.39it/s]


Epoch 9 Loss: 1.7933


Training Epoch 10: 100%|██████████| 140/140 [01:40<00:00,  1.39it/s]

Epoch 10 Loss: 1.7931





In [None]:
torch.save(model3.state_dict(), "bert_with_BCE_lenient.pt")

In [None]:
## BCE_lenient
val_loader = DataLoader(test_dataset, batch_size=16)
results = compute_metrics(model3, val_loader, device)
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.871, Macro F1 = 0.6166
Mistake_Location: Accuracy = 0.7258, Macro F1 = 0.6456
Pedagogical_Guidance: Accuracy = 0.8024, Macro F1 = 0.6385
Actionability: Accuracy = 0.7218, Macro F1 = 0.6521


In [None]:
# num_labels = 2
val_loader = DataLoader(test_dataset, batch_size=16)
results = compute_metrics(model3, val_loader, device)
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.8468, Macro F1 = 0.4585
Mistake_Location: Accuracy = 0.7137, Macro F1 = 0.5052
Pedagogical_Guidance: Accuracy = 0.7863, Macro F1 = 0.5644
Actionability: Accuracy = 0.7137, Macro F1 = 0.6075


# Roberta

In [None]:
from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_dataset, test_dataset, tokenizer, df = preprocess_dataset(file_path, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
class MultiTaskRoBERTa(nn.Module):
    def __init__(self, model_name='roberta-base', hidden_size = 512 ,num_labels=3):
        super(MultiTaskRoBERTa, self).__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        h = self.roberta.config.hidden_size
        self.h1 = nn.Linear(h, hidden_size)
        self.h2 = nn.Linear(h, hidden_size)
        self.h3 = nn.Linear(h, hidden_size)
        self.h4 = nn.Linear(h, hidden_size)

        # Each task gets its own classifier head
        self.classifier_mi = nn.Linear(hidden_size, num_labels)
        self.classifier_ml = nn.Linear(hidden_size, num_labels)
        self.classifier_pg = nn.Linear(hidden_size, num_labels)
        self.classifier_act = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.last_hidden_state[:, 0, :])
        F = nn.functional
        im1 = F.relu(self.h1(pooled_output))
        im2 = F.relu(self.h2(pooled_output))
        im3 = F.relu(self.h3(pooled_output))
        im4 = F.relu(self.h4(pooled_output))

        mi_logits = F.sigmoid(self.classifier_mi(im1))
        ml_logits = F.sigmoid(self.classifier_ml(im2))
        pg_logits = F.sigmoid(self.classifier_pg(im3))
        act_logits = F.sigmoid(self.classifier_act(im3))

        return {
            "Mistake_Identification": mi_logits,
            "Mistake_Location": ml_logits,
            "Pedagogical_Guidance": pg_logits,
            "Actionability": act_logits
        }

In [None]:
model4 = MultiTaskRoBERTa(num_labels = 3)
# train_loader1 = DataLoader(train_dataset, batch_size=16, shuffle=True)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optimizer1 = AdamW(model4.parameters(), lr=2e-5)
loss_fn_dict1 = {
    "Mistake_Identification": nn.CrossEntropyLoss(reduction="mean"),
    "Mistake_Location": nn.CrossEntropyLoss(reduction="mean"),
    "Pedagogical_Guidance": nn.CrossEntropyLoss(reduction="mean"),
    "Actionability": nn.CrossEntropyLoss(reduction="mean")
}

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train(model4, train_loader, epochs=10, optimizer=optimizer1, loss_fun_dict=loss_fn_dict1)

Training Epoch 1: 100%|██████████| 140/140 [01:25<00:00,  1.64it/s]


Epoch 1 Loss: 3.7877


Training Epoch 2: 100%|██████████| 140/140 [01:27<00:00,  1.60it/s]


Epoch 2 Loss: 3.6371


Training Epoch 3: 100%|██████████| 140/140 [01:28<00:00,  1.59it/s]


Epoch 3 Loss: 3.5788


Training Epoch 4: 100%|██████████| 140/140 [01:28<00:00,  1.58it/s]


Epoch 4 Loss: 3.5309


Training Epoch 5: 100%|██████████| 140/140 [01:29<00:00,  1.57it/s]


Epoch 5 Loss: 3.4946


Training Epoch 6: 100%|██████████| 140/140 [01:29<00:00,  1.56it/s]


Epoch 6 Loss: 3.4807


Training Epoch 7: 100%|██████████| 140/140 [01:29<00:00,  1.57it/s]


Epoch 7 Loss: 3.4412


Training Epoch 8: 100%|██████████| 140/140 [01:28<00:00,  1.57it/s]


Epoch 8 Loss: 3.4086


Training Epoch 9: 100%|██████████| 140/140 [01:28<00:00,  1.57it/s]


Epoch 9 Loss: 3.3900


Training Epoch 10: 100%|██████████| 140/140 [01:29<00:00,  1.56it/s]

Epoch 10 Loss: 3.3695





In [None]:
## Roberta with CE no alpha exact
val_loader = DataLoader(test_dataset, batch_size=16)
results = compute_metrics(model4, val_loader, device)
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.7903, Macro F1 = 0.3988
Mistake_Location: Accuracy = 0.6371, Macro F1 = 0.407
Pedagogical_Guidance: Accuracy = 0.5685, Macro F1 = 0.4358
Actionability: Accuracy = 0.5685, Macro F1 = 0.3864


In [None]:
model5 = MultiTaskRoBERTa(num_labels = 3)
optimizer = AdamW(model5.parameters(), lr=2e-5)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# alphas_mi = get_class_weights(train_dataset, "Mistake_Identification")
# alphas_ml = get_class_weights(train_dataset, "Mistake_Location")
# alphas_pg = get_class_weights(train_dataset, "Pedagogical_Guidance")
# alphas_act = get_class_weights(train_dataset, "Actionability")

loss_fn_dict = {
    "Mistake_Identification": FocalLoss(reduction="mean", gamma=1.7),
    "Mistake_Location": FocalLoss(reduction="mean", gamma = 1.7),
    "Pedagogical_Guidance": FocalLoss(reduction="mean", gamma = 1.7),
    "Actionability": FocalLoss(reduction="mean", gamma = 1.7)
}

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train(model5, train_loader, epochs=10, optimizer=optimizer, loss_fun_dict=loss_fn_dict)

Training Epoch 1: 100%|██████████| 140/140 [01:29<00:00,  1.56it/s]


Epoch 1 Loss: 1.6746


Training Epoch 2: 100%|██████████| 140/140 [01:33<00:00,  1.49it/s]


Epoch 2 Loss: 1.5512


Training Epoch 3: 100%|██████████| 140/140 [01:33<00:00,  1.49it/s]


Epoch 3 Loss: 1.4892


Training Epoch 4: 100%|██████████| 140/140 [01:33<00:00,  1.49it/s]


Epoch 4 Loss: 1.4461


Training Epoch 5: 100%|██████████| 140/140 [01:33<00:00,  1.49it/s]


Epoch 5 Loss: 1.4120


Training Epoch 6: 100%|██████████| 140/140 [01:33<00:00,  1.49it/s]


Epoch 6 Loss: 1.3787


Training Epoch 7: 100%|██████████| 140/140 [01:33<00:00,  1.49it/s]


Epoch 7 Loss: 1.3588


Training Epoch 8: 100%|██████████| 140/140 [01:34<00:00,  1.49it/s]


Epoch 8 Loss: 1.3350


Training Epoch 9: 100%|██████████| 140/140 [01:34<00:00,  1.49it/s]


Epoch 9 Loss: 1.3148


Training Epoch 10: 100%|██████████| 140/140 [01:33<00:00,  1.49it/s]

Epoch 10 Loss: 1.3024





In [None]:
## Roberta with FL no alpha exact
val_loader = DataLoader(test_dataset, batch_size=16)
results = compute_metrics(model5, val_loader, device)
for task, metrics in results.items():
    print(f"{task}: Accuracy = {metrics['accuracy']}, Macro F1 = {metrics['macro_f1']}")

Mistake_Identification: Accuracy = 0.7944, Macro F1 = 0.4711
Mistake_Location: Accuracy = 0.6653, Macro F1 = 0.4289
Pedagogical_Guidance: Accuracy = 0.6089, Macro F1 = 0.519
Actionability: Accuracy = 0.5726, Macro F1 = 0.3855
