In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-assg3/assignment_3_ai_tutors_dataset.json


In [2]:
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1

Collecting imbalanced-learn==0.10.1
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.13.0
    Uninstalling imbalanced-learn-0.13.0:
      Successfully uninstalled imbalanced-learn-0.13.0
Successfully installed imbalanced-learn-0.10.1


In [3]:
from imblearn.over_sampling import SMOTE
import json
import pandas as pd
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer

In [4]:
# ---------- Dataset Definition ----------
class TutorEvalSingleTaskDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

In [5]:
# ---------- Preprocessing ----------
def load_and_flatten(json_path):
    with open(json_path) as f:
        data = json.load(f)

    rows = []
    for instance in data:
        convo_id = instance["conversation_id"]
        history = instance["conversation_history"]
        for tutor_id, tutor_data in instance["tutor_responses"].items():
            row = {
                "conversation_id": convo_id,
                "tutor_id": tutor_id,
                "conversation_history": history,
                "tutor_response": tutor_data["response"],
                "Mistake_Identification": tutor_data["annotation"]["Mistake_Identification"],
                "Mistake_Location": tutor_data["annotation"]["Mistake_Location"],
                "Pedagogical_Guidance": tutor_data["annotation"]["Providing_Guidance"],
                "Actionability": tutor_data["annotation"]["Actionability"]
            }
            rows.append(row)
    return pd.DataFrame(rows)

In [6]:
def build_input_text(row):
    return f"Context:\n{row['conversation_history']}\n\nTutor Response:\n{row['tutor_response']}"

LABEL_MAP = {"Yes": 0, "To some extent": 1, "No": 2}
MERGED_LABEL_MAP = {"Yes": 1, "To some extent": 1, "No": 0}

def encode_labels(df):
    for task in ["Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability"]:
        df[f"{task}_label"] = df[task].map(LABEL_MAP)
        df[f"{task}_binary"] = df[task].map(MERGED_LABEL_MAP)
    return df

def tokenize_inputs(tokenizer, texts, max_length=256):
    return tokenizer(
        texts,
        add_special_tokens=True,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )

In [7]:
def preprocess_dataset(json_path, task_label, mode=None, max_length=256):
    df = load_and_flatten(json_path)
    df["input_text"] = df.apply(build_input_text, axis=1)
    df = encode_labels(df)

    train_df, val_df = train_test_split(df, test_size=0.1, stratify=df[task_label], random_state=42)

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base",do_lower_case=True)
    train_enc = tokenize_inputs(tokenizer, train_df["input_text"].tolist(), max_length=max_length)
    val_enc = tokenize_inputs(tokenizer, val_df["input_text"].tolist(), max_length=max_length)

    train_labels = torch.tensor(train_df[task_label].tolist())
    val_labels = torch.tensor(val_df[task_label].tolist())

    if mode == 'balanced':
        # Perform SMOTE oversampling
        smote = SMOTE(random_state=42)
        
        X1 = train_enc['input_ids']
        X2 = train_enc['attention_mask']
        X = torch.cat((X1, X2), dim=1)  # Combine input_ids and attention_mask for SMOTE
        y = train_labels.numpy()

        X_resampled, y_resampled = smote.fit_resample(X, y)

        # Split back input_ids and attention_mask after SMOTE
        seq_len = X1.shape[1]
        train_enc['input_ids'] = torch.tensor(X_resampled[:, :seq_len])
        train_enc['attention_mask'] = torch.tensor(X_resampled[:, seq_len:])
        train_labels = torch.tensor(y_resampled)

    train_dataset = TutorEvalSingleTaskDataset(train_enc, train_labels)
    val_dataset = TutorEvalSingleTaskDataset(val_enc, val_labels)

    return train_dataset, val_dataset, tokenizer, df

In [8]:
file_path="/kaggle/input/nlp-assg3/assignment_3_ai_tutors_dataset.json"
df=load_and_flatten(file_path)
df.head()

Unnamed: 0,conversation_id,tutor_id,conversation_history,tutor_response,Mistake_Identification,Mistake_Location,Pedagogical_Guidance,Actionability
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Sonnet,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Yes,Yes,Yes,Yes
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Llama318B,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Yes,To some extent,To some extent,To some extent
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Llama31405B,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Yes,Yes,Yes,Yes
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,GPT4,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",Yes,Yes,Yes,Yes
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Mistral,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Yes,Yes,Yes,Yes


In [9]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW  
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

2025-04-27 12:50:51.811109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745758252.038347      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745758252.105726      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
from transformers import RobertaForSequenceClassification
import torch.nn as nn

class SingleTaskRobertaClassifier(nn.Module):
    def __init__(self, num_labels):
        super(SingleTaskRobertaClassifier, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [12]:
# ---------- Training ----------
def evaluate_model(model, val_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels)

    acc = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    print(f"Validation Accuracy: {acc:.4f}  Validation Macro F1: {macro_f1:.4f}")
    print(classification_report(all_labels, all_preds, target_names=["Yes", "To some extent", "No"], zero_division=0))

def train_model(loss_type, train_loader, val_loader, num_labels, epochs=10):
    print(f"\n Training with: {loss_type.upper()} Loss\n")

    model = SingleTaskRobertaClassifier(num_labels).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    #train_labels_list = [label.item() for batch in train_loader for label in batch['labels']]
    #class_weights = get_class_weights(train_labels_list, num_labels)

    if loss_type == "focal":
        criterion = FocalLossWithWeights()
    elif loss_type == "smoothing":
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    else:
        criterion = nn.CrossEntropyLoss(weight=None)


    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")
        evaluate_model(model, val_loader)



In [13]:
import warnings

warnings.filterwarnings("ignore", message="Some weights of RobertaForSequenceClassification were not initialized.*")


In [14]:
from transformers import logging

logging.set_verbosity_error()  # suppress all warnings and info logs from transformers


In [15]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import json
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### TAsk 1

In [16]:
# ---------- Run ----------
json_path ="/kaggle/input/nlp-assg3/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Mistake_Identification_label", mode = 'balanced')
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
train_model("ce", train_loader, val_loader, num_labels=3)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]


 Training with: CE Loss



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Epoch 1/10: 100%|██████████| 1304/1304 [04:31<00:00,  4.80it/s]


Epoch 1 Loss: 0.8186
Validation Accuracy: 0.7863  Validation Macro F1: 0.4659
                precision    recall  f1-score   support

           Yes       0.81      0.95      0.88       194
To some extent       0.67      0.12      0.20        17
            No       0.47      0.24      0.32        37

      accuracy                           0.79       248
     macro avg       0.65      0.44      0.47       248
  weighted avg       0.75      0.79      0.75       248



Epoch 2/10: 100%|██████████| 1304/1304 [04:37<00:00,  4.70it/s]


Epoch 2 Loss: 0.7246
Validation Accuracy: 0.7581  Validation Macro F1: 0.4205
                precision    recall  f1-score   support

           Yes       0.83      0.91      0.87       194
To some extent       0.20      0.06      0.09        17
            No       0.34      0.27      0.30        37

      accuracy                           0.76       248
     macro avg       0.46      0.41      0.42       248
  weighted avg       0.71      0.76      0.73       248



Epoch 3/10: 100%|██████████| 1304/1304 [04:37<00:00,  4.70it/s]


Epoch 3 Loss: 0.6747
Validation Accuracy: 0.7460  Validation Macro F1: 0.4593
                precision    recall  f1-score   support

           Yes       0.83      0.89      0.86       194
To some extent       0.25      0.41      0.31        17
            No       0.45      0.14      0.21        37

      accuracy                           0.75       248
     macro avg       0.51      0.48      0.46       248
  weighted avg       0.73      0.75      0.72       248



Epoch 4/10: 100%|██████████| 1304/1304 [04:37<00:00,  4.70it/s]


Epoch 4 Loss: 0.6376
Validation Accuracy: 0.7258  Validation Macro F1: 0.4469
                precision    recall  f1-score   support

           Yes       0.83      0.86      0.85       194
To some extent       0.17      0.24      0.20        17
            No       0.38      0.24      0.30        37

      accuracy                           0.73       248
     macro avg       0.46      0.45      0.45       248
  weighted avg       0.72      0.73      0.72       248



Epoch 5/10: 100%|██████████| 1304/1304 [04:37<00:00,  4.70it/s]


Epoch 5 Loss: 0.6265
Validation Accuracy: 0.7218  Validation Macro F1: 0.4215
                precision    recall  f1-score   support

           Yes       0.83      0.87      0.85       194
To some extent       0.15      0.18      0.16        17
            No       0.31      0.22      0.25        37

      accuracy                           0.72       248
     macro avg       0.43      0.42      0.42       248
  weighted avg       0.71      0.72      0.71       248



Epoch 6/10: 100%|██████████| 1304/1304 [04:37<00:00,  4.70it/s]


Epoch 6 Loss: 0.6188
Validation Accuracy: 0.7097  Validation Macro F1: 0.4639
                precision    recall  f1-score   support

           Yes       0.83      0.82      0.83       194
To some extent       0.21      0.29      0.24        17
            No       0.34      0.30      0.32        37

      accuracy                           0.71       248
     macro avg       0.46      0.47      0.46       248
  weighted avg       0.72      0.71      0.71       248



Epoch 7/10: 100%|██████████| 1304/1304 [04:37<00:00,  4.70it/s]


Epoch 7 Loss: 0.5988
Validation Accuracy: 0.7460  Validation Macro F1: 0.5079
                precision    recall  f1-score   support

           Yes       0.84      0.87      0.85       194
To some extent       0.30      0.47      0.36        17
            No       0.41      0.24      0.31        37

      accuracy                           0.75       248
     macro avg       0.52      0.53      0.51       248
  weighted avg       0.74      0.75      0.74       248



Epoch 8/10: 100%|██████████| 1304/1304 [04:37<00:00,  4.70it/s]


Epoch 8 Loss: 0.5873
Validation Accuracy: 0.7379  Validation Macro F1: 0.4747
                precision    recall  f1-score   support

           Yes       0.84      0.87      0.85       194
To some extent       0.20      0.29      0.24        17
            No       0.43      0.27      0.33        37

      accuracy                           0.74       248
     macro avg       0.49      0.48      0.47       248
  weighted avg       0.74      0.74      0.73       248



Epoch 9/10: 100%|██████████| 1304/1304 [04:37<00:00,  4.70it/s]


Epoch 9 Loss: 0.5732
Validation Accuracy: 0.7460  Validation Macro F1: 0.4644
                precision    recall  f1-score   support

           Yes       0.83      0.89      0.86       194
To some extent       0.25      0.24      0.24        17
            No       0.38      0.24      0.30        37

      accuracy                           0.75       248
     macro avg       0.48      0.46      0.46       248
  weighted avg       0.72      0.75      0.73       248



Epoch 10/10: 100%|██████████| 1304/1304 [04:37<00:00,  4.70it/s]


Epoch 10 Loss: 0.5792
Validation Accuracy: 0.7339  Validation Macro F1: 0.4993
                precision    recall  f1-score   support

           Yes       0.84      0.85      0.84       194
To some extent       0.29      0.41      0.34        17
            No       0.37      0.27      0.31        37

      accuracy                           0.73       248
     macro avg       0.50      0.51      0.50       248
  weighted avg       0.73      0.73      0.73       248



In [17]:
# ---------- Run ----------
json_path ="/kaggle/input/nlp-assg3/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Mistake_Location_label", mode = 'balanced')
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
train_model("ce", train_loader, val_loader, num_labels=3)


 Training with: CE Loss



Epoch 1/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.69it/s]


Epoch 1 Loss: 0.8410
Validation Accuracy: 0.4879  Validation Macro F1: 0.4038
                precision    recall  f1-score   support

           Yes       0.73      0.48      0.58       155
To some extent       0.15      0.23      0.18        22
            No       0.37      0.58      0.45        71

      accuracy                           0.49       248
     macro avg       0.42      0.43      0.40       248
  weighted avg       0.57      0.49      0.51       248



Epoch 2/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.70it/s]


Epoch 2 Loss: 0.7377
Validation Accuracy: 0.5645  Validation Macro F1: 0.4341
                precision    recall  f1-score   support

           Yes       0.69      0.66      0.68       155
To some extent       0.14      0.18      0.16        22
            No       0.46      0.46      0.46        71

      accuracy                           0.56       248
     macro avg       0.43      0.44      0.43       248
  weighted avg       0.58      0.56      0.57       248



Epoch 3/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.69it/s]


Epoch 3 Loss: 0.6794
Validation Accuracy: 0.5645  Validation Macro F1: 0.4020
                precision    recall  f1-score   support

           Yes       0.67      0.75      0.71       155
To some extent       0.13      0.18      0.15        22
            No       0.44      0.28      0.34        71

      accuracy                           0.56       248
     macro avg       0.42      0.40      0.40       248
  weighted avg       0.56      0.56      0.55       248



Epoch 4/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.69it/s]


Epoch 4 Loss: 0.6573
Validation Accuracy: 0.5323  Validation Macro F1: 0.4470
                precision    recall  f1-score   support

           Yes       0.71      0.60      0.65       155
To some extent       0.21      0.41      0.28        22
            No       0.41      0.42      0.41        71

      accuracy                           0.53       248
     macro avg       0.44      0.48      0.45       248
  weighted avg       0.58      0.53      0.55       248



Epoch 5/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.70it/s]


Epoch 5 Loss: 0.6318
Validation Accuracy: 0.5484  Validation Macro F1: 0.4142
                precision    recall  f1-score   support

           Yes       0.66      0.68      0.67       155
To some extent       0.19      0.18      0.19        22
            No       0.39      0.38      0.39        71

      accuracy                           0.55       248
     macro avg       0.42      0.41      0.41       248
  weighted avg       0.54      0.55      0.55       248



Epoch 6/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.70it/s]


Epoch 6 Loss: 0.6032
Validation Accuracy: 0.5282  Validation Macro F1: 0.4275
                precision    recall  f1-score   support

           Yes       0.71      0.54      0.61       155
To some extent       0.19      0.18      0.19        22
            No       0.40      0.62      0.49        71

      accuracy                           0.53       248
     macro avg       0.43      0.45      0.43       248
  weighted avg       0.57      0.53      0.54       248



Epoch 7/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.69it/s]


Epoch 7 Loss: 0.5679
Validation Accuracy: 0.5524  Validation Macro F1: 0.4729
                precision    recall  f1-score   support

           Yes       0.73      0.60      0.66       155
To some extent       0.25      0.41      0.31        22
            No       0.41      0.49      0.45        71

      accuracy                           0.55       248
     macro avg       0.46      0.50      0.47       248
  weighted avg       0.60      0.55      0.57       248



Epoch 8/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.69it/s]


Epoch 8 Loss: 0.5308
Validation Accuracy: 0.5847  Validation Macro F1: 0.4294
                precision    recall  f1-score   support

           Yes       0.67      0.75      0.71       155
To some extent       0.21      0.18      0.20        22
            No       0.44      0.34      0.38        71

      accuracy                           0.58       248
     macro avg       0.44      0.42      0.43       248
  weighted avg       0.56      0.58      0.57       248



Epoch 9/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.70it/s]


Epoch 9 Loss: 0.4912
Validation Accuracy: 0.5847  Validation Macro F1: 0.4564
                precision    recall  f1-score   support

           Yes       0.71      0.75      0.73       155
To some extent       0.24      0.41      0.30        22
            No       0.43      0.28      0.34        71

      accuracy                           0.58       248
     macro avg       0.46      0.48      0.46       248
  weighted avg       0.59      0.58      0.58       248



Epoch 10/10: 100%|██████████| 1041/1041 [03:41<00:00,  4.69it/s]


Epoch 10 Loss: 0.4620
Validation Accuracy: 0.5363  Validation Macro F1: 0.4354
                precision    recall  f1-score   support

           Yes       0.70      0.61      0.65       155
To some extent       0.19      0.27      0.23        22
            No       0.40      0.46      0.43        71

      accuracy                           0.54       248
     macro avg       0.43      0.45      0.44       248
  weighted avg       0.57      0.54      0.55       248



In [18]:
# ---------- Run ----------
json_path ="/kaggle/input/nlp-assg3/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Pedagogical_Guidance_label", mode = 'balanced')
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
train_model("ce", train_loader, val_loader, num_labels=3)


 Training with: CE Loss



Epoch 1/10: 100%|██████████| 950/950 [03:22<00:00,  4.69it/s]


Epoch 1 Loss: 0.9701
Validation Accuracy: 0.5323  Validation Macro F1: 0.4456
                precision    recall  f1-score   support

           Yes       0.62      0.68      0.65       141
To some extent       0.32      0.52      0.39        50
            No       0.91      0.18      0.29        57

      accuracy                           0.53       248
     macro avg       0.62      0.46      0.45       248
  weighted avg       0.63      0.53      0.52       248



Epoch 2/10: 100%|██████████| 950/950 [03:22<00:00,  4.70it/s]


Epoch 2 Loss: 0.8646
Validation Accuracy: 0.5927  Validation Macro F1: 0.4021
                precision    recall  f1-score   support

           Yes       0.60      0.92      0.73       141
To some extent       0.31      0.10      0.15        50
            No       0.71      0.21      0.32        57

      accuracy                           0.59       248
     macro avg       0.54      0.41      0.40       248
  weighted avg       0.57      0.59      0.52       248



Epoch 3/10: 100%|██████████| 950/950 [03:22<00:00,  4.70it/s]


Epoch 3 Loss: 0.8074
Validation Accuracy: 0.6331  Validation Macro F1: 0.5237
                precision    recall  f1-score   support

           Yes       0.66      0.86      0.75       141
To some extent       0.48      0.44      0.46        50
            No       0.70      0.25      0.36        57

      accuracy                           0.63       248
     macro avg       0.61      0.51      0.52       248
  weighted avg       0.64      0.63      0.60       248



Epoch 4/10: 100%|██████████| 950/950 [03:22<00:00,  4.70it/s]


Epoch 4 Loss: 0.7660
Validation Accuracy: 0.5766  Validation Macro F1: 0.5002
                precision    recall  f1-score   support

           Yes       0.65      0.73      0.69       141
To some extent       0.40      0.42      0.41        50
            No       0.50      0.33      0.40        57

      accuracy                           0.58       248
     macro avg       0.52      0.49      0.50       248
  weighted avg       0.57      0.58      0.57       248



Epoch 5/10: 100%|██████████| 950/950 [03:22<00:00,  4.70it/s]


Epoch 5 Loss: 0.7329
Validation Accuracy: 0.5524  Validation Macro F1: 0.4725
                precision    recall  f1-score   support

           Yes       0.64      0.71      0.67       141
To some extent       0.39      0.34      0.36        50
            No       0.43      0.35      0.38        57

      accuracy                           0.55       248
     macro avg       0.48      0.47      0.47       248
  weighted avg       0.54      0.55      0.54       248



Epoch 6/10: 100%|██████████| 950/950 [03:22<00:00,  4.70it/s]


Epoch 6 Loss: 0.7131
Validation Accuracy: 0.5605  Validation Macro F1: 0.4674
                precision    recall  f1-score   support

           Yes       0.64      0.74      0.69       141
To some extent       0.32      0.30      0.31        50
            No       0.51      0.33      0.40        57

      accuracy                           0.56       248
     macro avg       0.49      0.46      0.47       248
  weighted avg       0.55      0.56      0.55       248



Epoch 7/10: 100%|██████████| 950/950 [03:22<00:00,  4.70it/s]


Epoch 7 Loss: 0.7054
Validation Accuracy: 0.5766  Validation Macro F1: 0.4809
                precision    recall  f1-score   support

           Yes       0.65      0.76      0.70       141
To some extent       0.35      0.26      0.30        50
            No       0.49      0.40      0.44        57

      accuracy                           0.58       248
     macro avg       0.50      0.47      0.48       248
  weighted avg       0.55      0.58      0.56       248



Epoch 8/10: 100%|██████████| 950/950 [03:22<00:00,  4.70it/s]


Epoch 8 Loss: 0.6732
Validation Accuracy: 0.5565  Validation Macro F1: 0.4787
                precision    recall  f1-score   support

           Yes       0.64      0.71      0.67       141
To some extent       0.35      0.34      0.35        50
            No       0.48      0.37      0.42        57

      accuracy                           0.56       248
     macro avg       0.49      0.47      0.48       248
  weighted avg       0.55      0.56      0.55       248



Epoch 9/10: 100%|██████████| 950/950 [03:22<00:00,  4.69it/s]


Epoch 9 Loss: 0.6580
Validation Accuracy: 0.5766  Validation Macro F1: 0.4879
                precision    recall  f1-score   support

           Yes       0.65      0.75      0.70       141
To some extent       0.31      0.28      0.29        50
            No       0.56      0.40      0.47        57

      accuracy                           0.58       248
     macro avg       0.51      0.48      0.49       248
  weighted avg       0.56      0.58      0.57       248



Epoch 10/10: 100%|██████████| 950/950 [03:22<00:00,  4.70it/s]


Epoch 10 Loss: 0.6431
Validation Accuracy: 0.5927  Validation Macro F1: 0.5034
                precision    recall  f1-score   support

           Yes       0.66      0.77      0.71       141
To some extent       0.40      0.32      0.36        50
            No       0.52      0.39      0.44        57

      accuracy                           0.59       248
     macro avg       0.53      0.49      0.50       248
  weighted avg       0.57      0.59      0.58       248



In [19]:
# ---------- Run ----------
json_path ="/kaggle/input/nlp-assg3/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Actionability_label", mode = 'balanced')
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
train_model("ce", train_loader, val_loader, num_labels=3)


 Training with: CE Loss



Epoch 1/10: 100%|██████████| 885/885 [03:08<00:00,  4.70it/s]


Epoch 1 Loss: 0.9438
Validation Accuracy: 0.5282  Validation Macro F1: 0.2304
                precision    recall  f1-score   support

           Yes       0.53      1.00      0.69       131
To some extent       0.00      0.00      0.00        37
            No       0.00      0.00      0.00        80

      accuracy                           0.53       248
     macro avg       0.18      0.33      0.23       248
  weighted avg       0.28      0.53      0.37       248



Epoch 2/10: 100%|██████████| 885/885 [03:07<00:00,  4.71it/s]


Epoch 2 Loss: 0.9232
Validation Accuracy: 0.5605  Validation Macro F1: 0.2966
                precision    recall  f1-score   support

           Yes       0.55      1.00      0.71       131
To some extent       0.00      0.00      0.00        37
            No       0.89      0.10      0.18        80

      accuracy                           0.56       248
     macro avg       0.48      0.37      0.30       248
  weighted avg       0.58      0.56      0.43       248



Epoch 3/10: 100%|██████████| 885/885 [03:07<00:00,  4.71it/s]


Epoch 3 Loss: 0.9153
Validation Accuracy: 0.5444  Validation Macro F1: 0.3170
                precision    recall  f1-score   support

           Yes       0.55      0.92      0.69       131
To some extent       0.00      0.00      0.00        37
            No       0.52      0.17      0.26        80

      accuracy                           0.54       248
     macro avg       0.36      0.37      0.32       248
  weighted avg       0.46      0.54      0.45       248



Epoch 4/10: 100%|██████████| 885/885 [03:07<00:00,  4.71it/s]


Epoch 4 Loss: 0.9093
Validation Accuracy: 0.5685  Validation Macro F1: 0.3209
                precision    recall  f1-score   support

           Yes       0.56      0.98      0.71       131
To some extent       0.00      0.00      0.00        37
            No       0.75      0.15      0.25        80

      accuracy                           0.57       248
     macro avg       0.44      0.38      0.32       248
  weighted avg       0.54      0.57      0.46       248



Epoch 5/10: 100%|██████████| 885/885 [03:08<00:00,  4.71it/s]


Epoch 5 Loss: 0.9013
Validation Accuracy: 0.6129  Validation Macro F1: 0.4150
                precision    recall  f1-score   support

           Yes       0.59      0.95      0.73       131
To some extent       0.33      0.03      0.05        37
            No       0.75      0.34      0.47        80

      accuracy                           0.61       248
     macro avg       0.56      0.44      0.41       248
  weighted avg       0.61      0.61      0.54       248



Epoch 6/10: 100%|██████████| 885/885 [03:07<00:00,  4.71it/s]


Epoch 6 Loss: 0.8509
Validation Accuracy: 0.6290  Validation Macro F1: 0.4632
                precision    recall  f1-score   support

           Yes       0.61      0.95      0.75       131
To some extent       0.50      0.11      0.18        37
            No       0.75      0.34      0.47        80

      accuracy                           0.63       248
     macro avg       0.62      0.47      0.46       248
  weighted avg       0.64      0.63      0.57       248



Epoch 7/10: 100%|██████████| 885/885 [03:08<00:00,  4.70it/s]


Epoch 7 Loss: 0.8410
Validation Accuracy: 0.5887  Validation Macro F1: 0.3443
                precision    recall  f1-score   support

           Yes       0.57      1.00      0.73       131
To some extent       0.00      0.00      0.00        37
            No       0.79      0.19      0.30        80

      accuracy                           0.59       248
     macro avg       0.45      0.40      0.34       248
  weighted avg       0.56      0.59      0.48       248



Epoch 8/10: 100%|██████████| 885/885 [03:08<00:00,  4.70it/s]


Epoch 8 Loss: 0.8145
Validation Accuracy: 0.5685  Validation Macro F1: 0.4669
                precision    recall  f1-score   support

           Yes       0.61      0.80      0.69       131
To some extent       0.29      0.27      0.28        37
            No       0.63      0.33      0.43        80

      accuracy                           0.57       248
     macro avg       0.51      0.47      0.47       248
  weighted avg       0.57      0.57      0.55       248



Epoch 9/10: 100%|██████████| 885/885 [03:08<00:00,  4.71it/s]


Epoch 9 Loss: 0.7972
Validation Accuracy: 0.6169  Validation Macro F1: 0.5032
                precision    recall  f1-score   support

           Yes       0.64      0.88      0.74       131
To some extent       0.45      0.24      0.32        37
            No       0.62      0.36      0.46        80

      accuracy                           0.62       248
     macro avg       0.57      0.49      0.50       248
  weighted avg       0.60      0.62      0.58       248



Epoch 10/10: 100%|██████████| 885/885 [03:08<00:00,  4.71it/s]


Epoch 10 Loss: 0.8147
Validation Accuracy: 0.6250  Validation Macro F1: 0.5208
                precision    recall  f1-score   support

           Yes       0.62      0.91      0.73       131
To some extent       0.45      0.35      0.39        37
            No       0.88      0.29      0.43        80

      accuracy                           0.62       248
     macro avg       0.65      0.52      0.52       248
  weighted avg       0.68      0.62      0.59       248

