In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json


In [2]:
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1

Collecting imbalanced-learn==0.10.1
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.13.0
    Uninstalling imbalanced-learn-0.13.0:
      Successfully uninstalled imbalanced-learn-0.13.0
Successfully installed imbalanced-learn-0.10.1


In [3]:
from imblearn.over_sampling import SMOTE
import json
import pandas as pd
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer


In [4]:
# ---------- Dataset Definition ----------
class TutorEvalSingleTaskDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

In [5]:
# ---------- Preprocessing ----------
def load_and_flatten(json_path):
    with open(json_path) as f:
        data = json.load(f)

    rows = []
    for instance in data:
        convo_id = instance["conversation_id"]
        history = instance["conversation_history"]
        for tutor_id, tutor_data in instance["tutor_responses"].items():
            row = {
                "conversation_id": convo_id,
                "tutor_id": tutor_id,
                "conversation_history": history,
                "tutor_response": tutor_data["response"],
                "Mistake_Identification": tutor_data["annotation"]["Mistake_Identification"],
                "Mistake_Location": tutor_data["annotation"]["Mistake_Location"],
                "Pedagogical_Guidance": tutor_data["annotation"]["Providing_Guidance"],
                "Actionability": tutor_data["annotation"]["Actionability"]
            }
            rows.append(row)
    return pd.DataFrame(rows)

In [6]:
def build_input_text(row):
    return f"Context:\n{row['conversation_history']}\n\nTutor Response:\n{row['tutor_response']}"

LABEL_MAP = {"Yes": 0, "To some extent": 1, "No": 2}
MERGED_LABEL_MAP = {"Yes": 1, "To some extent": 1, "No": 0}

def encode_labels(df):
    for task in ["Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability"]:
        df[f"{task}_label"] = df[task].map(LABEL_MAP)
        df[f"{task}_binary"] = df[task].map(MERGED_LABEL_MAP)
    return df

def tokenize_inputs(tokenizer, texts, max_length=256):
    return tokenizer(
        texts,
        add_special_tokens=True,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )

In [7]:
def preprocess_dataset(json_path, task_label, mode=None, max_length=256):
    df = load_and_flatten(json_path)
    df["input_text"] = df.apply(build_input_text, axis=1)
    df = encode_labels(df)

    train_df, val_df = train_test_split(df, test_size=0.1, stratify=df[task_label], random_state=42)

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base",do_lower_case=True)
    train_enc = tokenize_inputs(tokenizer, train_df["input_text"].tolist(), max_length=max_length)
    val_enc = tokenize_inputs(tokenizer, val_df["input_text"].tolist(), max_length=max_length)

    train_labels = torch.tensor(train_df[task_label].tolist())
    val_labels = torch.tensor(val_df[task_label].tolist())

    if mode == 'balanced':
        # Perform SMOTE oversampling
        smote = SMOTE(random_state=42)
        
        X1 = train_enc['input_ids']
        X2 = train_enc['attention_mask']
        X = torch.cat((X1, X2), dim=1)  # Combine input_ids and attention_mask for SMOTE
        y = train_labels.numpy()

        X_resampled, y_resampled = smote.fit_resample(X, y)

        # Split back input_ids and attention_mask after SMOTE
        seq_len = X1.shape[1]
        train_enc['input_ids'] = torch.tensor(X_resampled[:, :seq_len])
        train_enc['attention_mask'] = torch.tensor(X_resampled[:, seq_len:])
        train_labels = torch.tensor(y_resampled)

    train_dataset = TutorEvalSingleTaskDataset(train_enc, train_labels)
    val_dataset = TutorEvalSingleTaskDataset(val_enc, val_labels)

    return train_dataset, val_dataset, tokenizer, df

In [8]:
file_path="/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
df=load_and_flatten(file_path)
df.head()

Unnamed: 0,conversation_id,tutor_id,conversation_history,tutor_response,Mistake_Identification,Mistake_Location,Pedagogical_Guidance,Actionability
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Sonnet,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Yes,Yes,Yes,Yes
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Llama318B,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Yes,To some extent,To some extent,To some extent
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Llama31405B,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Yes,Yes,Yes,Yes
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,GPT4,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",Yes,Yes,Yes,Yes
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Mistral,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Yes,Yes,Yes,Yes


In [9]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW  
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

2025-04-27 12:47:04.384001: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745758024.623185      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745758024.692937      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
from transformers import RobertaForSequenceClassification
import torch.nn as nn

class SingleTaskRobertaClassifier(nn.Module):
    def __init__(self, num_labels):
        super(SingleTaskRobertaClassifier, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [12]:
# ---------- Training ----------
def evaluate_model(model, val_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels)

    acc = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    print(f"Validation Accuracy: {acc:.4f}  Validation Macro F1: {macro_f1:.4f}")
    print(classification_report(all_labels, all_preds, target_names=["No","Yes and To some extent"], zero_division=0))

def train_model(loss_type, train_loader, val_loader, num_labels, epochs=10):
    print(f"\n Training with: {loss_type.upper()} Loss\n")

    model = SingleTaskRobertaClassifier(num_labels).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    #train_labels_list = [label.item() for batch in train_loader for label in batch['labels']]
    #class_weights = get_class_weights(train_labels_list, num_labels)

    if loss_type == "focal":
        criterion = FocalLossWithWeights()
    elif loss_type == "smoothing":
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    else:
        criterion = nn.CrossEntropyLoss(weight=None)


    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")
        evaluate_model(model, val_loader)



In [13]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import json
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import RobertaTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
# ---------- Run ----------
json_path ="/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Mistake_Identification_binary", mode = 'balanced')
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
train_model("ce", train_loader, val_loader, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]


 Training with: CE Loss



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 948/948 [03:22<00:00,  4.67it/s]


Epoch 1 Loss: 0.3550
Validation Accuracy: 0.8508  Validation Macro F1: 0.4597
                        precision    recall  f1-score   support

                    No       0.00      0.00      0.00        37
Yes and To some extent       0.85      1.00      0.92       211

              accuracy                           0.85       248
             macro avg       0.43      0.50      0.46       248
          weighted avg       0.72      0.85      0.78       248



Epoch 2/10: 100%|██████████| 948/948 [03:30<00:00,  4.50it/s]


Epoch 2 Loss: 0.3145
Validation Accuracy: 0.8266  Validation Macro F1: 0.5738
                        precision    recall  f1-score   support

                    No       0.35      0.19      0.25        37
Yes and To some extent       0.87      0.94      0.90       211

              accuracy                           0.83       248
             macro avg       0.61      0.56      0.57       248
          weighted avg       0.79      0.83      0.80       248



Epoch 3/10: 100%|██████████| 948/948 [03:30<00:00,  4.50it/s]


Epoch 3 Loss: 0.2980
Validation Accuracy: 0.8226  Validation Macro F1: 0.5107
                        precision    recall  f1-score   support

                    No       0.23      0.08      0.12        37
Yes and To some extent       0.86      0.95      0.90       211

              accuracy                           0.82       248
             macro avg       0.54      0.52      0.51       248
          weighted avg       0.76      0.82      0.78       248



Epoch 4/10: 100%|██████████| 948/948 [03:30<00:00,  4.50it/s]


Epoch 4 Loss: 0.2995
Validation Accuracy: 0.8387  Validation Macro F1: 0.5387
                        precision    recall  f1-score   support

                    No       0.36      0.11      0.17        37
Yes and To some extent       0.86      0.97      0.91       211

              accuracy                           0.84       248
             macro avg       0.61      0.54      0.54       248
          weighted avg       0.79      0.84      0.80       248



Epoch 5/10: 100%|██████████| 948/948 [03:30<00:00,  4.50it/s]


Epoch 5 Loss: 0.2842
Validation Accuracy: 0.8306  Validation Macro F1: 0.5329
                        precision    recall  f1-score   support

                    No       0.31      0.11      0.16        37
Yes and To some extent       0.86      0.96      0.91       211

              accuracy                           0.83       248
             macro avg       0.58      0.53      0.53       248
          weighted avg       0.78      0.83      0.79       248



Epoch 6/10: 100%|██████████| 948/948 [03:30<00:00,  4.50it/s]


Epoch 6 Loss: 0.3038
Validation Accuracy: 0.8508  Validation Macro F1: 0.4852
                        precision    recall  f1-score   support

                    No       0.50      0.03      0.05        37
Yes and To some extent       0.85      1.00      0.92       211

              accuracy                           0.85       248
             macro avg       0.68      0.51      0.49       248
          weighted avg       0.80      0.85      0.79       248



Epoch 7/10: 100%|██████████| 948/948 [03:30<00:00,  4.50it/s]


Epoch 7 Loss: 0.3210
Validation Accuracy: 0.8185  Validation Macro F1: 0.5540
                        precision    recall  f1-score   support

                    No       0.30      0.16      0.21        37
Yes and To some extent       0.86      0.93      0.90       211

              accuracy                           0.82       248
             macro avg       0.58      0.55      0.55       248
          weighted avg       0.78      0.82      0.80       248



Epoch 8/10: 100%|██████████| 948/948 [03:30<00:00,  4.50it/s]


Epoch 8 Loss: 0.2673
Validation Accuracy: 0.8266  Validation Macro F1: 0.5458
                        precision    recall  f1-score   support

                    No       0.31      0.14      0.19        37
Yes and To some extent       0.86      0.95      0.90       211

              accuracy                           0.83       248
             macro avg       0.59      0.54      0.55       248
          weighted avg       0.78      0.83      0.80       248



Epoch 9/10: 100%|██████████| 948/948 [03:30<00:00,  4.50it/s]


Epoch 9 Loss: 0.2593
Validation Accuracy: 0.8226  Validation Macro F1: 0.5571
                        precision    recall  f1-score   support

                    No       0.32      0.16      0.21        37
Yes and To some extent       0.86      0.94      0.90       211

              accuracy                           0.82       248
             macro avg       0.59      0.55      0.56       248
          weighted avg       0.78      0.82      0.80       248



Epoch 10/10: 100%|██████████| 948/948 [03:30<00:00,  4.50it/s]


Epoch 10 Loss: 0.2631
Validation Accuracy: 0.8145  Validation Macro F1: 0.5760
                        precision    recall  f1-score   support

                    No       0.32      0.22      0.26        37
Yes and To some extent       0.87      0.92      0.89       211

              accuracy                           0.81       248
             macro avg       0.59      0.57      0.58       248
          weighted avg       0.79      0.81      0.80       248



In [15]:
# ---------- Run ----------
json_path ="/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Mistake_Location_binary", mode = 'balanced')
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
train_model("ce", train_loader, val_loader, num_labels=2)


 Training with: CE Loss



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 793/793 [02:56<00:00,  4.49it/s]


Epoch 1 Loss: 0.5635
Validation Accuracy: 0.7056  Validation Macro F1: 0.5817
                        precision    recall  f1-score   support

                    No       0.48      0.28      0.35        71
Yes and To some extent       0.75      0.88      0.81       177

              accuracy                           0.71       248
             macro avg       0.61      0.58      0.58       248
          weighted avg       0.67      0.71      0.68       248



Epoch 2/10: 100%|██████████| 793/793 [02:56<00:00,  4.49it/s]


Epoch 2 Loss: 0.5337
Validation Accuracy: 0.7258  Validation Macro F1: 0.4600
                        precision    recall  f1-score   support

                    No       1.00      0.04      0.08        71
Yes and To some extent       0.72      1.00      0.84       177

              accuracy                           0.73       248
             macro avg       0.86      0.52      0.46       248
          weighted avg       0.80      0.73      0.62       248



Epoch 3/10: 100%|██████████| 793/793 [02:56<00:00,  4.49it/s]


Epoch 3 Loss: 0.5007
Validation Accuracy: 0.7177  Validation Macro F1: 0.5911
                        precision    recall  f1-score   support

                    No       0.51      0.28      0.36        71
Yes and To some extent       0.76      0.89      0.82       177

              accuracy                           0.72       248
             macro avg       0.63      0.59      0.59       248
          weighted avg       0.69      0.72      0.69       248



Epoch 4/10: 100%|██████████| 793/793 [02:56<00:00,  4.49it/s]


Epoch 4 Loss: 0.4715
Validation Accuracy: 0.7419  Validation Macro F1: 0.6559
                        precision    recall  f1-score   support

                    No       0.57      0.42      0.48        71
Yes and To some extent       0.79      0.87      0.83       177

              accuracy                           0.74       248
             macro avg       0.68      0.65      0.66       248
          weighted avg       0.73      0.74      0.73       248



Epoch 5/10: 100%|██████████| 793/793 [02:56<00:00,  4.49it/s]


Epoch 5 Loss: 0.4617
Validation Accuracy: 0.7500  Validation Macro F1: 0.6281
                        precision    recall  f1-score   support

                    No       0.63      0.31      0.42        71
Yes and To some extent       0.77      0.93      0.84       177

              accuracy                           0.75       248
             macro avg       0.70      0.62      0.63       248
          weighted avg       0.73      0.75      0.72       248



Epoch 6/10: 100%|██████████| 793/793 [02:56<00:00,  4.49it/s]


Epoch 6 Loss: 0.4350
Validation Accuracy: 0.7258  Validation Macro F1: 0.6456
                        precision    recall  f1-score   support

                    No       0.53      0.44      0.48        71
Yes and To some extent       0.79      0.84      0.81       177

              accuracy                           0.73       248
             macro avg       0.66      0.64      0.65       248
          weighted avg       0.71      0.73      0.72       248



Epoch 7/10: 100%|██████████| 793/793 [02:56<00:00,  4.50it/s]


Epoch 7 Loss: 0.4790
Validation Accuracy: 0.7056  Validation Macro F1: 0.5002
                        precision    recall  f1-score   support

                    No       0.44      0.11      0.18        71
Yes and To some extent       0.73      0.94      0.82       177

              accuracy                           0.71       248
             macro avg       0.59      0.53      0.50       248
          weighted avg       0.65      0.71      0.64       248



Epoch 8/10: 100%|██████████| 793/793 [02:56<00:00,  4.49it/s]


Epoch 8 Loss: 0.4613
Validation Accuracy: 0.7137  Validation Macro F1: 0.5932
                        precision    recall  f1-score   support

                    No       0.50      0.30      0.37        71
Yes and To some extent       0.76      0.88      0.81       177

              accuracy                           0.71       248
             macro avg       0.63      0.59      0.59       248
          weighted avg       0.68      0.71      0.69       248



Epoch 9/10: 100%|██████████| 793/793 [02:56<00:00,  4.49it/s]


Epoch 9 Loss: 0.4226
Validation Accuracy: 0.7218  Validation Macro F1: 0.6421
                        precision    recall  f1-score   support

                    No       0.52      0.44      0.47        71
Yes and To some extent       0.79      0.84      0.81       177

              accuracy                           0.72       248
             macro avg       0.65      0.64      0.64       248
          weighted avg       0.71      0.72      0.71       248



Epoch 10/10: 100%|██████████| 793/793 [02:56<00:00,  4.50it/s]


Epoch 10 Loss: 0.4120
Validation Accuracy: 0.7379  Validation Macro F1: 0.6127
                        precision    recall  f1-score   support

                    No       0.58      0.30      0.39        71
Yes and To some extent       0.76      0.92      0.83       177

              accuracy                           0.74       248
             macro avg       0.67      0.61      0.61       248
          weighted avg       0.71      0.74      0.71       248



In [16]:
# ---------- Run ----------
json_path ="/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Pedagogical_Guidance_binary", mode = 'balanced')
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
train_model("ce", train_loader, val_loader, num_labels=2)


 Training with: CE Loss



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 1 Loss: 0.4754
Validation Accuracy: 0.7702  Validation Macro F1: 0.5082
                        precision    recall  f1-score   support

                    No       0.50      0.09      0.15        57
Yes and To some extent       0.78      0.97      0.87       191

              accuracy                           0.77       248
             macro avg       0.64      0.53      0.51       248
          weighted avg       0.72      0.77      0.70       248



Epoch 2/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 2 Loss: 0.4408
Validation Accuracy: 0.7903  Validation Macro F1: 0.5456
                        precision    recall  f1-score   support

                    No       0.78      0.12      0.21        57
Yes and To some extent       0.79      0.99      0.88       191

              accuracy                           0.79       248
             macro avg       0.78      0.56      0.55       248
          weighted avg       0.79      0.79      0.73       248



Epoch 3/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 3 Loss: 0.4173
Validation Accuracy: 0.7863  Validation Macro F1: 0.6090
                        precision    recall  f1-score   support

                    No       0.58      0.25      0.35        57
Yes and To some extent       0.81      0.95      0.87       191

              accuracy                           0.79       248
             macro avg       0.70      0.60      0.61       248
          weighted avg       0.76      0.79      0.75       248



Epoch 4/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 4 Loss: 0.3918
Validation Accuracy: 0.7500  Validation Macro F1: 0.6174
                        precision    recall  f1-score   support

                    No       0.44      0.35      0.39        57
Yes and To some extent       0.82      0.87      0.84       191

              accuracy                           0.75       248
             macro avg       0.63      0.61      0.62       248
          weighted avg       0.73      0.75      0.74       248



Epoch 5/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 5 Loss: 0.3945
Validation Accuracy: 0.7702  Validation Macro F1: 0.5202
                        precision    recall  f1-score   support

                    No       0.50      0.11      0.17        57
Yes and To some extent       0.78      0.97      0.87       191

              accuracy                           0.77       248
             macro avg       0.64      0.54      0.52       248
          weighted avg       0.72      0.77      0.71       248



Epoch 6/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 6 Loss: 0.3779
Validation Accuracy: 0.7944  Validation Macro F1: 0.6080
                        precision    recall  f1-score   support

                    No       0.65      0.23      0.34        57
Yes and To some extent       0.81      0.96      0.88       191

              accuracy                           0.79       248
             macro avg       0.73      0.60      0.61       248
          weighted avg       0.77      0.79      0.75       248



Epoch 7/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 7 Loss: 0.3651
Validation Accuracy: 0.7782  Validation Macro F1: 0.6361
                        precision    recall  f1-score   support

                    No       0.53      0.33      0.41        57
Yes and To some extent       0.82      0.91      0.86       191

              accuracy                           0.78       248
             macro avg       0.67      0.62      0.64       248
          weighted avg       0.75      0.78      0.76       248



Epoch 8/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 8 Loss: 0.3527
Validation Accuracy: 0.7782  Validation Macro F1: 0.6530
                        precision    recall  f1-score   support

                    No       0.52      0.39      0.44        57
Yes and To some extent       0.83      0.90      0.86       191

              accuracy                           0.78       248
             macro avg       0.68      0.64      0.65       248
          weighted avg       0.76      0.78      0.77       248



Epoch 9/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 9 Loss: 0.3374
Validation Accuracy: 0.7823  Validation Macro F1: 0.6130
                        precision    recall  f1-score   support

                    No       0.56      0.26      0.36        57
Yes and To some extent       0.81      0.94      0.87       191

              accuracy                           0.78       248
             macro avg       0.68      0.60      0.61       248
          weighted avg       0.75      0.78      0.75       248



Epoch 10/10: 100%|██████████| 860/860 [03:11<00:00,  4.50it/s]


Epoch 10 Loss: 0.3343
Validation Accuracy: 0.7742  Validation Macro F1: 0.6593
                        precision    recall  f1-score   support

                    No       0.51      0.42      0.46        57
Yes and To some extent       0.84      0.88      0.86       191

              accuracy                           0.77       248
             macro avg       0.67      0.65      0.66       248
          weighted avg       0.76      0.77      0.77       248



In [17]:
# ---------- Run ----------
json_path ="/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Actionability_binary", mode = 'balanced')
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
train_model("ce", train_loader, val_loader, num_labels=2)


 Training with: CE Loss



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 756/756 [02:47<00:00,  4.50it/s]


Epoch 1 Loss: 0.5373
Validation Accuracy: 0.7177  Validation Macro F1: 0.6387
                        precision    recall  f1-score   support

                    No       0.60      0.39      0.47        80
Yes and To some extent       0.75      0.88      0.81       168

              accuracy                           0.72       248
             macro avg       0.67      0.63      0.64       248
          weighted avg       0.70      0.72      0.70       248



Epoch 2/10: 100%|██████████| 756/756 [02:47<00:00,  4.50it/s]


Epoch 2 Loss: 0.4830
Validation Accuracy: 0.7661  Validation Macro F1: 0.6656
                        precision    recall  f1-score   support

                    No       0.84      0.34      0.48        80
Yes and To some extent       0.75      0.97      0.85       168

              accuracy                           0.77       248
             macro avg       0.80      0.65      0.67       248
          weighted avg       0.78      0.77      0.73       248



Epoch 3/10: 100%|██████████| 756/756 [02:47<00:00,  4.50it/s]


Epoch 3 Loss: 0.4618
Validation Accuracy: 0.7460  Validation Macro F1: 0.6631
                        precision    recall  f1-score   support

                    No       0.69      0.39      0.50        80
Yes and To some extent       0.76      0.92      0.83       168

              accuracy                           0.75       248
             macro avg       0.72      0.65      0.66       248
          weighted avg       0.74      0.75      0.72       248



Epoch 4/10: 100%|██████████| 756/756 [02:47<00:00,  4.50it/s]


Epoch 4 Loss: 0.4422
Validation Accuracy: 0.7460  Validation Macro F1: 0.6557
                        precision    recall  f1-score   support

                    No       0.71      0.36      0.48        80
Yes and To some extent       0.75      0.93      0.83       168

              accuracy                           0.75       248
             macro avg       0.73      0.65      0.66       248
          weighted avg       0.74      0.75      0.72       248



Epoch 5/10: 100%|██████████| 756/756 [02:47<00:00,  4.50it/s]


Epoch 5 Loss: 0.4421
Validation Accuracy: 0.7621  Validation Macro F1: 0.6436
                        precision    recall  f1-score   support

                    No       0.92      0.29      0.44        80
Yes and To some extent       0.74      0.99      0.85       168

              accuracy                           0.76       248
             macro avg       0.83      0.64      0.64       248
          weighted avg       0.80      0.76      0.72       248



Epoch 6/10: 100%|██████████| 756/756 [02:48<00:00,  4.50it/s]


Epoch 6 Loss: 0.4097
Validation Accuracy: 0.7218  Validation Macro F1: 0.6638
                        precision    recall  f1-score   support

                    No       0.58      0.47      0.52        80
Yes and To some extent       0.77      0.84      0.80       168

              accuracy                           0.72       248
             macro avg       0.68      0.66      0.66       248
          weighted avg       0.71      0.72      0.71       248



Epoch 7/10: 100%|██████████| 756/756 [02:48<00:00,  4.50it/s]


Epoch 7 Loss: 0.4046
Validation Accuracy: 0.7419  Validation Macro F1: 0.6355
                        precision    recall  f1-score   support

                    No       0.74      0.31      0.44        80
Yes and To some extent       0.74      0.95      0.83       168

              accuracy                           0.74       248
             macro avg       0.74      0.63      0.64       248
          weighted avg       0.74      0.74      0.71       248



Epoch 8/10: 100%|██████████| 756/756 [02:48<00:00,  4.50it/s]


Epoch 8 Loss: 0.3924
Validation Accuracy: 0.7056  Validation Macro F1: 0.6443
                        precision    recall  f1-score   support

                    No       0.55      0.45      0.50        80
Yes and To some extent       0.76      0.83      0.79       168

              accuracy                           0.71       248
             macro avg       0.66      0.64      0.64       248
          weighted avg       0.69      0.71      0.70       248



Epoch 9/10: 100%|██████████| 756/756 [02:48<00:00,  4.50it/s]


Epoch 9 Loss: 0.3868
Validation Accuracy: 0.7540  Validation Macro F1: 0.6867
                        precision    recall  f1-score   support

                    No       0.68      0.45      0.54        80
Yes and To some extent       0.77      0.90      0.83       168

              accuracy                           0.75       248
             macro avg       0.73      0.67      0.69       248
          weighted avg       0.74      0.75      0.74       248



Epoch 10/10: 100%|██████████| 756/756 [02:47<00:00,  4.50it/s]


Epoch 10 Loss: 0.3731
Validation Accuracy: 0.7298  Validation Macro F1: 0.6417
                        precision    recall  f1-score   support

                    No       0.64      0.36      0.46        80
Yes and To some extent       0.75      0.90      0.82       168

              accuracy                           0.73       248
             macro avg       0.70      0.63      0.64       248
          weighted avg       0.72      0.73      0.70       248

