In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json


In [2]:
import json
import pandas as pd
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [3]:
def load_and_flatten(json_path):
    with open(json_path) as f:
        data = json.load(f)

    rows = []
    for instance in data:
        convo_id = instance["conversation_id"]
        history = instance["conversation_history"]

        for tutor_id, tutor_data in instance["tutor_responses"].items():
            row = {
                "conversation_id": convo_id,
                "tutor_id": tutor_id,
                "conversation_history": history,
                "tutor_response": tutor_data["response"],
                "Mistake_Identification": tutor_data["annotation"]["Mistake_Identification"],
                "Mistake_Location": tutor_data["annotation"]["Mistake_Location"],
                "Pedagogical_Guidance": tutor_data["annotation"]["Providing_Guidance"],
                "Actionability": tutor_data["annotation"]["Actionability"]
            }
            rows.append(row)

    return pd.DataFrame(rows)

def build_input_text(row):
    return f"Context:\n{row['conversation_history']}\n\nTutor Response:\n{row['tutor_response']}"

# Encode labels
LABEL_MAP = {"Yes": 0, "To some extent": 1, "No": 2}
MERGED_LABEL_MAP = {"Yes": 1, "To some extent": 1, "No": 0}  # For lenient setting

def encode_labels(df):
    for task in ["Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability"]:
        df[f"{task}_label"] = df[task].map(LABEL_MAP)
        df[f"{task}_binary"] = df[task].map(MERGED_LABEL_MAP)
    return df

# Tokenize inputs
def tokenize_inputs(tokenizer, texts, max_length=256):
    return tokenizer(
        texts,
        add_special_tokens=True,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )

In [4]:
file_path="/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
df=load_and_flatten(file_path)
df.head()

Unnamed: 0,conversation_id,tutor_id,conversation_history,tutor_response,Mistake_Identification,Mistake_Location,Pedagogical_Guidance,Actionability
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Sonnet,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Yes,Yes,Yes,Yes
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Llama318B,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Yes,To some extent,To some extent,To some extent
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Llama31405B,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Yes,Yes,Yes,Yes
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,GPT4,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",Yes,Yes,Yes,Yes
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,Mistral,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Yes,Yes,Yes,Yes


In [5]:
class TutorEvalSingleTaskDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }
        return item

    def __len__(self):
        return len(self.labels)



def preprocess_dataset(json_path,task_label):
    df = load_and_flatten(json_path)
    df["input_text"] = df.apply(build_input_text, axis=1)
    df = encode_labels(df)

    train_df, val_df = train_test_split(df, test_size=0.1, stratify=df[task_label], random_state=42)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    train_enc = tokenize_inputs(tokenizer, train_df["input_text"].tolist())
    val_enc = tokenize_inputs(tokenizer, val_df["input_text"].tolist())

    
    train_labels = torch.tensor(train_df[task_label].tolist())
    val_labels = torch.tensor(val_df[task_label].tolist())

    train_dataset = TutorEvalSingleTaskDataset(train_enc, train_labels)
    val_dataset = TutorEvalSingleTaskDataset(val_enc, val_labels)

    return train_dataset, val_dataset, tokenizer, df


In [6]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW  
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

2025-04-24 19:07:43.981992: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745521664.203947      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745521664.267769      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
from transformers import RobertaForSequenceClassification
import torch.nn as nn

class SingleTaskRobertaClassifier(nn.Module):
    def __init__(self, num_labels):
        super(SingleTaskRobertaClassifier, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha  # class weighting (list or tensor)
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.alpha)
        pt = torch.exp(-ce_loss)  # pt = softmax probability of the true class
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss



In [10]:
def train_model(loss_type, train_loader, val_loader, num_labels, df,task,epochs=10):
    print(f"\nTraining with: {loss_type.upper()} Loss\n")

    model = SingleTaskRobertaClassifier(num_labels).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    
    labels = df[task]
    unique_classes = labels.unique()
    class_weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=labels)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)


    if loss_type == "focal":
        alpha=class_weights_tensor
        criterion = FocalLoss(alpha=alpha, gamma=2.0)
    elif loss_type == "smoothing":
        criterion = nn.CrossEntropyLoss(weight=class_weights_tensor,label_smoothing=0.1)
    else:
        criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")
    x,y,z=evaluate_model(model, val_loader)
    print(f"Validation Accuracy: {y:.4f}  Validation Macro F1: {z:.4f}")
    print(f"Classification report: ",x)

        
def evaluate_model(model, val_loader):
    label_names = ["Yes", "To some extent", "No"]
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()  # Change the task name if needed
            
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels)
    
    accuracy = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')

    return classification_report(all_labels, all_preds, target_names=label_names, zero_division=0),accuracy,macro_f1


In [11]:
import warnings

warnings.filterwarnings("ignore", message="Some weights of RobertaForSequenceClassification were not initialized.*")


In [12]:
from transformers import logging

logging.set_verbosity_error()  # suppress all warnings and info logs from transformers


In [13]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import json
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

json_path = "/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Mistake_Identification_label")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

for loss_name in ["ce", "smoothing", "focal"]:
    train_model(loss_name, train_loader, val_loader, 3,df,task="Mistake_Identification_label")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


Training with: CE Loss



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Epoch 1 Loss: 0.9952
Epoch 2 Loss: 0.9840
Epoch 3 Loss: 0.9869
Epoch 4 Loss: 0.9690
Epoch 5 Loss: 0.9745
Epoch 6 Loss: 0.9802
Epoch 7 Loss: 0.9734
Epoch 8 Loss: 0.9754
Epoch 9 Loss: 0.9653
Epoch 10 Loss: 0.9670
Validation Accuracy: 0.3669  Validation Macro F1: 0.2480
Classification report:                  precision    recall  f1-score   support

           Yes       0.82      0.30      0.44       194
To some extent       0.00      0.00      0.00        17
            No       0.18      0.86      0.30        37

      accuracy                           0.37       248
     macro avg       0.33      0.39      0.25       248
  weighted avg       0.67      0.37      0.39       248


Training with: SMOOTHING Loss

Epoch 1 Loss: 1.2711
Epoch 2 Loss: 1.2682
Epoch 3 Loss: 1.2666
Epoch 4 Loss: 1.2562
Epoch 5 Loss: 1.2402
Epoch 6 Loss: 1.2442
Epoch 7 Loss: 1.2420
Epoch 8 Loss: 1.2497
Epoch 9 Loss: 1.2210
Epoch 10 Loss: 1.2305
Validation Accuracy: 0.3589  Validation Macro F1: 0.3027
Classificatio

In [14]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import json
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

json_path = "/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Mistake_Location_label")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

for loss_name in ["ce", "smoothing", "focal"]:
    train_model(loss_name, train_loader, val_loader, 3,df,task="Mistake_Location_label")


Training with: CE Loss

Epoch 1 Loss: 1.0512
Epoch 2 Loss: 1.0689
Epoch 3 Loss: 1.0459
Epoch 4 Loss: 1.0258
Epoch 5 Loss: 1.0235
Epoch 6 Loss: 1.0639
Epoch 7 Loss: 1.0571
Epoch 8 Loss: 1.0376
Epoch 9 Loss: 1.0329
Epoch 10 Loss: 1.0189
Validation Accuracy: 0.4113  Validation Macro F1: 0.2806
Classification report:                  precision    recall  f1-score   support

           Yes       0.80      0.23      0.36       155
To some extent       0.00      0.00      0.00        22
            No       0.33      0.93      0.48        71

      accuracy                           0.41       248
     macro avg       0.38      0.39      0.28       248
  weighted avg       0.59      0.41      0.36       248


Training with: SMOOTHING Loss

Epoch 1 Loss: 1.2207
Epoch 2 Loss: 1.2106
Epoch 3 Loss: 1.1971
Epoch 4 Loss: 1.1988
Epoch 5 Loss: 1.2253
Epoch 6 Loss: 1.2233
Epoch 7 Loss: 1.2218
Epoch 8 Loss: 1.2235
Epoch 9 Loss: 1.2217
Epoch 10 Loss: 1.2240
Validation Accuracy: 0.6250  Validation Macro

In [15]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import json
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

json_path = "/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Pedagogical_Guidance_label")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

for loss_name in ["ce", "smoothing", "focal"]:
    train_model(loss_name, train_loader, val_loader, 3,df,task="Pedagogical_Guidance_label")


Training with: CE Loss

Epoch 1 Loss: 1.0982
Epoch 2 Loss: 1.0891
Epoch 3 Loss: 1.0793
Epoch 4 Loss: 1.0816
Epoch 5 Loss: 1.0726
Epoch 6 Loss: 1.0587
Epoch 7 Loss: 1.0611
Epoch 8 Loss: 1.0455
Epoch 9 Loss: 1.0311
Epoch 10 Loss: 1.0050
Validation Accuracy: 0.5403  Validation Macro F1: 0.3984
Classification report:                  precision    recall  f1-score   support

           Yes       0.60      0.80      0.68       141
To some extent       0.26      0.20      0.22        50
            No       0.55      0.19      0.29        57

      accuracy                           0.54       248
     macro avg       0.47      0.40      0.40       248
  weighted avg       0.52      0.54      0.50       248


Training with: SMOOTHING Loss

Epoch 1 Loss: 1.1386
Epoch 2 Loss: 1.1310
Epoch 3 Loss: 1.1296
Epoch 4 Loss: 1.1214
Epoch 5 Loss: 1.1185
Epoch 6 Loss: 1.1176
Epoch 7 Loss: 1.1108
Epoch 8 Loss: 1.1142
Epoch 9 Loss: 1.1095
Epoch 10 Loss: 1.1051
Validation Accuracy: 0.5847  Validation Macro

In [16]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import json
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

json_path = "/kaggle/input/nlp_ass_3/pytorch/default/1/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Actionability_label")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

for loss_name in ["ce", "smoothing", "focal"]:
    train_model(loss_name, train_loader, val_loader, 3,df,task="Actionability_label")


Training with: CE Loss

Epoch 1 Loss: 1.0926
Epoch 2 Loss: 1.0831
Epoch 3 Loss: 1.0870
Epoch 4 Loss: 1.0834
Epoch 5 Loss: 1.0727
Epoch 6 Loss: 1.0407
Epoch 7 Loss: 1.0332
Epoch 8 Loss: 1.0471
Epoch 9 Loss: 1.0203
Epoch 10 Loss: 1.0061
Validation Accuracy: 0.6008  Validation Macro F1: 0.3954
Classification report:                  precision    recall  f1-score   support

           Yes       0.60      0.91      0.72       131
To some extent       0.00      0.00      0.00        37
            No       0.61      0.38      0.47        80

      accuracy                           0.60       248
     macro avg       0.40      0.43      0.40       248
  weighted avg       0.51      0.60      0.53       248


Training with: SMOOTHING Loss

Epoch 1 Loss: 1.1422
Epoch 2 Loss: 1.1375
Epoch 3 Loss: 1.1174
Epoch 4 Loss: 1.0856
Epoch 5 Loss: 1.0786
Epoch 6 Loss: 1.0635
Epoch 7 Loss: 1.0547
Epoch 8 Loss: 1.0566
Epoch 9 Loss: 1.0288
Epoch 10 Loss: 1.0256
Validation Accuracy: 0.6008  Validation Macro