In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/json-data/assignment_3_ai_tutors_dataset.json


In [2]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import json
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

2025-04-25 07:35:43.551552: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745566543.874728      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745566543.955159      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class FocalLossWithWeights(nn.Module):
    def __init__(self, weight=None, gamma=2.0):
        super(FocalLossWithWeights, self).__init__()
        self.weight = weight
        self.gamma = gamma
        self.ce = nn.CrossEntropyLoss(reduction='none', weight=weight)

    def forward(self, inputs, targets):
        logp = self.ce(inputs, targets)
        p = torch.exp(-logp)
        loss = (1 - p) ** self.gamma * logp
        return loss.mean()


In [5]:
# ---------- Dataset Definition ----------
class TutorEvalSingleTaskDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

In [6]:
# ---------- Preprocessing ----------
def load_and_flatten(json_path):
    with open(json_path) as f:
        data = json.load(f)

    rows = []
    for instance in data:
        convo_id = instance["conversation_id"]
        history = instance["conversation_history"]
        for tutor_id, tutor_data in instance["tutor_responses"].items():
            row = {
                "conversation_id": convo_id,
                "tutor_id": tutor_id,
                "conversation_history": history,
                "tutor_response": tutor_data["response"],
                "Mistake_Identification": tutor_data["annotation"]["Mistake_Identification"],
                "Mistake_Location": tutor_data["annotation"]["Mistake_Location"],
                "Pedagogical_Guidance": tutor_data["annotation"]["Providing_Guidance"],
                "Actionability": tutor_data["annotation"]["Actionability"]
            }
            rows.append(row)
    return pd.DataFrame(rows)

In [7]:
def build_input_text(row):
    return f"Context:\n{row['conversation_history']}\n\nTutor Response:\n{row['tutor_response']}"

LABEL_MAP = {"Yes": 0, "To some extent": 1, "No": 2}
MERGED_LABEL_MAP = {"Yes": 1, "To some extent": 1, "No": 0}

def encode_labels(df):
    for task in ["Mistake_Identification", "Mistake_Location", "Pedagogical_Guidance", "Actionability"]:
        df[f"{task}_label"] = df[task].map(LABEL_MAP)
        df[f"{task}_binary"] = df[task].map(MERGED_LABEL_MAP)
    return df

def tokenize_inputs(tokenizer, texts, max_length=256):
    return tokenizer(
        texts,
        add_special_tokens=True,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )

In [8]:
def preprocess_dataset(json_path, task_label):
    df = load_and_flatten(json_path)
    df["input_text"] = df.apply(build_input_text, axis=1)
    df = encode_labels(df)

    train_df, val_df = train_test_split(df, test_size=0.1, stratify=df[task_label], random_state=42)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    train_enc = tokenize_inputs(tokenizer, train_df["input_text"].tolist())
    val_enc = tokenize_inputs(tokenizer, val_df["input_text"].tolist())

    train_labels = torch.tensor(train_df[task_label].tolist())
    val_labels = torch.tensor(val_df[task_label].tolist())

    train_dataset = TutorEvalSingleTaskDataset(train_enc, train_labels)
    val_dataset = TutorEvalSingleTaskDataset(val_enc, val_labels)

    return train_dataset, val_dataset, tokenizer, df


In [9]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

def get_class_weights(labels, num_classes):
    class_weights = compute_class_weight(class_weight='balanced', classes=np.arange(num_classes), y=labels)
    return torch.tensor(class_weights, dtype=torch.float).to(device)


In [10]:
# ---------- Model ----------
class SingleTaskBertClassifier(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

    def forward(self, input_ids, attention_mask):
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

In [11]:
# ---------- Training ----------
def evaluate_model(model, val_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels)

    acc = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    print(f"Validation Accuracy: {acc:.4f}  Validation Macro F1: {macro_f1:.4f}")
    print(classification_report(all_labels, all_preds, target_names=["Yes/To some extent", "No"], zero_division=0))

def train_model(loss_type, train_loader, val_loader, num_labels, epochs=10):
    print(f"\n🔁 Training with: {loss_type.upper()} Loss\n")

    model = SingleTaskBertClassifier(num_labels).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    train_labels_list = [label.item() for batch in train_loader for label in batch['labels']]
    class_weights = get_class_weights(train_labels_list, num_labels)

    if loss_type == "focal":
        criterion = FocalLossWithWeights(class_weights)
    elif loss_type == "smoothing":
        criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
    else:
        criterion = nn.CrossEntropyLoss(weight=None)


    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")
        evaluate_model(model, val_loader)



In [12]:
# ---------- Run ----------
json_path = "/kaggle/input/json-data/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Mistake_Identification_binary")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [13]:
train_model("ce", train_loader, val_loader, num_labels=2)


🔁 Training with: CE Loss



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 557/557 [01:51<00:00,  4.99it/s]


Epoch 1 Loss: 0.4271
Validation Accuracy: 0.8508  Validation Macro F1: 0.4597
                    precision    recall  f1-score   support

Yes/To some extent       0.00      0.00      0.00        37
                No       0.85      1.00      0.92       211

          accuracy                           0.85       248
         macro avg       0.43      0.50      0.46       248
      weighted avg       0.72      0.85      0.78       248



Epoch 2/10: 100%|██████████| 557/557 [02:00<00:00,  4.64it/s]


Epoch 2 Loss: 0.4169
Validation Accuracy: 0.8508  Validation Macro F1: 0.4597
                    precision    recall  f1-score   support

Yes/To some extent       0.00      0.00      0.00        37
                No       0.85      1.00      0.92       211

          accuracy                           0.85       248
         macro avg       0.43      0.50      0.46       248
      weighted avg       0.72      0.85      0.78       248



Epoch 3/10: 100%|██████████| 557/557 [02:03<00:00,  4.50it/s]


Epoch 3 Loss: 0.4156
Validation Accuracy: 0.8508  Validation Macro F1: 0.4597
                    precision    recall  f1-score   support

Yes/To some extent       0.00      0.00      0.00        37
                No       0.85      1.00      0.92       211

          accuracy                           0.85       248
         macro avg       0.43      0.50      0.46       248
      weighted avg       0.72      0.85      0.78       248



Epoch 4/10: 100%|██████████| 557/557 [02:04<00:00,  4.46it/s]


Epoch 4 Loss: 0.3977
Validation Accuracy: 0.8548  Validation Macro F1: 0.5318
                    precision    recall  f1-score   support

Yes/To some extent       0.60      0.08      0.14        37
                No       0.86      0.99      0.92       211

          accuracy                           0.85       248
         macro avg       0.73      0.54      0.53       248
      weighted avg       0.82      0.85      0.80       248



Epoch 5/10: 100%|██████████| 557/557 [02:05<00:00,  4.43it/s]


Epoch 5 Loss: 0.3705
Validation Accuracy: 0.8548  Validation Macro F1: 0.5687
                    precision    recall  f1-score   support

Yes/To some extent       0.56      0.14      0.22        37
                No       0.87      0.98      0.92       211

          accuracy                           0.85       248
         macro avg       0.71      0.56      0.57       248
      weighted avg       0.82      0.85      0.82       248



Epoch 6/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 6 Loss: 0.3471
Validation Accuracy: 0.8548  Validation Macro F1: 0.5511
                    precision    recall  f1-score   support

Yes/To some extent       0.57      0.11      0.18        37
                No       0.86      0.99      0.92       211

          accuracy                           0.85       248
         macro avg       0.72      0.55      0.55       248
      weighted avg       0.82      0.85      0.81       248



Epoch 7/10: 100%|██████████| 557/557 [02:06<00:00,  4.42it/s]


Epoch 7 Loss: 0.3270
Validation Accuracy: 0.8589  Validation Macro F1: 0.5544
                    precision    recall  f1-score   support

Yes/To some extent       0.67      0.11      0.19        37
                No       0.86      0.99      0.92       211

          accuracy                           0.86       248
         macro avg       0.77      0.55      0.55       248
      weighted avg       0.83      0.86      0.81       248



Epoch 8/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 8 Loss: 0.3035
Validation Accuracy: 0.8669  Validation Macro F1: 0.5967
                    precision    recall  f1-score   support

Yes/To some extent       0.75      0.16      0.27        37
                No       0.87      0.99      0.93       211

          accuracy                           0.87       248
         macro avg       0.81      0.58      0.60       248
      weighted avg       0.85      0.87      0.83       248



Epoch 9/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 9 Loss: 0.3085
Validation Accuracy: 0.8629  Validation Macro F1: 0.5760
                    precision    recall  f1-score   support

Yes/To some extent       0.71      0.14      0.23        37
                No       0.87      0.99      0.92       211

          accuracy                           0.86       248
         macro avg       0.79      0.56      0.58       248
      weighted avg       0.84      0.86      0.82       248



Epoch 10/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 10 Loss: 0.2968
Validation Accuracy: 0.8669  Validation Macro F1: 0.6122
                    precision    recall  f1-score   support

Yes/To some extent       0.70      0.19      0.30        37
                No       0.87      0.99      0.93       211

          accuracy                           0.87       248
         macro avg       0.79      0.59      0.61       248
      weighted avg       0.85      0.87      0.83       248



In [14]:
# ---------- Run ----------
json_path = "/kaggle/input/json-data/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Mistake_Location_binary")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)



In [15]:
train_model("ce", train_loader, val_loader, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔁 Training with: CE Loss



Epoch 1/10: 100%|██████████| 557/557 [02:04<00:00,  4.47it/s]


Epoch 1 Loss: 0.5871
Validation Accuracy: 0.7177  Validation Macro F1: 0.4313
                    precision    recall  f1-score   support

Yes/To some extent       1.00      0.01      0.03        71
                No       0.72      1.00      0.83       177

          accuracy                           0.72       248
         macro avg       0.86      0.51      0.43       248
      weighted avg       0.80      0.72      0.60       248



Epoch 2/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 2 Loss: 0.5408
Validation Accuracy: 0.7339  Validation Macro F1: 0.4980
                    precision    recall  f1-score   support

Yes/To some extent       0.86      0.08      0.15        71
                No       0.73      0.99      0.84       177

          accuracy                           0.73       248
         macro avg       0.79      0.54      0.50       248
      weighted avg       0.77      0.73      0.65       248



Epoch 3/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 3 Loss: 0.5033
Validation Accuracy: 0.7419  Validation Macro F1: 0.5325
                    precision    recall  f1-score   support

Yes/To some extent       0.82      0.13      0.22        71
                No       0.74      0.99      0.85       177

          accuracy                           0.74       248
         macro avg       0.78      0.56      0.53       248
      weighted avg       0.76      0.74      0.67       248



Epoch 4/10: 100%|██████████| 557/557 [02:06<00:00,  4.39it/s]


Epoch 4 Loss: 0.4672
Validation Accuracy: 0.7258  Validation Macro F1: 0.5537
                    precision    recall  f1-score   support

Yes/To some extent       0.57      0.18      0.28        71
                No       0.74      0.94      0.83       177

          accuracy                           0.73       248
         macro avg       0.65      0.56      0.55       248
      weighted avg       0.69      0.73      0.67       248



Epoch 5/10: 100%|██████████| 557/557 [02:06<00:00,  4.39it/s]


Epoch 5 Loss: 0.4459
Validation Accuracy: 0.7460  Validation Macro F1: 0.6246
                    precision    recall  f1-score   support

Yes/To some extent       0.61      0.31      0.41        71
                No       0.77      0.92      0.84       177

          accuracy                           0.75       248
         macro avg       0.69      0.62      0.62       248
      weighted avg       0.72      0.75      0.72       248



Epoch 6/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 6 Loss: 0.4353
Validation Accuracy: 0.7258  Validation Macro F1: 0.5385
                    precision    recall  f1-score   support

Yes/To some extent       0.58      0.15      0.24        71
                No       0.74      0.95      0.83       177

          accuracy                           0.73       248
         macro avg       0.66      0.55      0.54       248
      weighted avg       0.69      0.73      0.66       248



Epoch 7/10: 100%|██████████| 557/557 [02:06<00:00,  4.39it/s]


Epoch 7 Loss: 0.4255
Validation Accuracy: 0.7218  Validation Macro F1: 0.5434
                    precision    recall  f1-score   support

Yes/To some extent       0.55      0.17      0.26        71
                No       0.74      0.94      0.83       177

          accuracy                           0.72       248
         macro avg       0.64      0.56      0.54       248
      weighted avg       0.68      0.72      0.67       248



Epoch 8/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 8 Loss: 0.4049
Validation Accuracy: 0.7339  Validation Macro F1: 0.5737
                    precision    recall  f1-score   support

Yes/To some extent       0.60      0.21      0.31        71
                No       0.75      0.94      0.84       177

          accuracy                           0.73       248
         macro avg       0.67      0.58      0.57       248
      weighted avg       0.71      0.73      0.69       248



Epoch 9/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 9 Loss: 0.4006
Validation Accuracy: 0.7379  Validation Macro F1: 0.5835
                    precision    recall  f1-score   support

Yes/To some extent       0.62      0.23      0.33        71
                No       0.75      0.94      0.84       177

          accuracy                           0.74       248
         macro avg       0.68      0.58      0.58       248
      weighted avg       0.71      0.74      0.69       248



Epoch 10/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 10 Loss: 0.3880
Validation Accuracy: 0.7379  Validation Macro F1: 0.5899
                    precision    recall  f1-score   support

Yes/To some extent       0.61      0.24      0.34        71
                No       0.75      0.94      0.84       177

          accuracy                           0.74       248
         macro avg       0.68      0.59      0.59       248
      weighted avg       0.71      0.74      0.70       248



In [16]:
# ---------- Run ----------
json_path = "/kaggle/input/json-data/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Pedagogical_Guidance_binary")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

In [17]:
train_model("ce", train_loader, val_loader, num_labels=2)


🔁 Training with: CE Loss



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 557/557 [02:04<00:00,  4.47it/s]


Epoch 1 Loss: 0.5455
Validation Accuracy: 0.7823  Validation Macro F1: 0.5023
                    precision    recall  f1-score   support

Yes/To some extent       0.80      0.07      0.13        57
                No       0.78      0.99      0.88       191

          accuracy                           0.78       248
         macro avg       0.79      0.53      0.50       248
      weighted avg       0.79      0.78      0.70       248



Epoch 2/10: 100%|██████████| 557/557 [02:05<00:00,  4.43it/s]


Epoch 2 Loss: 0.5235
Validation Accuracy: 0.7702  Validation Macro F1: 0.4351
                    precision    recall  f1-score   support

Yes/To some extent       0.00      0.00      0.00        57
                No       0.77      1.00      0.87       191

          accuracy                           0.77       248
         macro avg       0.39      0.50      0.44       248
      weighted avg       0.59      0.77      0.67       248



Epoch 3/10: 100%|██████████| 557/557 [02:06<00:00,  4.42it/s]


Epoch 3 Loss: 0.5040
Validation Accuracy: 0.7984  Validation Macro F1: 0.5739
                    precision    recall  f1-score   support

Yes/To some extent       0.82      0.16      0.26        57
                No       0.80      0.99      0.88       191

          accuracy                           0.80       248
         macro avg       0.81      0.57      0.57       248
      weighted avg       0.80      0.80      0.74       248



Epoch 4/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 4 Loss: 0.4665
Validation Accuracy: 0.7944  Validation Macro F1: 0.5904
                    precision    recall  f1-score   support

Yes/To some extent       0.69      0.19      0.30        57
                No       0.80      0.97      0.88       191

          accuracy                           0.79       248
         macro avg       0.74      0.58      0.59       248
      weighted avg       0.78      0.79      0.75       248



Epoch 5/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 5 Loss: 0.4452
Validation Accuracy: 0.7863  Validation Macro F1: 0.6166
                    precision    recall  f1-score   support

Yes/To some extent       0.58      0.26      0.36        57
                No       0.81      0.94      0.87       191

          accuracy                           0.79       248
         macro avg       0.69      0.60      0.62       248
      weighted avg       0.76      0.79      0.75       248



Epoch 6/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 6 Loss: 0.4137
Validation Accuracy: 0.8105  Validation Macro F1: 0.6225
                    precision    recall  f1-score   support

Yes/To some extent       0.81      0.23      0.36        57
                No       0.81      0.98      0.89       191

          accuracy                           0.81       248
         macro avg       0.81      0.61      0.62       248
      weighted avg       0.81      0.81      0.77       248



Epoch 7/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 7 Loss: 0.3984
Validation Accuracy: 0.8065  Validation Macro F1: 0.6271
                    precision    recall  f1-score   support

Yes/To some extent       0.74      0.25      0.37        57
                No       0.81      0.97      0.89       191

          accuracy                           0.81       248
         macro avg       0.77      0.61      0.63       248
      weighted avg       0.79      0.81      0.77       248



Epoch 8/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 8 Loss: 0.3741
Validation Accuracy: 0.7944  Validation Macro F1: 0.6445
                    precision    recall  f1-score   support

Yes/To some extent       0.60      0.32      0.41        57
                No       0.82      0.94      0.88       191

          accuracy                           0.79       248
         macro avg       0.71      0.63      0.64       248
      weighted avg       0.77      0.79      0.77       248



Epoch 9/10: 100%|██████████| 557/557 [02:06<00:00,  4.39it/s]


Epoch 9 Loss: 0.3855
Validation Accuracy: 0.8024  Validation Macro F1: 0.6065
                    precision    recall  f1-score   support

Yes/To some extent       0.75      0.21      0.33        57
                No       0.81      0.98      0.88       191

          accuracy                           0.80       248
         macro avg       0.78      0.59      0.61       248
      weighted avg       0.79      0.80      0.76       248



Epoch 10/10: 100%|██████████| 557/557 [02:06<00:00,  4.39it/s]


Epoch 10 Loss: 0.3625
Validation Accuracy: 0.7903  Validation Macro F1: 0.6045
                    precision    recall  f1-score   support

Yes/To some extent       0.62      0.23      0.33        57
                No       0.81      0.96      0.88       191

          accuracy                           0.79       248
         macro avg       0.71      0.59      0.60       248
      weighted avg       0.76      0.79      0.75       248



In [18]:
# ---------- Run ----------
json_path = "/kaggle/input/json-data/assignment_3_ai_tutors_dataset.json"
train_dataset, val_dataset, tokenizer, df = preprocess_dataset(json_path, "Actionability_binary")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

In [19]:
train_model("ce", train_loader, val_loader, num_labels=2)


🔁 Training with: CE Loss



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 557/557 [02:04<00:00,  4.47it/s]


Epoch 1 Loss: 0.5778
Validation Accuracy: 0.7581  Validation Macro F1: 0.6496
                    precision    recall  f1-score   support

Yes/To some extent       0.83      0.31      0.45        80
                No       0.75      0.97      0.84       168

          accuracy                           0.76       248
         macro avg       0.79      0.64      0.65       248
      weighted avg       0.78      0.76      0.72       248



Epoch 2/10: 100%|██████████| 557/557 [02:05<00:00,  4.42it/s]


Epoch 2 Loss: 0.5435
Validation Accuracy: 0.7621  Validation Macro F1: 0.6531
                    precision    recall  f1-score   support

Yes/To some extent       0.86      0.31      0.46        80
                No       0.75      0.98      0.85       168

          accuracy                           0.76       248
         macro avg       0.81      0.64      0.65       248
      weighted avg       0.79      0.76      0.72       248



Epoch 3/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 3 Loss: 0.5160
Validation Accuracy: 0.7581  Validation Macro F1: 0.6583
                    precision    recall  f1-score   support

Yes/To some extent       0.79      0.34      0.47        80
                No       0.75      0.96      0.84       168

          accuracy                           0.76       248
         macro avg       0.77      0.65      0.66       248
      weighted avg       0.77      0.76      0.72       248



Epoch 4/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 4 Loss: 0.4950
Validation Accuracy: 0.7460  Validation Macro F1: 0.6666
                    precision    recall  f1-score   support

Yes/To some extent       0.68      0.40      0.50        80
                No       0.76      0.91      0.83       168

          accuracy                           0.75       248
         macro avg       0.72      0.66      0.67       248
      weighted avg       0.74      0.75      0.72       248



Epoch 5/10: 100%|██████████| 557/557 [02:06<00:00,  4.42it/s]


Epoch 5 Loss: 0.4569
Validation Accuracy: 0.6935  Validation Macro F1: 0.6446
                    precision    recall  f1-score   support

Yes/To some extent       0.53      0.50      0.51        80
                No       0.77      0.79      0.78       168

          accuracy                           0.69       248
         macro avg       0.65      0.64      0.64       248
      weighted avg       0.69      0.69      0.69       248



Epoch 6/10: 100%|██████████| 557/557 [02:06<00:00,  4.41it/s]


Epoch 6 Loss: 0.4361
Validation Accuracy: 0.7702  Validation Macro F1: 0.6919
                    precision    recall  f1-score   support

Yes/To some extent       0.77      0.41      0.54        80
                No       0.77      0.94      0.85       168

          accuracy                           0.77       248
         macro avg       0.77      0.68      0.69       248
      weighted avg       0.77      0.77      0.75       248



Epoch 7/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 7 Loss: 0.4214
Validation Accuracy: 0.7742  Validation Macro F1: 0.6956
                    precision    recall  f1-score   support

Yes/To some extent       0.79      0.41      0.54        80
                No       0.77      0.95      0.85       168

          accuracy                           0.77       248
         macro avg       0.78      0.68      0.70       248
      weighted avg       0.78      0.77      0.75       248



Epoch 8/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 8 Loss: 0.4084
Validation Accuracy: 0.7944  Validation Macro F1: 0.7113
                    precision    recall  f1-score   support

Yes/To some extent       0.91      0.40      0.56        80
                No       0.77      0.98      0.87       168

          accuracy                           0.79       248
         macro avg       0.84      0.69      0.71       248
      weighted avg       0.82      0.79      0.77       248



Epoch 9/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 9 Loss: 0.4067
Validation Accuracy: 0.7621  Validation Macro F1: 0.6845
                    precision    recall  f1-score   support

Yes/To some extent       0.73      0.41      0.53        80
                No       0.77      0.93      0.84       168

          accuracy                           0.76       248
         macro avg       0.75      0.67      0.68       248
      weighted avg       0.76      0.76      0.74       248



Epoch 10/10: 100%|██████████| 557/557 [02:06<00:00,  4.40it/s]


Epoch 10 Loss: 0.3904
Validation Accuracy: 0.7661  Validation Macro F1: 0.6915
                    precision    recall  f1-score   support

Yes/To some extent       0.74      0.42      0.54        80
                No       0.77      0.93      0.84       168

          accuracy                           0.77       248
         macro avg       0.76      0.68      0.69       248
      weighted avg       0.76      0.77      0.75       248

