In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install --upgrade transformers datasets accelerate evaluate -q


In [None]:
import gc
import os
import random

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# --------------------
# CONFIG
# --------------------
SEED = 42
# Seed Python's, NumPy's and PyTorch's global RNGs from the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Hugging Face checkpoint ids of the candidate models.
MODELS = [
    "bert-base-uncased",
    "roberta-base",
    "microsoft/deberta-v3-base",
    "google/electra-base-discriminator",
    "distilbert-base-uncased",
]

# Tokenization length and optimisation hyper-parameters.
MAX_LENGTH, BATCH_SIZE = 256, 16
EPOCHS, LR = 3, 2e-5

# --------------------
# LOAD DATA (Kaggle Path)
# --------------------
print("Loading data...")
data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(data_path)

# Encode the sentiment string as an integer class id (positive -> 1, negative -> 0).
# Series.map() yields NaN for any value missing from the mapping, which would
# silently produce a float label column with missing entries, so fail loudly instead.
label_map = {'positive': 1, 'negative': 0}
mapped_labels = df['sentiment'].map(label_map)
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unexpected_values = sorted(df.loc[unmapped_mask, 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment values in {data_path}: {unexpected_values}")
df['label'] = mapped_labels.astype("int64")

# --------------------
# TRAIN/VAL/TEST SPLIT (Subset for model comparison)
# --------------------
def _stratified_split(frame, train_size):
    """Split `frame` in two, stratified on its 'label' column and seeded with SEED."""
    return train_test_split(
        frame,
        train_size=train_size,
        stratify=frame['label'],
        random_state=SEED,
    )


# 8000-row subset of the full data; the remainder is discarded.
df_subset, _ = _stratified_split(df, 8000)
# 70% of the subset for training, the rest halved into validation and test.
train_df, temp_df = _stratified_split(df_subset, 0.7)
val_df, test_df = _stratified_split(temp_df, 0.5)

# Convert to Dataset objects
def df_to_dataset(dataframe, tokenizer, max_length=None):
    """Convert a pandas DataFrame into a tokenized Hugging Face ``Dataset``.

    Args:
        dataframe: Frame containing a 'review' text column.
        tokenizer: Tokenizer called on batches of review strings.
        max_length: Length every sequence is truncated/padded to. Defaults to
            the module-level MAX_LENGTH, read at call time.

    Returns:
        The dataset produced by ``Dataset.from_pandas(...).map(...)`` with the
        tokenizer applied in batches.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    def tokenize_batch(batch):
        return tokenizer(
            batch['review'],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    # preserve_index=False: frames produced by train_test_split / sample carry a
    # shuffled, non-default index that from_pandas would otherwise store as an
    # extra '__index_level_0__' column in every dataset.
    dataset = Dataset.from_pandas(dataframe, preserve_index=False)
    return dataset.map(tokenize_batch, batched=True)

# --------------------
# METRICS FUNCTION
# --------------------
def compute_metrics(eval_pred):
    """Compute classification metrics from an evaluation prediction.

    ``eval_pred`` is unpacked into ``(logits, labels)``; the predicted class is
    the argmax over the last axis of the logits.

    Returns:
        Dict with macro-averaged "f1", "precision" and "recall" plus "accuracy".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    def macro(scorer):
        # All three averaged metrics use the same macro averaging.
        return scorer(labels, predicted, average='macro')

    return {
        "f1": macro(f1_score),
        "accuracy": accuracy_score(labels, predicted),
        "precision": macro(precision_score),
        "recall": macro(recall_score),
    }

# --------------------
# TRAINING LOOP FOR SUBSET EXPERIMENTS
# --------------------
# Fine-tune every candidate in MODELS on the training subset and record its
# validation F1 / accuracy in `results` (one dict per model).
results = []
for model_name in MODELS:
    print(f"\n=== Training {model_name} on subset ===")
    # Hub ids may contain '/', which must not create nested output directories.
    run_tag = model_name.replace('/', '_')
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenization is model-specific, so the datasets are rebuilt per model.
    train_dataset = df_to_dataset(train_df, tokenizer)
    val_dataset = df_to_dataset(val_df, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    training_args = TrainingArguments(
        output_dir=f"./results/{run_tag}",
        eval_strategy="epoch",
        save_strategy="epoch",
        # Keep only the best (and, if different, the latest) checkpoint. Saving
        # every epoch for five models can exhaust the 20GB Kaggle working dir.
        save_total_limit=1,
        learning_rate=LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        seed=SEED,
        logging_dir=f"./logs/{run_tag}",
        logging_steps=50,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    # With load_best_model_at_end=True this scores the best checkpoint on val_dataset.
    metrics = trainer.evaluate()
    results.append({"model": model_name, "f1": metrics["eval_f1"], "accuracy": metrics["eval_accuracy"]})

    # Release this run's model and optimizer state before the next model is
    # loaded, so two models are not resident on the accelerator at once.
    del trainer, model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# --------------------
# SELECT BEST MODEL
# --------------------
results_df = pd.DataFrame(results)
print("\n=== Subset Results ===")
print(results_df)

# Order the candidates by F1, highest first, and take the model id of the top row.
ranked_results = results_df.sort_values(by="f1", ascending=False)
best_model_name = ranked_results['model'].iloc[0]
print(f"\nBest model based on F1: {best_model_name}")

# --------------------
# FULL DATA TRAINING
# --------------------
print("\n=== Training best model on full dataset ===")
tokenizer = AutoTokenizer.from_pretrained(best_model_name)
# Hub ids may contain '/', which must not create nested output directories.
final_tag = best_model_name.replace('/', '_')

# 80/20 stratified train/test split of the complete dataset.
# NOTE(review): df_subset (used for model selection) was drawn from the same `df`,
# so some of its rows may also fall in full_test_df.
full_train_df, full_test_df = train_test_split(df, train_size=0.8, stratify=df['label'], random_state=SEED)

# Hold out 10% of the training portion for per-epoch evaluation. The best
# checkpoint is chosen on this validation slice, so the test set is never used
# for checkpoint selection and its score stays an unbiased estimate.
full_fit_df, full_val_df = train_test_split(
    full_train_df,
    train_size=0.9,
    stratify=full_train_df['label'],
    random_state=SEED,
)
full_train_dataset = df_to_dataset(full_fit_df, tokenizer)
full_val_dataset = df_to_dataset(full_val_df, tokenizer)
full_test_dataset = df_to_dataset(full_test_df, tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(best_model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir=f"./final/{final_tag}",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep only the best (and, if different, the latest) checkpoint to limit disk use.
    save_total_limit=1,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    seed=SEED,
    logging_dir=f"./logs/final_{final_tag}",
    logging_steps=50,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_val_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
# Single evaluation on the untouched test split; metric keys keep the "eval_" prefix.
final_metrics = trainer.evaluate(eval_dataset=full_test_dataset)
print("\n=== Final Model Evaluation on Test Set ===")
print(final_metrics)

# --------------------
# INFERENCE ON 10 RANDOM TEST REVIEWS
# --------------------
print("\n=== Inference on 10 random samples ===")
sample_df = full_test_df.sample(10, random_state=SEED)
# Reuse the shared helper: same truncation and max-length padding as the training data.
sample_dataset = df_to_dataset(sample_df, tokenizer)

preds = trainer.predict(sample_dataset)
sample_logits = torch.tensor(preds.predictions)
pred_labels = np.argmax(preds.predictions, axis=-1)
# Softmax over the class axis turns the logits into per-class probabilities.
pred_probs = torch.softmax(sample_logits, dim=-1).numpy()

# Walk rows, predicted labels and probability vectors in lockstep.
for row, predicted, class_probs in zip(sample_df.itertuples(), pred_labels, pred_probs):
    print(f"\nReview: {row.review[:300]}...")
    print(f"True Label: {row.label} | Predicted: {predicted} | Confidence: {class_probs[predicted]:.4f}")


Loading data...

=== Training bert-base-uncased on subset ===


Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.2804,0.276317,0.895781,0.895833,0.896637,0.895833
2,0.163,0.276997,0.904955,0.905,0.905762,0.905
3,0.0755,0.315596,0.907497,0.9075,0.907555,0.9075





=== Training roberta-base on subset ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.2248,0.244903,0.9225,0.9225,0.922511,0.9225
2,0.1695,0.24415,0.924155,0.924167,0.924432,0.924167
3,0.0997,0.25956,0.932489,0.9325,0.93277,0.9325





=== Training microsoft/deberta-v3-base on subset ===


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.202,0.177827,0.941656,0.941667,0.941981,0.941667
2,0.1445,0.162669,0.952498,0.9525,0.952562,0.9525
3,0.0612,0.187746,0.949165,0.949167,0.949228,0.949167





=== Training google/electra-base-discriminator on subset ===


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]