In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
!pip install --upgrade transformers datasets accelerate evaluate -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.9/374.9 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset

# --------------------
# CONFIG
# --------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

MODELS = [
    "bert-base-uncased",
    "roberta-base",
    "microsoft/deberta-v3-base",
    "google/electra-base-discriminator",
    "distilbert-base-uncased"
]
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5

# --------------------
# LOAD DATA (Kaggle Path)
# --------------------
print("Loading data...")
data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(data_path)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# --------------------
# TRAIN/VAL/TEST SPLIT (Subset for model comparison)
# --------------------
# Use smaller subset (e.g. 4000) for faster runs if needed
df_subset, _ = train_test_split(df, train_size=8000, stratify=df['label'], random_state=SEED)
train_df, temp_df = train_test_split(df_subset, train_size=0.7, stratify=df_subset['label'], random_state=SEED)
val_df, test_df = train_test_split(temp_df, train_size=0.5, stratify=temp_df['label'], random_state=SEED)

# Convert to Dataset objects
def df_to_dataset(dataframe, tokenizer):
    return Dataset.from_pandas(dataframe).map(
        lambda x: tokenizer(x['review'], truncation=True, padding="max_length", max_length=MAX_LENGTH),
        batched=True
    )

# --------------------
# METRICS FUNCTION
# --------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, average='macro')
    rec = recall_score(labels, preds, average='macro')
    return {"f1": f1, "accuracy": acc, "precision": prec, "recall": rec}

# --------------------
# TRAINING LOOP FOR SUBSET EXPERIMENTS
# --------------------
results = []
for model_name in MODELS:
    print(f"\n=== Training {model_name} on subset ===")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = df_to_dataset(train_df, tokenizer)
    val_dataset = df_to_dataset(val_df, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    training_args = TrainingArguments(
        output_dir=f"./results/{model_name.replace('/', '_')}",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,   # avoid disk overflow
        learning_rate=LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        seed=SEED,
        logging_dir=f"./logs/{model_name.replace('/', '_')}",
        logging_steps=50,
        report_to="none",
        fp16=True,  # mixed precision for faster GPU training
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        processing_class=tokenizer,  # replaces deprecated tokenizer arg
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()
    results.append({"model": model_name, "f1": metrics["eval_f1"], "accuracy": metrics["eval_accuracy"]})

# --------------------
# SELECT BEST MODEL
# --------------------
results_df = pd.DataFrame(results)
print("\n=== Subset Results ===")
print(results_df)
best_model_name = results_df.sort_values(by="f1", ascending=False).iloc[0]['model']
print(f"\nBest model based on F1: {best_model_name}")

# --------------------
# FULL DATA TRAINING
# --------------------
print("\n=== Training best model on full dataset ===")
tokenizer = AutoTokenizer.from_pretrained(best_model_name)

full_train_df, full_test_df = train_test_split(df, train_size=0.8, stratify=df['label'], random_state=SEED)
full_train_dataset = df_to_dataset(full_train_df, tokenizer)
full_test_dataset = df_to_dataset(full_test_df, tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(best_model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir=f"./final/{best_model_name.replace('/', '_')}",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,   # limit checkpoints
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    seed=SEED,
    logging_dir=f"./logs/final_{best_model_name.replace('/', '_')}",
    logging_steps=50,
    report_to="none",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
final_metrics = trainer.evaluate()
print("\n=== Final Model Evaluation on Test Set ===")
print(final_metrics)

# --------------------
# INFERENCE ON 10 RANDOM TEST REVIEWS
# --------------------
print("\n=== Inference on 10 random samples ===")
sample_df = full_test_df.sample(10, random_state=SEED)
sample_dataset = Dataset.from_pandas(sample_df).map(
    lambda x: tokenizer(x['review'], truncation=True, padding="max_length", max_length=MAX_LENGTH),
    batched=True
)

preds = trainer.predict(sample_dataset)
pred_labels = np.argmax(preds.predictions, axis=-1)
pred_probs = torch.nn.functional.softmax(torch.tensor(preds.predictions), dim=-1).numpy()

for i, row in enumerate(sample_df.itertuples()):
    print(f"\nReview: {row.review[:300]}...")
    print(f"True Label: {row.label} | Predicted: {pred_labels[i]} | Confidence: {pred_probs[i][pred_labels[i]]:.4f}")


2025-09-07 16:06:00.153612: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757261160.468927      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757261160.562569      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading data...

=== Training bert-base-uncased on subset ===


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.2816,0.279087,0.892428,0.8925,0.893551,0.8925
2,0.1637,0.28191,0.903236,0.903333,0.904958,0.903333
3,0.0793,0.335243,0.914165,0.914167,0.914195,0.914167





=== Training roberta-base on subset ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.221,0.234234,0.92332,0.923333,0.923635,0.923333
2,0.1579,0.264354,0.927482,0.9275,0.927929,0.9275
3,0.1056,0.279251,0.932492,0.9325,0.932703,0.9325





=== Training microsoft/deberta-v3-base on subset ===


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.2188,0.176945,0.943333,0.943333,0.943353,0.943333
2,0.1444,0.164314,0.949999,0.95,0.950045,0.95
3,0.0711,0.213635,0.9475,0.9475,0.947501,0.9475





=== Training google/electra-base-discriminator on subset ===


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.2264,0.203169,0.932486,0.9325,0.932847,0.9325
2,0.1602,0.211562,0.940828,0.940833,0.940982,0.940833
3,0.1069,0.24603,0.934997,0.935,0.935077,0.935





=== Training distilbert-base-uncased on subset ===


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.3104,0.292334,0.886659,0.886667,0.886774,0.886667
2,0.2067,0.284143,0.889985,0.89,0.890212,0.89
3,0.128,0.32073,0.888263,0.888333,0.889307,0.888333





=== Subset Results ===
                               model        f1  accuracy
0                  bert-base-uncased  0.914165  0.914167
1                       roberta-base  0.932492  0.932500
2          microsoft/deberta-v3-base  0.949999  0.950000
3  google/electra-base-discriminator  0.940828  0.940833
4            distilbert-base-uncased  0.889985  0.890000

Best model based on F1: microsoft/deberta-v3-base

=== Training best model on full dataset ===




Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.1633,0.14301,0.943188,0.9432,0.943589,0.9432
2,0.116,0.213729,0.948699,0.9487,0.948733,0.9487
3,0.2425,0.697907,0.949596,0.9496,0.949733,0.9496





=== Final Model Evaluation on Test Set ===
{'eval_loss': 0.6979074478149414, 'eval_f1': 0.9495962721402875, 'eval_accuracy': 0.9496, 'eval_precision': 0.9497330490252236, 'eval_recall': 0.9496, 'eval_runtime': 128.9317, 'eval_samples_per_second': 77.56, 'eval_steps_per_second': 2.428, 'epoch': 3.0}

=== Inference on 10 random samples ===


Map:   0%|          | 0/10 [00:00<?, ? examples/s]




Review: This is loosely based on the ideas of the original 80's hit . It's set in the modern day as we see a base in Afghanistan get destroyed by a UAV right at the start.<br /><br />And that's exactly where the movie jumps the shark. UAV's aren't armed. They could be but I don't think it's ever been tried ...
True Label: 0 | Predicted: 0 | Confidence: 1.0000

Review: This movie was on British TV last night, and is wonderful! Strong women, great music (most of the time) and just makes you think. We do have stereotypes of what older people "ought" to do, and there are fantastic cameos of the "sensible but worried children". Getting near to my best movie ever !...
True Label: 1 | Predicted: 1 | Confidence: 1.0000

Review: What a dreadful movie. The effects were poor, especially by todays standards, but that was forgivable. What was unforgivable was the terrible rehashing of every flood/dam breaks disaster movie ever made into this piece of trash. The acting was awful and I mean AWFUL. T