In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [None]:
import gc
import os
import random

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import f1_score, accuracy_score
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed,
    pipeline
)

2025-09-05 15:04:53.881708: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757084694.209172      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757084694.309740      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:

# Global seed; also reused as `random_state` for the pandas shuffles/samples
# in the cells below.
seed = 42
# transformers.set_seed — presumably seeds the Python/NumPy/PyTorch RNGs;
# see the transformers documentation for the exact scope.
set_seed(seed)
# Number of reviews sampled for the quick model-comparison run.
subset_size = 3000
# Per-device batch size, used for both training and evaluation.
batch_size = 16
# Training epochs for each candidate during the comparison run.
compare_epochs = 2
# Training epochs for the final run of the winning model on the full data.
full_epochs = 2

# Checkpoint names passed to `from_pretrained` for each candidate model.
models_to_try = [
"bert-base-uncased",
"roberta-base",
"google/electra-small-discriminator",
"distilbert-base-uncased"
]

In [None]:

def load_imdb(csv_paths=(
    "IMDB Dataset.csv",
    "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv",
)):
    """Load the IMDB reviews as a DataFrame with ``review`` and ``label`` columns.

    The first existing file in ``csv_paths`` is read; it must contain a
    ``review`` column and a ``sentiment`` column holding "negative" or
    "positive", which are mapped to the labels 0 and 1. Rows are returned in
    file order. The defaults cover the working directory and the Kaggle input
    location of the dataset.

    When none of the candidate files exists, the "imdb" dataset is fetched
    with ``load_dataset``; its train and test splits are concatenated and
    shuffled with the global ``seed``.

    Args:
        csv_paths: Candidate CSV locations, tried in order.

    Returns:
        pandas.DataFrame with exactly the columns ``review`` and ``label``,
        regardless of which source was used.

    Raises:
        ValueError: If the CSV contains a sentiment value other than
            "negative" or "positive".
    """
    sentiment_to_label = {"negative": 0, "positive": 1}

    for path in csv_paths:
        if not os.path.isfile(path):
            continue
        raw = pd.read_csv(path)
        labels = raw["sentiment"].map(sentiment_to_label)
        # `map` turns unknown sentiments into NaN; fail loudly here instead of
        # letting NaN labels reach the training loop.
        if labels.isna().any():
            unknown = sorted(raw.loc[labels.isna(), "sentiment"].astype(str).unique())
            raise ValueError(f"Unexpected sentiment values in {path!r}: {unknown}")
        # Keep only the two columns the fallback branch produces, so callers
        # see the same schema whichever source was used.
        return raw.assign(label=labels.astype(int))[["review", "label"]]

    ds = load_dataset("imdb")
    df = pd.concat([
        pd.DataFrame({"review": ds["train"]["text"], "label": ds["train"]["label"]}),
        pd.DataFrame({"review": ds["test"]["text"], "label": ds["test"]["label"]})
    ]).sample(frac=1, random_state=seed).reset_index(drop=True)
    return df

In [None]:

def make_dataset(df, n):
    """Sample ``n`` rows from ``df`` and split them into train/validation/test.

    The rows are drawn with the global ``seed`` as ``random_state``. The first
    ``int(0.8*n)`` sampled rows form the train split, the next ``int(0.1*n)``
    the validation split, and whatever remains the test split.

    Returns:
        DatasetDict with the keys "train", "validation" and "test".
    """
    sampled = df.sample(n=n, random_state=seed).reset_index(drop=True)
    train_stop = int(0.8*n)
    val_stop = train_stop + int(0.1*n)
    frames = {
        "train": sampled.iloc[:train_stop],
        "validation": sampled.iloc[train_stop:val_stop],
        "test": sampled.iloc[val_stop:],
    }
    return DatasetDict({
        split: Dataset.from_pandas(frame) for split, frame in frames.items()
    })


In [None]:

def tokenize_with_progress(ds, tokenizer):
    """Tokenize the ``review`` column of ``ds`` via a batched ``ds.map`` call.

    Each batch of reviews is passed to ``tokenizer`` with ``truncation=True``.
    The raw ``review`` column is removed from the result, and the map call is
    labelled "Tokenizing dataset".

    Returns:
        Whatever ``ds.map`` returns for the tokenized data.
    """
    def _encode(batch):
        return tokenizer(batch["review"], truncation=True)

    map_options = {
        "batched": True,
        "remove_columns": ["review"],
        "desc": "Tokenizing dataset",
    }
    return ds.map(_encode, **map_options)

# Custom metric
def compute_metrics(pred):
    """Compute F1 and accuracy for a prediction object.

    ``pred`` must expose ``predictions`` (scores, with classes along axis 1)
    and ``label_ids`` (reference labels). The predicted class is the argmax
    over axis 1. Both scores use the metric functions' default settings.

    Returns:
        dict with the keys "f1" and "accuracy".
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    scores = {}
    scores["f1"] = f1_score(y_true, y_pred)
    scores["accuracy"] = accuracy_score(y_true, y_pred)
    return scores

In [None]:

# Load every review, then carve out the small train/validation/test subset
# (subset_size rows) used to compare the candidate models.
df_all = load_imdb()
ds_small = make_dataset(df_all, subset_size)
# The comparison loop appends one {"model", "f1"} record per candidate here.
results = []

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Fine-tune every candidate checkpoint on the small subset and keep the one
# with the best validation F1.
# `results` is reset here so that re-running this cell does not accumulate
# duplicate entries from an earlier run.
results = []
for model_name in models_to_try:
    print(f"Training {model_name}...")
    # The classification head is randomly initialised by from_pretrained, so
    # re-seed per candidate: otherwise each model's starting point depends on
    # how many models were trained before it.
    set_seed(seed)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized = tokenize_with_progress(ds_small, tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./{model_name}-compare",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=compare_epochs,
            logging_steps=10,              # log more frequently
            logging_first_step=True,       # log the very first step
            report_to="none",              # disable W&B/MLflow noise
            save_total_limit=1,
            seed=seed,                     # use the notebook seed explicitly
        ),
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )
    trainer.train()
    eval_metrics = trainer.evaluate(tokenized["validation"])
    results.append({"model": model_name, "f1": eval_metrics["eval_f1"]})

    # The Trainer keeps its own reference to the model, so deleting `model`
    # alone frees nothing. Drop both, collect any reference cycles, and only
    # then release the cached GPU memory before the next candidate loads.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

best_model = max(results, key=lambda x: x["f1"])["model"]
print("Best model:", best_model)

Training bert-base-uncased...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
1,0.6669
10,0.6775
20,0.5612
30,0.3296
40,0.2939
50,0.284
60,0.2355
70,0.4423
80,0.2249
90,0.199




Training roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizing dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
1,0.7302
10,0.6777
20,0.4459
30,0.2714
40,0.2484
50,0.3057
60,0.2096
70,0.4133
80,0.1497
90,0.1685




Training google/electra-small-discriminator...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokenizing dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Step,Training Loss
1,0.691
10,0.6919
20,0.6882
30,0.6792
40,0.6338
50,0.5814
60,0.5508
70,0.5367
80,0.4683
90,0.4544




Training distilbert-base-uncased...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
1,0.6784
10,0.6911
20,0.5431
30,0.3303
40,0.3307
50,0.3884
60,0.3154
70,0.4828
80,0.2609
90,0.2158




Best model: roberta-base


In [None]:

# Final run: retrain the winning checkpoint on the full dataset, split
# 80/10/10 into train/validation/test after a seeded shuffle.
# `df` and `val_end` are reused by the inference cell to pick test reviews.
df = df_all.sample(frac=1, random_state=seed).reset_index(drop=True)
n = len(df)
train_end, val_end = int(0.8*n), int(0.9*n)
full_ds = DatasetDict({
    "train": Dataset.from_pandas(df.iloc[:train_end]),
    "validation": Dataset.from_pandas(df.iloc[train_end:val_end]),
    "test": Dataset.from_pandas(df.iloc[val_end:])
})

# Re-seed before the model is built: from_pretrained randomly initialises the
# classification head, and without this the result would depend on the RNG
# state left behind by whatever ran earlier in the kernel.
set_seed(seed)
tokenizer = AutoTokenizer.from_pretrained(best_model)
full_tokenized = tokenize_with_progress(full_ds, tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(best_model, num_labels=2)

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=f"./{best_model}-full",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=full_epochs,
        logging_steps=10,
        logging_first_step=True,
        report_to="none",
        save_total_limit=1,
        seed=seed,                      # use the notebook seed explicitly
    ),
    train_dataset=full_tokenized["train"],
    eval_dataset=full_tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)

trainer.train()
# The held-out test split is only touched here, after training has finished.
print("Final evaluation:", trainer.evaluate(full_tokenized["test"]))

Tokenizing dataset:   0%|          | 0/40000 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
1,0.7242
10,0.7191
20,0.4344
30,0.3718
40,0.266
50,0.4351
60,0.27
70,0.3352
80,0.2593
90,0.3077




Final evaluation: {'eval_loss': 0.1690092533826828, 'eval_f1': 0.9538277033779733, 'eval_accuracy': 0.9538, 'eval_runtime': 85.6038, 'eval_samples_per_second': 58.409, 'eval_steps_per_second': 1.834, 'epoch': 2.0}


In [None]:

# Qualitative spot check: classify a few reviews from the held-out test slice
# with the fine-tuned model and print them next to their true labels.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
sample = df.iloc[val_end:].sample(10, random_state=seed)

print("\nRunning inference on 10 samples...\n")
for _, row in tqdm(sample.iterrows(), total=len(sample), desc="Inference"):
    # Pass the whole review and let the tokenizer truncate it, exactly as was
    # done for training. Slicing to a fixed number of characters instead
    # discards text the model could still have used.
    pred = pipe(row["review"], truncation=True)
    # Only the printed preview is shortened; the prediction used the full text.
    print("Review:", row["review"][:200].replace("\n", " "))
    print("True:", row["label"], "Pred:", pred, "\n")

Device set to use cuda:0



Running inference on 10 samples...



Inference:   0%|          | 0/10 [00:00<?, ?it/s]

Review: Had this been the original 1914 version of TESS OF THE STORM COUNTRY (also starring Mary Pickford), I probably would have rated it a lot higher, as this sort of extreme melodrama and sentimentality wa
True: 1 Pred: [{'label': 'LABEL_1', 'score': 0.9936874508857727}] 

Review: If you have trouble suspending disbelief then this isn't for you. Consider: a woman already in late middle age finds a newborn baby in a cabbage patch and raises it as her own. Think about it; she mak
True: 1 Pred: [{'label': 'LABEL_1', 'score': 0.8096235394477844}] 

Review: This movie was packed pull of endless surprises! Just when you thought it couldn't get worse, they added more joints and more pink fuzzy-lined vans with raunchy sex scenes. As you can guess, I was a v
True: 0 Pred: [{'label': 'LABEL_1', 'score': 0.8809909224510193}] 

Review: San Francisco is a big city with great acting credits. In this one, the filmmakers made no attempt to use the city. They didn't even manage the most basic of rea