In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset/IMDB Dataset.csv


In [None]:
!pip install transformers datasets


In [36]:
df=pd.read_csv("/kaggle/input/imdb-dataset/IMDB Dataset.csv").rename(columns={"review": "text"})

**ROBERT**

In [37]:
import torch
import transformers
import pandas as pd
import numpy as np

from sklearn import model_selection, metrics

In [38]:
# Configuration settings
config = {
    "max_length": 360,
    "model_path": "FacebookAI/roberta-base",  # Use RoBERTa model
    "output_dir": "./roberta-model",
    "train_batch_size": 32,  # Adjusted batch size
    "valid_batch_size": 32,
    "learning_rate": 2e-5,  # Adjusted learning rate
    "epochs": 4,  # Increased epochs
    "debug": True,
}

In [39]:
# Dataset class
class TextDataset:
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        enc = self.tokenizer.encode_plus(
            row["text"],
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "label": torch.tensor(row["label"]),
        }


In [40]:
id2label = {0: "negative", 1: "positive"}
label2id = {label: id_ for id_, label in id2label.items()}

df["label"] = df["sentiment"].map(label2id)

if config["debug"]:
    print("DEBUG MODE!")
    df = df.sample(1000, random_state=123)

print(df.shape)
df.head()

DEBUG MODE!
(1000, 3)


Unnamed: 0,text,sentiment,label
11872,"This movie was beyond awful, it was a pimple o...",negative,0
40828,As of this writing John Carpenter's 'Halloween...,positive,1
36400,I must admit a slight disappointment with this...,positive,1
5166,Oh dear! The BBC is not about to be knocked of...,negative,0
30273,its a totally average film with a few semi-alr...,negative,0


In [41]:
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"])

In [42]:
train, valid = model_selection.train_test_split(
    df,
    test_size=0.2,
    random_state=123,
    shuffle=True,
    stratify=df["label"]
)

In [43]:
train_ds = TextDataset(train, tokenizer, config["max_length"])
valid_ds = TextDataset(valid, tokenizer, config["max_length"])
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    config["model_path"],
    num_labels=len(id2label)
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Get the predicted labels from logits
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


In [45]:
training_args = transformers.TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["train_batch_size"],
    per_device_eval_batch_size=config["valid_batch_size"],
    learning_rate=config["learning_rate"],
    evaluation_strategy="steps",  # Evaluate every few steps
    eval_steps=500,  # Adjust the number of steps for evaluation
    save_strategy="steps",  # Save model every few steps
    save_steps=500,  # Ensure it aligns with eval_steps
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",
    logging_steps=100,
)

In [46]:
# Initialize Trainer
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
)


In [47]:
trainer.train()
trainer.evaluate()

Step,Training Loss,Validation Loss


{'eval_loss': 0.3309495449066162,
 'eval_accuracy': 0.875,
 'eval_f1': 0.8768472906403942,
 'eval_precision': 0.8725490196078431,
 'eval_recall': 0.8811881188118812,
 'eval_runtime': 2.6401,
 'eval_samples_per_second': 75.756,
 'eval_steps_per_second': 2.651,
 'epoch': 4.0}

**BERT**

In [14]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

In [15]:
config = {
    "model_path": "bert-base-uncased",  # Use BERT model
    "output_dir": "./bert-model",  # Output directory for the model
    "train_batch_size": 8,  # Adjusted batch size
    "valid_batch_size": 8,
    "learning_rate": 2e-5,  # Learning rate
    "epochs": 3,  # Number of epochs
    "max_length": 360,
    "debug": True,
}

In [16]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
df = pd.read_csv("/kaggle/input/imdb-dataset/IMDB Dataset.csv").rename(columns={"review": "text", "sentiment": "label"})
df['label'] = df['label'].apply(lambda x: 1 if x == "positive" else 0)

if config["debug"]:
    df = df.sample(1000, random_state=42) 

In [18]:
def tokenize_function(example):
    return tokenizer(example['text'], padding="max_length", truncation=True, max_length=config['max_length'])

In [19]:
#Dataset Conversion and Tokenization
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [20]:
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']


In [29]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Get the predicted labels from logits
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


In [30]:
training_args = TrainingArguments(
    output_dir=config['output_dir'],
    evaluation_strategy="epoch",
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['train_batch_size'],
    per_device_eval_batch_size=config['valid_batch_size'],
    num_train_epochs=config['epochs'],
    weight_decay=0.01,
)

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [32]:
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.480553,0.91,0.901099,0.953488,0.854167
2,No log,0.467462,0.905,0.898396,0.923077,0.875
3,No log,0.478899,0.91,0.90625,0.90625,0.90625


{'eval_loss': 0.4788985848426819,
 'eval_accuracy': 0.91,
 'eval_f1': 0.90625,
 'eval_precision': 0.90625,
 'eval_recall': 0.90625,
 'eval_runtime': 2.5331,
 'eval_samples_per_second': 78.955,
 'eval_steps_per_second': 9.869,
 'epoch': 3.0}