In [None]:
# Attach Google Drive to the Colab runtime so its files are reachable by path
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# import necessary libraries
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

In [None]:
# Read the reviews CSV from the mounted Drive into a DataFrame
IMDB_CSV_PATH = '/content/drive/MyDrive/IMDB Dataset.csv'
df = pd.read_csv(IMDB_CSV_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Encode the sentiment strings as integer class ids in a new `label` column.
# An explicit mapping plus a check replaces "anything that is not 'positive' is
# negative": a missing value, stray whitespace or different capitalisation now
# raises here instead of being silently trained on as class 0.
label_by_sentiment = {'negative': 0, 'positive': 1}
unexpected = df.loc[~df['sentiment'].isin(list(label_by_sentiment)), 'sentiment']
if not unexpected.empty:
    raise ValueError(
        f"Unexpected sentiment values: {sorted(unexpected.astype(str).unique())}"
    )
df['label'] = df['sentiment'].map(label_by_sentiment).astype('int64')

In [None]:
# Preview again to check the new `label` column next to `sentiment`
df.head()

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. The filming tec...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [None]:
# Split into train (80%) and test (20%).
# Stratifying on `label` keeps the positive/negative ratio the same in both
# partitions, which a plain random split does not guarantee.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Convert to Hugging Face Dataset format.
# preserve_index=False stops the shuffled pandas index from being stored as an
# extra `__index_level_0__` column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize(batch):
    """Tokenize the `review` texts of one mapped batch.

    Args:
        batch: mapping with a `review` entry holding the texts to encode
            (a batch of rows when called via `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for those texts, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # No padding here: padding inside a batched map pads every row to the
    # longest review in its map batch and stores all of that padding in the
    # dataset. The DataCollatorWithPadding given to the Trainer pads each
    # training batch to its own longest sequence instead.
    return tokenizer(batch['review'], truncation=True)


train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Seed torch first: the classification head (`classifier.weight` / `classifier.bias`)
# is not part of the checkpoint and is randomly initialised, so without a fixed
# seed every fresh kernel starts training from different head weights.
torch.manual_seed(42)

# Pretrained BERT encoder with a 2-class sequence-classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Baseline fine-tuning configuration
training_args = TrainingArguments(
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Outputs and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none"
)

In [None]:
# Pads each batch to the length of its longest sequence at collation time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): `test_dataset` is also the evaluation set used during training
# (and therefore for best-checkpoint selection), so metrics reported on it
# afterwards are optimistic. Consider carving a validation split out of the
# training data and keeping the test split for the final report only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement
    processing_class=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the baseline configuration; the returned TrainOutput is displayed below
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1368,0.234882
2,0.0395,0.265511
3,0.0078,0.369726


TrainOutput(global_step=75, training_loss=0.05126793871323267, metrics={'train_runtime': 186.6912, 'train_samples_per_second': 6.428, 'train_steps_per_second': 0.402, 'total_flos': 349411481518080.0, 'train_loss': 0.05126793871323267, 'epoch': 3.0})

In [None]:
# Evaluate on the trainer's eval_dataset (the test split passed to Trainer above)
trainer.evaluate()

In [None]:
# `np` and `classification_report` come from the notebook's import cell; the
# original cell used `np` without importing it and failed with a NameError on a
# fresh kernel.

# Get predictions from the trained model on the test set
predictions = trainer.predict(test_dataset)

# `predictions.predictions` holds one score per class for every example;
# the arg-max over the class axis is the predicted label
predicted_labels = np.argmax(predictions.predictions, axis=1)

# True labels as returned by `predict`, so they are in the same order as the predictions
true_labels = predictions.label_ids

# Print the classification report
print(classification_report(true_labels, predicted_labels))

In [None]:
# Change parameters to improve the model

In [None]:
from transformers import EarlyStoppingCallback

# Tuned run: smaller learning rate and a larger epoch budget than the baseline
training_args = TrainingArguments(
    # Optimisation
    num_train_epochs=5,  # upper bound; early stopping may end the run sooner
    learning_rate=1e-5,  # lower learning rate
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep the checkpoint with the lowest evaluation loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Outputs and logging, kept separate from the baseline run's directories
    output_dir='./results_tuned',
    logging_dir='./logs_tuned',
    logging_steps=10,
    report_to="none"
)

In [None]:
# Early stopping: stop when eval_loss (the `metric_for_best_model` configured
# for this run) has not improved by more than 0.01 for 3 consecutive
# evaluations; evaluation runs once per epoch.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)

# Start again from the pretrained checkpoint. Reusing `model` from the baseline
# run would continue training weights that were already fine-tuned, so the two
# configurations would not be comparable.
torch.manual_seed(42)  # reproducible initialisation of the new classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement
    processing_class=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback]
)

In [None]:
# Run fine-tuning with the tuned configuration and the early-stopping callback
trainer.train()

In [None]:
# Evaluate the tuned run on the trainer's eval_dataset (the test split)
trainer.evaluate()

In [None]:

# `np` and `classification_report` come from the notebook's import cell; the
# original cell used `np` without importing it and failed with a NameError on a
# fresh kernel.

# Get predictions from the tuned model on the test set
predictions = trainer.predict(test_dataset)

# `predictions.predictions` holds one score per class for every example;
# the arg-max over the class axis is the predicted label
predicted_labels = np.argmax(predictions.predictions, axis=1)

# True labels as returned by `predict`, so they are in the same order as the predictions
true_labels = predictions.label_ids

# Print the classification report
print(classification_report(true_labels, predicted_labels))