<a href="https://colab.research.google.com/github/uresha1995/Research-Methodology/blob/main/Assignment_2_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Install the libraries
# NOTE(review): this upgrade is unpinned, while a later cell uninstalls
# transformers and installs transformers==4.41.0 / datasets==2.18.0 /
# evaluate==0.4.1. Presumably the pinned versions are the intended ones —
# confirm, and consider pinning here so a fresh "Run All" uses one version.

!pip install --upgrade transformers datasets

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
import re
import random

In [None]:
#Download the dataset
# Loads the "imdb" dataset through datasets.load_dataset and prints the
# returned object so its structure is visible in the cell output.

dataset = load_dataset("imdb")
print(dataset)

In [None]:
# Print a few randomly chosen training reviews as a sanity check.
# The index range comes from the actual split length instead of a hard-coded
# 25000, so sampling stays valid if the size of the train split ever differs.

train_split = dataset["train"]
sample_count = min(5, len(train_split))  # random.sample raises if k > population
sample_indices = random.sample(range(len(train_split)), sample_count)
sample_texts = [train_split[idx]["text"] for idx in sample_indices]

# Show only the first 500 characters of each review to keep the output short.
for sample_number, text in enumerate(sample_texts, start=1):
    print(f"--- Sample {sample_number} ---\n{text[:500]}\n")

Preprocessing

In [None]:
#Remove empty and short reviews

def rem_empty(example):
    """Return True when the review has more than 20 characters after stripping.

    Used as a filter predicate: leading/trailing whitespace in
    ``example["text"]`` is ignored when measuring the length, so empty,
    whitespace-only and very short reviews yield False.
    """
    stripped_review = example["text"].strip()
    min_length_exclusive = 20
    return len(stripped_review) > min_length_exclusive

# Apply the length filter to both splits, replacing each split with its
# filtered version.
for split_name in ("train", "test"):
    dataset[split_name] = dataset[split_name].filter(rem_empty)

In [None]:
#Removing unnecessary spaces

def clean_text(example):
    """Strip leading and trailing whitespace from ``example["text"]``.

    The example is modified in place and the same object is returned, which
    is the shape ``dataset.map`` is called with below.
    """
    trimmed_review = example["text"].strip()
    example["text"] = trimmed_review
    return example

# Run the whitespace-stripping clean_text defined in this cell over the dataset.
# Note: a later cell redefines clean_text, so this line depends on cell order.
dataset = dataset.map(clean_text)

In [None]:
#Remove HTML break tags and replace with space

def clean_text(example):
    """Replace HTML line-break tags in ``example["text"]`` with a single space.

    Matches ``<br>``, ``<br/>`` and ``<br />`` (any whitespace before the
    optional slash). Matching is case-insensitive because HTML tag names are
    not case-sensitive, so variants such as ``<BR />`` are replaced as well.

    The example is updated in place and the same object is returned.
    """
    example["text"] = re.sub(r"<br\s*/?>", " ", example["text"], flags=re.IGNORECASE)
    return example

# Run the <br>-tag-replacing clean_text defined in this cell over the dataset.
# This definition shadows the earlier whitespace-stripping clean_text.
dataset = dataset.map(clean_text)

In [None]:
#Tokenization

from transformers import BertTokenizer

# Load the pretrained "bert-base-uncased" tokenizer; preprocess_fun below reads
# this module-level name.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def preprocess_fun(example):
    """Tokenize the "text" field with the module-level ``tokenizer``.

    This is passed to ``dataset.map(..., batched=True)``, so ``example`` is
    presumably a batch (its "text" value a list of reviews) rather than a
    single row — confirm. ``truncation=True`` is forwarded to the tokenizer
    so over-long inputs are truncated.

    Returns whatever the tokenizer call returns.
    """
    review_texts = example["text"]
    encoded = tokenizer(review_texts, truncation=True)
    return encoded

# Tokenize the dataset in batches (batched=True); the result is stored under a
# new name, so `dataset` still refers to the untokenized data.
tokenized_datasets = dataset.map(preprocess_fun, batched=True)

Prepare data for training

In [None]:
# Take fixed-size subsets of the existing train and test splits.
# Each split is shuffled with seed 42 and then cut to its first N rows:
# 2000 examples for training and 1000 for evaluation.

shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
train_data = shuffled_train.select(range(2000))

shuffled_test = tokenized_datasets["test"].shuffle(seed=42)
test_data = shuffled_test.select(range(1000))

In [None]:
#Loading BERT model
# Loads BertForSequenceClassification from the "bert-base-uncased" checkpoint
# with num_labels=2 (presumably the two sentiment classes — confirm against
# the dataset's label column).

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
# NOTE(review): transformers was already imported in earlier cells; removing
# the package here presumably does not affect modules already loaded in the
# running kernel — confirm whether a runtime restart is expected at this point.
!pip uninstall -y transformers

In [None]:
# Install pinned versions of transformers, datasets and evaluate.
# NOTE(review): datasets and transformers were imported before this cell, so
# the pinned versions presumably only take effect after a runtime restart —
# confirm, and consider moving this pin to the first install cell.
!pip install --quiet "transformers==4.41.0" "datasets==2.18.0" "evaluate==0.4.1"

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Directory for checkpoints and other model outputs.
    output_dir="./results",

    # Run evaluation at the end of every epoch. `eval_strategy` is the current
    # name of this option; the older `evaluation_strategy` spelling is
    # deprecated and is not accepted by newer transformers releases.
    eval_strategy="epoch",

    # Log training progress every 50 steps.
    logging_strategy="steps",
    logging_steps=50,

    # Standard fine-tuning learning rate.
    learning_rate=2e-5,

    # Per-device batch sizes for training and for evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Number of epochs.
    num_train_epochs=2,

    # Weight decay for regularization.
    weight_decay=0.01,

    # Where to store logs.
    logging_dir="./logs",

    # Save a checkpoint at the end of every epoch, on the same schedule as
    # evaluation, so the best checkpoint can be reloaded when training ends.
    save_strategy="epoch",
    load_best_model_at_end=True,

    # Do not report to any external experiment tracker.
    report_to="none",
)


In [None]:
from transformers import Trainer, DataCollatorWithPadding

# Collator that pads the tokenized examples when batches are assembled.
padding_collator = DataCollatorWithPadding(tokenizer)

# Wire the model, hyper-parameters and the train/eval subsets into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=padding_collator,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Start fine-tuning; as the last expression, the result is shown as cell output.
trainer.train()