# GPT-2 for Sentiment Analysis on IMDb movie reviews

## Table of Contents
1. [Introduction](#Introduction)
2. [Data exploration](#Data-Exploration)
3. [Zero Shot Classification](#Zero-shot-classification)

## Introduction

The [IMDb](https://ai.stanford.edu/~amaas/data/sentiment/) dataset is a binary sentiment classification dataset consisting of 50k labeled movie reviews (25k positive and 25k negative). The dataset is split into train and test sets containing 25k reviews each.

In this notebook, my goals are:
1. Understand and implement [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). Run GPT-2 on the IMDb classification task.
2. Fine-tune GPT-2 for sentiment classification in about 30 minutes or less on an 8GB Apple M2 MacBook Air (faster if you have an NVIDIA GPU).
3. Understand how [LoRA](https://arxiv.org/abs/2106.09685) is implemented and use it to fine-tune GPT-2 for sentiment classification.

## Data-Exploration
Get a summary of the dataset, i.e.:
1. Number of samples
2. Number of positive / negative samples
3. Length of the movie reviews



In [None]:
import pandas
from torch.utils.data import Dataset
from sentiment_classification.reviewsDataset import reviewsDataset


In [None]:
# Dataset exploration: load both IMDb splits with the same max_length.
# NOTE(review): max_length=10000 is presumably large enough that no review is
# truncated — confirm against reviewsDataset.
imdb_train = reviewsDataset(split="train", max_length=10000)
imdb_test = reviewsDataset(split="test", max_length=10000)


def format_data(dataset: Dataset) -> pandas.DataFrame:
    """Tabulate per-sample metadata for a dataset.

    Args:
        dataset: Iterable of samples; each sample must provide the keys
            ``"input_ids"``, ``"label"`` and ``"fpath"``.

    Returns:
        A frame with one row per sample and three columns: ``input_ids``
        (the *length* of the sample's ``input_ids``, not the ids themselves),
        ``label`` and ``filename`` (the sample's ``fpath``).
    """
    rows = [
        {
            "input_ids": len(sample["input_ids"]),
            "label": sample["label"],
            "filename": sample["fpath"],
        }
        for sample in dataset
    ]
    return pandas.DataFrame(rows)

# Build one metadata frame per split (token count, label, file path per review).
train_data = format_data(imdb_train)
test_data = format_data(imdb_test)


*Summary statistics of the dataset*

In [None]:
def summary(data: pandas.DataFrame) -> None:
    """Print review counts and review-length statistics for one split.

    Args:
        data: Frame with a ``label`` column (1 is counted as positive, 0 as
            negative) and an ``input_ids`` column holding each review's
            length.
    """
    print(f"Number of reviews: {len(data)}")

    # Counting the True entries of the mask equals counting the selected rows.
    labels = data["label"]
    print(f"Positive Reviews: {(labels == 1).sum()}")
    print(f"Negative Reviews: {(labels == 0).sum()}")

    lengths = data["input_ids"]
    longest, shortest = lengths.max(), lengths.min()
    print(f"Max Review Length: {longest}")
    print(f"Min Review Length: {shortest}")

    middle, average = lengths.median(), lengths.mean()
    print(f"Median Review Length: {middle}")
    print(f"Mean Review Length: {average}")

# Print the same statistics for each split under its own heading.
print("Train\n--------------")
summary(train_data)
print("Test\n---------------")
summary(test_data)

*Length of reviews (measured by the number of tokens)*

In [None]:
from matplotlib import pyplot as plt
def plot_hist(title: str, df: pandas.DataFrame) -> None:
    """Draw a 100-bin histogram of the ``input_ids`` column on a new figure.

    Args:
        title: Text used as the figure title.
        df: Frame whose ``input_ids`` column holds the values to plot
            (per-review token counts when built by ``format_data``).
    """
    token_counts = df["input_ids"]

    # Open a fresh figure so consecutive calls do not draw on the same axes.
    plt.figure()
    plt.hist(token_counts, bins=100)
    plt.title(f"{title}")
    plt.xlabel("No of tokens")
    plt.ylabel("Count")

# One histogram of review lengths per split.
plot_hist(title="Train Data", df=train_data)
plot_hist(title="Test Data", df=test_data)

In [None]:
# Test-split length distributions, separated by sentiment label
# (1 = positive, 0 = negative).
plot_hist(title="Positive Reviews Test", df=test_data[test_data["label"] == 1])
plot_hist(title="Negative Reviews Test", df=test_data[test_data["label"] == 0])

Run the test.py in `sentiment_classification` and write the results to a file

In [None]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    return numerator / denominator if denominator else float("nan")


def get_metrics_by_bin(results, bins, threshold=0.5):
    """Print precision, recall and accuracy overall and per review-length bin.

    A row counts as an actual positive when ``label >= threshold`` and as a
    predicted positive when ``prediction >= threshold``; rows strictly below
    the threshold are negatives. The same rule is used for the overall and the
    per-bin confusion counts.

    Args:
        results: DataFrame with ``label``, ``prediction`` and ``length``
            columns. It is not modified.
        bins: Bin edges handed to ``pandas.cut`` to bucket ``length``
            (e.g. ``range(0, 1500, 128)``). Rows whose length falls outside
            the edges are left out of the per-bin table.
        threshold: Decision boundary applied to both labels and predictions.

    Returns:
        None. The metrics are printed to stdout; ratios with a zero
        denominator are reported as NaN instead of raising.
    """
    actual_pos = results["label"] >= threshold
    actual_neg = results["label"] < threshold
    predicted_pos = results["prediction"] >= threshold
    predicted_neg = results["prediction"] < threshold

    # One 0/1 indicator column per confusion-matrix cell, aligned with results.
    confusion = pandas.DataFrame(
        {
            "TP": actual_pos & predicted_pos,
            "FP": actual_neg & predicted_pos,
            "FN": actual_pos & predicted_neg,
            "TN": actual_neg & predicted_neg,
        }
    ).astype(int)

    TP, FP, FN, TN = (int(confusion[cell].sum()) for cell in ("TP", "FP", "FN", "TN"))

    print("Metrics")
    print(
        f"Precision: {_safe_ratio(TP, TP + FP)}\n"
        f"Recall: {_safe_ratio(TP, TP + FN)}\n"
        f"Accuracy: {_safe_ratio(TP + TN, len(results))}"
    )

    # Bucket by length using the caller's bin edges. Grouping by a separate
    # Series (rather than a new column) keeps the caller's frame untouched.
    length_bin = pandas.cut(results["length"], bins).rename("bin")
    metrics_by_bin = confusion.groupby(length_bin, observed=False).sum()

    total = metrics_by_bin[["TP", "TN", "FP", "FN"]].sum(axis=1)
    # Series division yields NaN (not an exception) for empty bins.
    metrics_by_bin["accuracy"] = (metrics_by_bin["TP"] + metrics_by_bin["TN"]) / total
    metrics_by_bin["precision"] = metrics_by_bin["TP"] / (metrics_by_bin["TP"] + metrics_by_bin["FP"])
    metrics_by_bin["recall"] = metrics_by_bin["TP"] / (metrics_by_bin["TP"] + metrics_by_bin["FN"])

    print("Metrics by bin")
    try:
        table = metrics_by_bin.to_markdown()
    except ImportError:
        # to_markdown needs the optional `tabulate` package; fall back to the
        # plain-text rendering rather than losing the report.
        table = metrics_by_bin.to_string()
    print(table)

Predict the next word given the following prompt
 
'''
Review: The movie was awesome. Sentiment: Positive. 
Review: The performances were disappointing. Sentiment: Negative. 
Review: {review} Sentiment:
'''
I calculate the probabilities of the words " Positive" and " Negative" as the next token and classify the review according to whichever probability is greater.

**Run evaluation for the zero shot approach**

In [None]:
import torch
from sentiment_classification.reviewsDataset import reviewsDataset
from sentiment_classification.eval import Eval
from sentiment_classification.eval_config import EvalConfig
from gpt_config import GPTConfig

# Seed PyTorch's global RNG so random operations in later cells
# (e.g. torch.utils.data.random_split) are repeatable.
torch.manual_seed(1336)

In [None]:
# Zero-shot run: LoRA is disabled and no checkpoint is passed to GPTConfig.
model_config = GPTConfig(block_size=128, use_lora=False)
eval_config = EvalConfig(results_path="zero_shot_128.txt", subset=False)
test_set = reviewsDataset(split="test")

# "zero_shot_128.txt" is the file the following cell loads with read_csv.
evaluator = Eval(
    test_set=test_set,
    eval_config=eval_config,
    model_config=model_config,
)
evaluator.evaluate()

In [None]:
# Load the zero-shot results and report metrics overall and per length bin.
res_file = pandas.read_csv("zero_shot_128.txt")
bins = range(0, 1500, 128)
get_metrics_by_bin(res_file, bins, threshold=0.5)

**Finetuning without LoRA**

In [None]:
import torch
from sentiment_classification.train import Trainer
from sentiment_classification.train_config import TrainConfig
from gpt_config import GPTConfig
from sentiment_classification.reviewsDataset import reviewsDataset

In [None]:
# Full fine-tuning with LoRA disabled; init_from="resume" and the checkpoint
# name are forwarded to TrainConfig as given.
train_config = TrainConfig(
    out_dir="run",
    init_from="resume",
    checkpoint_name="finetune_no_lora.ckpt",
)
model_config = GPTConfig(use_lora=False)

# Load the training reviews with max_length equal to the model's block size,
# then split them 85% / 15% into training and validation subsets.
rd = reviewsDataset(split="train", max_length=model_config.block_size)
train_set, val_set = torch.utils.data.random_split(rd, [0.85, 0.15])

trainer = Trainer(train_set, val_set, train_config, model_config)
trainer.train()

**Run eval using the fine-tuned model**

In [None]:
# Evaluate the fully fine-tuned model: GPTConfig is told to load the
# checkpoint at run/finetune_no_lora.ckpt.
model_config = GPTConfig(
    block_size=128,
    use_lora=False,
    load_from_checkpoint=True,
    checkpoint_path="run/finetune_no_lora.ckpt",
)
eval_config = EvalConfig(results_path="finetuned_no_lora.txt", subset=False)
test_set = reviewsDataset(split="test")

evaluator = Eval(
    test_set=test_set,
    eval_config=eval_config,
    model_config=model_config,
)
evaluator.evaluate()

**Test the performance of the fine-tuned model**

In [None]:
# Load the fine-tuned (no LoRA) results and report metrics overall and per
# length bin.
res_file = pandas.read_csv("finetuned_no_lora.txt")
bins = range(0, 1500, 128)
get_metrics_by_bin(res_file, bins, threshold=0.5)

**Run training using LoRA**

In [None]:
# LoRA fine-tuning: rank r=8, block size 128, capped at 2000 iterations.
train_config = TrainConfig(
    out_dir="run",
    checkpoint_name="finetune_lora.ckpt",
    always_save_checkpoint=False,
    max_iters=2000,
)
model_config = GPTConfig(block_size=128, use_lora=True, r=8)

# Load the training reviews with max_length equal to the model's block size,
# then split them 85% / 15% into training and validation subsets.
rd = reviewsDataset(split="train", max_length=model_config.block_size)
train_set, val_set = torch.utils.data.random_split(rd, [0.85, 0.15])

trainer = Trainer(train_set, val_set, train_config, model_config)
trainer.train()

**Evaluate using the LoRA finetuned model**

In [None]:
# Evaluate the LoRA fine-tuned model from run/finetune_lora.ckpt.
# NOTE(review): block_size and r are not passed here, so GPTConfig's defaults
# apply — confirm they match the values used for training.
model_config = GPTConfig(
    use_lora=True,
    load_from_checkpoint=True,
    checkpoint_path="run/finetune_lora.ckpt",
)
eval_config = EvalConfig(results_path="finetuned_lora.txt")
test_set = reviewsDataset(split="test")

evaluator = Eval(
    test_set=test_set,
    eval_config=eval_config,
    model_config=model_config,
)
evaluator.evaluate()

**Test the performance of the LoRA finetuned model**

In [None]:
# Load the LoRA evaluation results and report metrics overall and per length
# bin. The file name matches the results_path ("finetuned_lora.txt") that the
# LoRA evaluation cell passes to EvalConfig.
res_file = pandas.read_csv("finetuned_lora.txt")
bins = range(0, 1500, 128)
get_metrics_by_bin(res_file, bins, threshold=0.5)