# Lab HW 4

# Question
## Train an RNN and an LSTM model for two different tasks:
  - ## Task 1: Language Modeling
  - ## Task 2: Sentiment Analysis

## Compare the performance of RNN and LSTM models for each task using suitable evaluation metrics.
## For example, compare the perplexity values in case of language modeling; and accuracy, F1 score for sentiment analysis.

# Language Modeling Task

- Language modeling means using the various statistical/probabilistic techniques to determine the probability of a given sequence of words occurring in a sentence.

- For this, I have used the [Movie Plots](https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots) dataset, taken directly from Wikipedia. However, the dataset was too large, so I have used only 15 randomly selected movies for training and testing the RNN and LSTM models.

In [None]:
# importing all the required modules and libraries

from transformers import BertTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import zipfile
import pandas as pd
from random import sample

In [None]:
# importing the movie plots dataset
!kaggle datasets download -d jrobischon/wikipedia-movie-plots

In [None]:
# Unpack the downloaded Kaggle archive, then load the movie-plots CSV
directory_to_extract_to = "/content/"
path_to_zip_file = "/content/wikipedia-movie-plots.zip"

with zipfile.ZipFile(path_to_zip_file, mode="r") as archive:
    archive.extractall(path=directory_to_extract_to)

df = pd.read_csv("/content/wiki_movie_plots_deduped.csv")
# Print a summary of the loaded frame
df.info()

In [None]:
# Select 15 random movie plots: the first 12 sampled plots form the training
# text and the remaining 3 form the test text.
random_seq = df["Plot"].sample(15)

# The sampled Series keeps its original integer index labels, so use .iloc to
# make it explicit that the split is by position and not by label.
train_sen, test_sen = random_seq.iloc[:12], random_seq.iloc[12:]

# Concatenate the plots into one string per split, each plot followed by a
# single space. str.join builds the result in one pass instead of growing the
# string with repeated += inside a loop.
train_txt = "".join(plot + " " for plot in train_sen)
test_txt = "".join(plot + " " for plot in test_sen)

In [None]:
# Load the pretrained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split the train and test corpora into tokens (train first, then test)
train_tokens, test_tokens = (
    tokenizer.tokenize(corpus) for corpus in (train_txt, test_txt)
)

# Map every token to its integer vocabulary ID
train_ids, test_ids = (
    tokenizer.convert_tokens_to_ids(tokens)
    for tokens in (train_tokens, test_tokens)
)

In [None]:
# Number of entries in the tokenizer's vocabulary; it is passed to the language
# models below as the vocabulary size (embedding rows and output logits).
vocab_size = len(tokenizer.vocab)
# Bare expression so the notebook shows the value as the cell output
vocab_size

In [None]:
# Custom Dataset subclass: overrides __getitem__ and __len__ to serve sliding
# windows over a flat token-ID sequence for next-token prediction.
class Usage(Dataset):
    """Sliding-window dataset over a flat sequence of token IDs.

    Item ``idx`` is the pair
    ``(data[idx:idx + length], data[idx + 1:idx + length + 1])``: an input
    window and the same window shifted one position to the right, which is
    used as the prediction target.

    Args:
        data: Sequence of integer token IDs (converted to a ``torch.long``
            tensor).
        length: Number of tokens in each window.
    """

    def __init__(self, data, length):
        self.data = torch.tensor(data, dtype=torch.long)
        self.length = length

    def __getitem__(self, idx):
        stop = idx + self.length
        return self.data[idx:stop], self.data[idx + 1:stop + 1]

    def __len__(self):
        # Clamp at zero: with fewer than length + 1 tokens there is no complete
        # (input, target) pair, and a negative value would make len() raise.
        return max(len(self.data) - self.length, 0)

# Window length (tokens per sequence) and number of sequences per batch
seq_len = 30
batch_size = 64

# Shuffled loader over the training windows
train_dataset = Usage(train_ids, seq_len)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Model classes for the language-modeling task
class RNN_LM(nn.Module):
    """Embedding -> stacked RNN -> linear projection onto the vocabulary.

    Args:
        vsize: Vocabulary size (number of embedding rows and of output logits
            per time step).
        embedding_size: Dimension of each token embedding.
        hidden_size: Dimension of the RNN hidden state.
        layers: Number of stacked RNN layers.
    """

    def __init__(self, vsize, embedding_size, hidden_size, layers):
        super().__init__()
        self.embedding = nn.Embedding(vsize, embedding_size)
        # The RNN input size must be the constructor's embedding_size so it
        # always matches the embedding layer (not a module-level global).
        self.rnn = nn.RNN(embedding_size, hidden_size, layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vsize)

    def forward(self, x, h):
        """Return per-step vocabulary logits and the final hidden state.

        Args:
            x: Batch-first tensor of token IDs.
            h: Initial hidden state, or ``None`` to start from zeros.

        Returns:
            ``(logits, h)`` where ``logits`` has one ``vsize``-wide vector per
            input position and ``h`` is the RNN's final hidden state.
        """
        embedded = self.embedding(x)
        out, h = self.rnn(embedded, h)
        return self.fc(out), h

class LSTM_LM(nn.Module):
    """Embedding -> stacked LSTM -> linear projection onto the vocabulary.

    Args:
        vsize: Vocabulary size (number of embedding rows and of output logits
            per time step).
        embedding_size: Dimension of each token embedding.
        hidden_size: Dimension of the LSTM hidden and cell states.
        layers: Number of stacked LSTM layers.
    """

    def __init__(self, vsize, embedding_size, hidden_size, layers):
        super().__init__()
        self.embedding = nn.Embedding(vsize, embedding_size)
        # The LSTM input size must be the constructor's embedding_size so it
        # always matches the embedding layer (not a module-level global).
        self.lstm = nn.LSTM(embedding_size, hidden_size, layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vsize)

    def forward(self, x, h):
        """Return per-step vocabulary logits and the final LSTM state.

        Args:
            x: Batch-first tensor of token IDs.
            h: Initial ``(hidden, cell)`` state tuple, or ``None`` to start
                from zeros.

        Returns:
            ``(logits, (h, c))`` where ``logits`` has one ``vsize``-wide vector
            per input position and ``(h, c)`` is the final hidden/cell state.
        """
        embedded = self.embedding(x)
        out, (h, c) = self.lstm(embedded, h)
        return self.fc(out), (h, c)

In [None]:
def train(model, dataloader, vsize, n_epochs=4, learning_rate= 0.0004):
    """Train a language model with Adam and cross-entropy loss.

    Prints the loss after every step and the mean batch loss after every
    epoch. The model is updated in place; nothing is returned.

    Args:
        model: Module called as ``model(input_data, None)`` and returning
            ``(logits, state)``.
        dataloader: Sized iterable of ``(input_data, target_data)`` batches.
        vsize: Vocabulary size, i.e. the width of the last logits dimension.
        n_epochs: Number of passes over ``dataloader``.
        learning_rate: Adam learning rate.
    """
    loss_criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()

    for epoch in range(n_epochs):
        total_loss = 0

        for batch, (input_data, target_data) in enumerate(dataloader):
            optimizer.zero_grad()

            # Every batch starts from a fresh (None) hidden state.
            output, _ = model(input_data, None)

            # Flatten to (positions, vsize) vs (positions,) for the loss.
            # Use the vsize argument rather than a module-level global so the
            # function works for whatever vocabulary size the caller passes.
            loss = loss_criterion(output.reshape(-1, vsize), target_data.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            print(f'Epoch [{epoch+1}/{n_epochs}], Step [{batch}/{len(dataloader)}], Loss: {loss.item():.4f}')

        print(f'Epoch [{epoch+1}/{n_epochs}], Average Loss till now: {total_loss / len(dataloader):.4f}')


def find_perplexity(model, data_loader, vocab_size):
    """Return the perplexity of ``model`` over ``data_loader``.

    Perplexity is ``exp`` of the mean cross-entropy per target token. The loss
    is summed over all tokens and divided by the token count, so a smaller
    final batch does not get more weight than the full-sized ones.

    Args:
        model: Module called as ``model(input_data, None)`` and returning
            ``(logits, state)``.
        data_loader: Iterable of ``(input_data, target_data)`` batches.
        vocab_size: Width of the last logits dimension.

    Returns:
        The perplexity as a NumPy float.

    Raises:
        ValueError: If ``data_loader`` yields no target tokens.
    """
    model.eval()
    # reduction='sum' so the batches can be combined with per-token weighting
    criterion = nn.CrossEntropyLoss(reduction='sum')
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for input_data, target_data in data_loader:
            output, _ = model(input_data, None)
            targets = target_data.reshape(-1)
            total_loss += criterion(output.reshape(-1, vocab_size), targets).item()
            total_tokens += targets.numel()

    if total_tokens == 0:
        raise ValueError("data_loader yielded no tokens; perplexity is undefined")

    return np.exp(total_loss / total_tokens)

In [None]:
# Held-out windows, served in order (no shuffling)
test_dataset = Usage(test_ids, seq_len)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Hyperparameters passed to both language models below
embed_size = 128
hidden_size = 256
layers = 2

# Finding Perplexity for RNN Model: This will take a few minutes to run, as the plot of a particular movie on Wikipedia is generally very long.

In [None]:
# Train RNN Model
# (train runs with its default number of epochs and learning rate)
rnn_model = RNN_LM(vocab_size, embed_size, hidden_size, layers)
train(rnn_model, train_loader, vocab_size)

# Perplexity for RNN, measured on the test loader
rnn_perplexity = find_perplexity(rnn_model, test_loader, vocab_size)
print(f'\nRNN Perplexity:\n {rnn_perplexity:.4f}')

# Finding Perplexity for the LSTM model

In [None]:
# Train LSTM Model
# (same hyperparameters and training loader as the RNN model)
lstm_model = LSTM_LM(vocab_size, embed_size, hidden_size, layers)
train(lstm_model, train_loader, vocab_size)

# Perplexity for LSTM, measured on the test loader
lstm_perplexity = find_perplexity(lstm_model, test_loader, vocab_size)
print(f'LSTM Perplexity: {lstm_perplexity:.4f}')

# Results
- The RNN gave a lower perplexity than the LSTM on the language modeling task.
- The perplexity could be improved further by using the complete dataset instead of merely 15 movie plots, but that would have taken a very long time.

# Sentiment Analysis

- Here, we try to determine the sentiment expressed by different texts.
- For this, I have used the [sentiment analysis](https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset) dataset from kaggle.

In [None]:
# downloading the dataset from kaggle
!kaggle datasets download -d abhi8923shriv/sentiment-analysis-dataset

In [None]:
# Unzip the sentiment archive and load its train and test CSV files
path_to_zip_file = "/content/sentiment-analysis-dataset.zip"
directory_to_extract_to = "/content/"

with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

# Read the CSVs from the extraction directory so loading does not depend on
# the notebook's current working directory happening to be /content.
tottrain_dt = pd.read_csv(directory_to_extract_to + 'train.csv', encoding='latin1')
tottest_dt = pd.read_csv(directory_to_extract_to + 'test.csv', encoding='latin1', skip_blank_lines=True)

print(f"The lenth of the train set and test set are {len(tottrain_dt)} and {len(tottest_dt)}")

In [None]:
# Pull the raw text and sentiment columns out of the frames as NumPy arrays
train_input = tottrain_dt['text'].to_numpy()
train_target_ini = tottrain_dt['sentiment'].to_numpy()
test_input = tottest_dt['text'].to_numpy()
test_target_ini = tottest_dt['sentiment'].to_numpy()

# Integer class for each sentiment string; missing sentiments count as neutral
map_label = {'positive': 2, 'negative': 0, 'neutral': 1, np.nan: 1}

# Missing values are detected with pd.isna instead of a dict lookup: a NaN key
# only matches the very same NaN object, so a NaN produced elsewhere could
# raise KeyError. otypes fixes the output dtype so that an empty input array
# is handled as well.
map_fun = np.vectorize(
    lambda x: map_label[np.nan] if pd.isna(x) else map_label[x],
    otypes=[int],
)
train_target = map_fun(train_target_ini)
test_target = map_fun(test_target_ini)

In [None]:
from transformers import BertTokenizer

# Dataset wrapper that tokenizes one (text, label) pair per requested item
class Usage_SA(Dataset):
    """Serve tokenized texts together with their integer labels.

    Args:
        texts: Indexable collection of texts.
        labels: Indexable collection of integer labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments; its result is indexed by ``'input_ids'`` and
            ``'attention_mask'``.
        max_length: Length every encoded text is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw = self.texts[idx]
        # Anything that is not already a string is converted with str()
        sentence = raw if isinstance(raw, str) else str(raw)
        target = self.labels[idx]

        encoded = self.tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Flatten each returned tensor to 1-D
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Initialize the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokens per encoded text and number of texts per batch
max_length = 50
batch_size = 32

train_data = Usage_SA(train_input, train_target, tokenizer, max_length)
trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_data = Usage_SA(test_input, test_target, tokenizer, max_length)
# Evaluation metrics do not depend on sample order, so the test loader is not
# shuffled; this keeps evaluation passes deterministic.
testloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
# Model classes for the sentiment-analysis task, built on torch.nn
class RNN_SA(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier.

    Args:
        vsize: Vocabulary size (number of embedding rows).
        embedding_size: Dimension of each token embedding.
        hsize: Dimension of the RNN hidden state.
        osize: Number of output classes.
    """

    def __init__(self, vsize, embedding_size, hsize, osize):
        super().__init__()
        self.embedding = nn.Embedding(vsize, embedding_size)
        self.rnn = nn.RNN(embedding_size, hsize, batch_first=True)
        self.fc = nn.Linear(hsize, osize)

    def forward(self, input_ids, attention_mask):
        """Return class logits for each sequence in the batch.

        Args:
            input_ids: Batch-first tensor of token IDs.
            attention_mask: Tensor of the same shape with 1 for real tokens and
                0 for padding, or ``None`` to use the last time step.
                Assumes padding is on the right (1s followed by 0s).

        Returns:
            Tensor with one ``osize``-wide logit vector per sequence.
        """
        embedded = self.embedding(input_ids)
        rnn_out, _ = self.rnn(embedded)

        if attention_mask is None:
            final_hidden_state = rnn_out[:, -1, :]
        else:
            # Take the state at the last real token of each sequence, so the
            # representation is not the state reached after running over the
            # trailing padding positions.
            last_real = (attention_mask.long().sum(dim=1) - 1).clamp(min=0)
            rows = torch.arange(rnn_out.size(0), device=rnn_out.device)
            final_hidden_state = rnn_out[rows, last_real]

        return self.fc(final_hidden_state)


class LSTM_SA(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier.

    Args:
        vsize: Vocabulary size (number of embedding rows).
        embedding_size: Dimension of each token embedding.
        hsize: Dimension of the LSTM hidden state.
        osize: Number of output classes.
    """

    def __init__(self, vsize, embedding_size, hsize, osize):
        super().__init__()
        self.embedding = nn.Embedding(vsize, embedding_size)
        # The LSTM width must be the constructor's hsize so it always matches
        # the classifier's input size (not a module-level global).
        self.lstm = nn.LSTM(embedding_size, hsize, batch_first=True)
        self.fc = nn.Linear(hsize, osize)

    def forward(self, input_ids, attention_mask):
        """Return class logits for each sequence in the batch.

        Args:
            input_ids: Batch-first tensor of token IDs.
            attention_mask: Tensor of the same shape with 1 for real tokens and
                0 for padding, or ``None`` to use the last time step.
                Assumes padding is on the right (1s followed by 0s).

        Returns:
            Tensor with one ``osize``-wide logit vector per sequence.
        """
        embedded = self.embedding(input_ids)
        lstm_out, _ = self.lstm(embedded)

        if attention_mask is None:
            final_hidden_state = lstm_out[:, -1, :]
        else:
            # Take the state at the last real token of each sequence, so the
            # representation is not the state reached after running over the
            # trailing padding positions.
            last_real = (attention_mask.long().sum(dim=1) - 1).clamp(min=0)
            rows = torch.arange(lstm_out.size(0), device=lstm_out.device)
            final_hidden_state = lstm_out[rows, last_real]

        return self.fc(final_hidden_state)

In [None]:
# Training loop for the sentiment classifiers
def train(model, dataloader, num_epochs=4, lr=0.0001):
    """Train a sentiment classifier with Adam and cross-entropy loss.

    Prints the running mean loss after every step and the mean batch loss
    after every epoch. The model is updated in place; nothing is returned.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning class logits.
        dataloader: Sized iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        num_epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.
    """
    loss_criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        for num, batch in enumerate(dataloader):
            optimizer.zero_grad()

            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label']

            outputs = model(input_ids, attention_mask)
            loss = loss_criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # Running mean over the steps completed so far. Dividing by the
            # total number of batches here would under-report the loss for
            # every step but the last.
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step: [{num}/{len(dataloader)}], Loss: {total_loss / (num + 1):.4f}')

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}')


def evaluate_model(model, dataloader):
    """Score ``model`` on ``dataloader`` and return ``(accuracy, f1)``.

    The predicted class for each sample is the index of its largest logit.
    The F1 score uses ``average='weighted'``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids, mask, gold = (
                batch[key] for key in ('input_ids', 'attention_mask', 'label')
            )

            logits = model(ids, mask)
            # torch.max over dim=1 yields (values, indices); keep the indices
            batch_preds = torch.max(logits, dim=1)[1]

            predicted.extend(batch_preds.numpy())
            expected.extend(gold.numpy())

    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='weighted'),
    )

# Hyperparameters passed to both sentiment classifiers below
vocab_size = tokenizer.vocab_size
embed_size, hidden_size = 128, 256
# One output per sentiment class
output_size = 3

# Training and Evaluating the RNN model

In [None]:
# Train RNN Model
# (train runs with its default number of epochs and learning rate)
rnn_model = RNN_SA( vocab_size, embed_size, hidden_size, output_size)
train(rnn_model, trainloader)

# Evaluate RNN Model on the test loader (accuracy and weighted F1)
rnn_accuracy, rnn_f1 = evaluate_model(rnn_model, testloader)
print(f'RNN Model Accuracy: {rnn_accuracy:.4f}, F1 Score: {rnn_f1:.4f}')

# Training and Evaluating the LSTM model

In [None]:
# Train LSTM Model
# (same hyperparameters and training loader as the RNN model)
lstm_model = LSTM_SA(vocab_size, embed_size, hidden_size, output_size)
train(lstm_model, trainloader)

# Evaluate LSTM Model on the test loader (accuracy and weighted F1)
lstm_accuracy, lstm_f1 = evaluate_model(lstm_model, testloader)
print(f'LSTM Model Accuracy: {lstm_accuracy:.4f}, F1 Score: {lstm_f1:.4f}')

# Results

- For the task of sentiment analysis, both models gave similar scores; however, these scores could be improved further by preprocessing the data.