In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
import matplotlib.pyplot as plt
# import wandb
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
import pandas as pd




def map_word_to_index(reviews,size):
    word_counter=Counter([word for review in reviews for word in review])
    common_words=word_counter.most_common(size)
    word_to_index={word:ind+1 for ind,word in enumerate(common_words)}
    word_to_index['<UNK>']=0
    return word_to_index


def preprocessed_dataset(data,labels,vocab_size,max_length):
    stemmer=PorterStemmer()
    tokenized_reviews=[]
    for review in data:
        tokens=word_tokenize(review)
        stemmed_tokens=[stemmer.stem(token)for token in tokens]
        tokenized_reviews.append(stemmed_tokens)
    word_to_index=map_word_to_index(tokenized_reviews,vocab_size)

    ##converting the reviews to word indices and padding them
    indexed_reviews=[]
    for review in tokenized_reviews:
        indexed_review=[word_to_index[word]if word in word_to_index else 0 for word in review]
        indexed_reviews.append(indexed_review)
    ## creating an empty 2 d array
    Padded_reviews=np.zeros((len(indexed_reviews),max_length))
    for ind,review in enumerate(indexed_reviews):
        Padded_reviews[ind, :min(len(review),max_length)]=review[:max_length]

    return Padded_reviews,word_to_index



## defining the architecture of the model



# Load the IMDb dataset from CSV
imdb_data = pd.read_csv('/content/drive/My Drive/SMAI_A3/IMDB Dataset.csv')
reviews=imdb_data["review"]
labels=imdb_data["sentiment"]



## parameters
max_length=300
hidden_size=128
output_size=1
batch_size=64
num_epochs=10
learning_rate=0.0001
word_embedding_dimension=1024
# ### data preprocessing
vocab_size=15000
preprocessed_data,word_to_index=preprocessed_dataset(reviews,labels,vocab_size,max_length)





In [9]:
encoded_labels = [1 if label =='positive' else 0 for label in labels]
encoded_labels = np.array(encoded_labels)
X_train,X_temp,Y_train,Y_temp=train_test_split(preprocessed_data,encoded_labels,test_size=0.2,random_state=0)

X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)
print(len(word_to_index))

15001


In [10]:
class custom_dataset(Dataset):
    def __init__(self,reviews,labels):
        self.labels=labels
        self.reviews=reviews

    def __len__(self):
        return len(self.labels)
    def __getitem__(self,idx):
        return torch.LongTensor(self.reviews[idx]),torch.FloatTensor([self.labels[idx]])


## creating the dataloader using the preprocessed data
train_dataset=custom_dataset(X_train,Y_train)
train_loader=DataLoader(train_dataset,batch_size=64,shuffle=False)
val_dataset=custom_dataset(X_val,Y_val)
val_loader=DataLoader(val_dataset,batch_size=64,shuffle=False)




In [20]:


class RNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size, word_embedding_dimension):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        # print("entered")
        embeddings = self.embedding(X)

        rnn_out, _ = self.rnn(embeddings)
        output = rnn_out[:, -1, :]
        output = self.fc(output)
        return output


class LSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size, word_embedding_dimension):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, word_embedding_dimension)
        self.lstm = nn.LSTM(word_embedding_dimension, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        # print("entered")
        embeddings = self.embedding(X)

        lstm_out, _ = self.lstm(embeddings)
        output = lstm_out[:, -1, :]
        output = self.fc(output)
        return output






In [15]:
def training_model(model,train_loader,val_loader,criterion,optimizer,epochs):
    train_losses=[]
    val_losses=[]
    train_accuracies=[]
    val_accuracies=[]

    for epoch in range(epochs):
        model.train()
        running_train_loss = 0.0
        correct_train = 0
        total_train = 0

        for inputs,labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            # print(outputs)
            # print(labels)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()


            running_train_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels.squeeze(1)).sum().item()

        epoch_train_loss = running_train_loss / len(train_loader.dataset)
        epoch_train_accuracy = correct_train / total_train
        train_losses.append(epoch_train_loss)
        train_accuracies.append(epoch_train_accuracy)

        model.eval()
        running_val_loss = 0.0
        correct_val = 0
        total_val = 0


        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                running_val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                total_val += labels.size(0)
                correct_val += (predicted == labels.squeeze(1)).sum().item()

        epoch_val_loss = running_val_loss / len(val_loader.dataset)
        epoch_val_accuracy = correct_val / total_val
        val_losses.append(epoch_val_loss)
        val_accuracies.append(epoch_val_accuracy)

        print(f'Epoch {epoch+1}/{num_epochs}, '
                f'Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_accuracy:.4f}, '
                f'Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.4f}')

        # wandb.log({"train_loss": epoch_train_loss, "train_accuracy": epoch_train_accuracy,
                    # "val_loss": epoch_val_loss, "val_accuracy": epoch_val_accuracy})
    return train_losses, val_losses, train_accuracies, val_accuracies

In [10]:
num_epochs=5
saved=False

if saved==True:
    the_model= RNN(vocab_size+1, hidden_size, output_size,word_embedding_dimension)

    the_model.load_state_dict(torch.load("give your file path",map_location=torch.device("cpu")))

    the_model.eval()

    device=torch.device("cpu")


    input_review=input("Enter the Review:")
    seq=input_review.split()
    # with torch.no_grad():


else:
    model= RNN(vocab_size+1, hidden_size, output_size,word_embedding_dimension)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    if use_cuda:
        model.cuda()

    loss_function=nn.BCEWithLogitsLoss()
    optimizer=optim.Adam(model.parameters(),lr=0.0001)


    print("training started")
    train_losses=[]
    val_losses=[]
    train_accuracies=[]
    val_accuracies=[]
    for epoch in range(num_epochs):
        # print("epoch",epoch,"is running\n")
        model.train()
        running_train_loss = 0.0
        correct_train = 0
        total_train = 0

        for inputs,labels in train_loader:
            inputs=inputs.to(device)
            labels=labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()


            running_train_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels.squeeze(1)).sum().item()

        epoch_train_loss = running_train_loss / len(train_loader.dataset)
        epoch_train_accuracy = correct_train / total_train
        train_losses.append(epoch_train_loss)
        train_accuracies.append(epoch_train_accuracy)

        model.eval()
        running_val_loss = 0.0
        correct_val = 0
        total_val = 0


        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs=inputs.to(device)
                labels=labels.to(device)
                outputs = model(inputs)
                loss = loss_function(outputs, labels)

                running_val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                total_val += labels.size(0)
                correct_val += (predicted == labels.squeeze(1)).sum().item()

        epoch_val_loss = running_val_loss / len(val_loader.dataset)
        epoch_val_accuracy = correct_val / total_val
        val_losses.append(epoch_val_loss)
        val_accuracies.append(epoch_val_accuracy)

        print(f'Epoch {epoch+1}/{num_epochs}, '
                f'Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_accuracy:.4f}, '
                f'Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.4f}')








training started

Epoch 1/5, Train Loss: 2.1673, Train Acc: 0.5291, Val Loss: 2.1808, Val Acc: 0.5094


Epoch 2/5, Train Loss: 2.1669, Train Acc: 0.5291, Val Loss: 2.1805, Val Acc: 0.5094


Epoch 3/5, Train Loss: 2.1803, Train Acc: 0.5292, Val Loss: 2.2932, Val Acc: 0.5067


Epoch 4/5, Train Loss: 2.1802, Train Acc: 0.5290, Val Loss: 2.3932, Val Acc: 0.5094


Epoch 5/5, Train Loss: 2.1022, Train Acc: 0.5191, Val Loss: 2.6932, Val Acc: 0.5094



Hyperparamerer tuning

In [None]:
# embedd_set = [4,8,16,32,64]
# batch_set = [8,16,32,64,128]
# test_accuracies = []
# for embedd in embedd_set:
#     model =  model= RNN(vocab_size+1, hidden_size, output_size,word_embedding_dimension)



observation:
with increasing batch size accuracy increases and then again decrasing

again with increasing embeding dimenstion observation is same.
    

In [9]:
num_epochs=3
saved=False

if saved==True:
    the_model= LSTM(vocab_size+1, hidden_size, output_size,word_embedding_dimension)

    the_model.load_state_dict(torch.load("give your file path",map_location=torch.device("cpu")))

    the_model.eval()

    device=torch.device("cpu")


    input_review=input("Enter the Review:")
    seq=input_review.split()
    # with torch.no_grad():


else:
    model= LSTM(vocab_size+1, hidden_size, output_size,word_embedding_dimension)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    if use_cuda:
        model.cuda()

    loss_function=nn.BCELoss()
    optimizer=optim.Adam(model.parameters(),lr=0.0001)


    print("training started")
    train_losses=[]
    val_losses=[]
    train_accuracies=[]
    val_accuracies=[]
    for epoch in range(num_epochs):
        model.train()
        running_train_loss = 0.0
        correct_train = 0
        total_train = 0

        for inputs,labels in train_loader:
            inputs=inputs.to(device)
            labels=labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()


            running_train_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels.squeeze(1)).sum().item()

        epoch_train_loss = running_train_loss / len(train_loader.dataset)
        epoch_train_accuracy = correct_train / total_train
        train_losses.append(epoch_train_loss)
        train_accuracies.append(epoch_train_accuracy)

        model.eval()
        running_val_loss = 0.0
        correct_val = 0
        total_val = 0


        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs=inputs.to(device)
                labels=labels.to(device)
                outputs = model(inputs)
                loss = loss_function(outputs, labels)

                running_val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                total_val += labels.size(0)
                correct_val += (predicted == labels.squeeze(1)).sum().item()

        epoch_val_loss = running_val_loss / len(val_loader.dataset)
        epoch_val_accuracy = correct_val / total_val
        val_losses.append(epoch_val_loss)
        val_accuracies.append(epoch_val_accuracy)

        print(f'Epoch {epoch+1}/{num_epochs}, '
                f'Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_accuracy:.4f}, '
                f'Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.4f}')









training started

Epoch 1/3, Train Loss: 0.0298, Train Acc: 0.6090, Val Loss: 0.0036, Val Acc: 0.5437


Epoch 2/3, Train Loss: 0.0025, Train Acc: 0.6271, Val Loss: 0.0021, Val Acc: 0.5879


Epoch 3/3, Train Loss: 0.0018, Train Acc: 0.6398, Val Loss: 0.0015, Val Acc: 0.5941



Classification accracy on test set:

	RNN 0.5094
	LSTM 0.5941