In [5]:
import gensim.downloader as api
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
import pandas as pd

df = pd.read_csv('data/IMDB Dataset.csv')
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [7]:
# Load the GloVe model
glove_model = api.load("glove-wiki-gigaword-100")



In [19]:
# Preprocess text data
tqdm.pandas(desc="Tokenizing text")
df['text'] = df['review'].apply(lambda x: x.lower().split())

In [101]:
# Create a vocabulary and mapping
vocab = {word for sentence in df['text'] for word in sentence}
word_to_idx = {word: idx + 2 for idx, word in enumerate(vocab)}  # Start indexing from 1
word_to_idx['<PAD>'] = 0  # Add padding token
word_to_idx['<UNK>'] = 1  # Add unknown token

In [102]:
# Create an embedding matrix
embedding_dim = glove_model.vector_size
embedding_matrix = np.zeros((len(word_to_idx), embedding_dim))
for word, idx in word_to_idx.items():
    if word in glove_model:
        embedding_matrix[idx] = glove_model[word]

embedding_matrix.shape

(390933, 100)

In [103]:
# Convert text to sequences of indices
df['text_idx'] = df['text'].apply(lambda x: [word_to_idx[word] for word in x])

In [104]:
# Pad sequences
max_len = max(df['text_idx'].apply(len))
df['text_idx'] = df['text_idx'].apply(lambda x: x + [0] * (max_len - len(x)))

In [105]:
# Define a custom dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float)

In [108]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text_idx'].tolist(), df['sentiment'].tolist(), test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
next(iter(train_loader))

[tensor([[142947,  61335, 264256,  ...,      0,      0,      0],
         [ 68934, 266053, 129857,  ...,      0,      0,      0],
         [204253, 370833, 378210,  ...,      0,      0,      0],
         ...,
         [  6127, 180146, 101969,  ...,      0,      0,      0],
         [181355, 129857,   4373,  ...,      0,      0,      0],
         [ 95061, 331553, 199491,  ...,      0,      0,      0]]),
 tensor([0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1.,
         0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0.])]

In [118]:
# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float), padding_idx=0)
        self.rnn1 = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.rnn2 = nn.RNN(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn1(x)
        output, hidden = self.rnn2(output)
        out = self.fc(hidden.squeeze(0))
        return self.sigmoid(out)

In [121]:
# Initialize the model, loss function, and optimizer
vocab_size = len(word_to_idx)
hidden_dim = 128
model = RNNModel(vocab_size, embedding_dim, hidden_dim, embedding_matrix).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [120]:
# Test forward pass
model(torch.tensor([1,1,0], dtype=torch.long).to(device))

tensor([0.5001], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [122]:
# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (texts, labels) in tqdm(enumerate(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", total=len(train_loader)):
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
    print(f" Loss: {loss.item()}")

Epoch 1/10: 100%|██████████| 1250/1250 [00:18<00:00, 65.86it/s]

 Loss: 0.7202495336532593



Epoch 2/10: 100%|██████████| 1250/1250 [00:18<00:00, 66.15it/s]

 Loss: 0.7355452179908752



Epoch 3/10: 100%|██████████| 1250/1250 [00:18<00:00, 67.04it/s]

 Loss: 0.681577205657959



Epoch 4/10: 100%|██████████| 1250/1250 [00:18<00:00, 67.35it/s]

 Loss: 0.6507362127304077



Epoch 5/10: 100%|██████████| 1250/1250 [00:18<00:00, 66.78it/s]

 Loss: 0.6716914176940918



Epoch 6/10: 100%|██████████| 1250/1250 [00:18<00:00, 66.63it/s]

 Loss: 0.6793402433395386



Epoch 7/10: 100%|██████████| 1250/1250 [00:18<00:00, 66.64it/s]


 Loss: 0.6916049122810364


Epoch 8/10: 100%|██████████| 1250/1250 [00:18<00:00, 67.04it/s]

 Loss: 0.6839642524719238



Epoch 9/10: 100%|██████████| 1250/1250 [00:18<00:00, 66.96it/s]


 Loss: 0.7180426120758057


Epoch 10/10: 100%|██████████| 1250/1250 [00:18<00:00, 66.91it/s]

 Loss: 0.6851561665534973





In [124]:
# Evaluate the model
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for texts, labels in tqdm(test_loader):
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        preds = (outputs.squeeze() > 0.5).float()
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())

100%|██████████| 313/313 [00:04<00:00, 77.04it/s]


In [128]:
# Print classification report
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_preds))

              precision    recall  f1-score   support

         0.0       0.52      0.26      0.35      4961
         1.0       0.51      0.76      0.61      5039

    accuracy                           0.51     10000
   macro avg       0.51      0.51      0.48     10000
weighted avg       0.51      0.51      0.48     10000

