In [2]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [4]:
import nltk

In [5]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
df = pd.read_csv('/content/IMDB Dataset.csv')

In [7]:
import re
import string
from nltk.corpus import stopwords

# Ensure you have the stopwords downloaded
import nltk
# 1. Text Cleaning
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    text = text.strip()
    return text

df['cleaned_text'] = df['review'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [9]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# 4. Frequency-Based Vectors (CountVectorizer)

vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(train_df['cleaned_text'])
X_test_counts = vectorizer.transform(test_df['cleaned_text'])

In [11]:
# 5. Word Vectors (Word2Vec)
tokenized_corpus = [word_tokenize(text) for text in train_df['cleaned_text']]
w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)



(46377323, 48637980)

In [12]:
# Function to convert a text to a Word2Vec vector
def text_to_w2v(text, model, vector_size):
    words = word_tokenize(text)
    word_vecs = [model.wv[word] for word in words if word in model.wv]
    if len(word_vecs) == 0:
        return torch.zeros(vector_size)
    return torch.tensor(sum(word_vecs) / len(word_vecs))

train_w2v = torch.stack([text_to_w2v(text, w2v_model, 100) for text in train_df['cleaned_text']])
test_w2v = torch.stack([text_to_w2v(text, w2v_model, 100) for text in test_df['cleaned_text']])

In [13]:
# 6. Custom Dataset for DataLoader
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

train_dataset = TextDataset(train_w2v, torch.tensor(train_df['label'].values))
test_dataset = TextDataset(test_w2v, torch.tensor(test_df['label'].values))

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [14]:
import numpy as np

def load_glove_embeddings(glove_file_path):
    embeddings_index = {}
    with open(glove_file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_file_path = '/content/glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_file_path)

# Create an embedding matrix
embedding_dim = 100
vocab_size = len(vectorizer.vocabulary_)
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))

for word, i in vectorizer.vocabulary_.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [15]:
class IMDBDataset(Dataset):
    def __init__(self, tokenized_texts, labels, vectorizer, max_len=100):
        self.tokenized_texts = tokenized_texts
        self.labels = labels
        self.vectorizer = vectorizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tokenized_texts)

    def __getitem__(self, idx):
        tokens = self.tokenized_texts[idx]
        # Convert tokens to their corresponding indices using the vectorizer's vocabulary
        indices = [self.vectorizer.vocabulary_.get(token, 0) for token in tokens]
        # Pad or truncate sequences to the max length
        if len(indices) > self.max_len:
            indices = indices[:self.max_len]
        else:
            indices = indices + [0] * (self.max_len - len(indices))

        return torch.tensor(indices, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

# Reuse the existing tokenized corpus and labels
train_dataset = IMDBDataset(tokenized_corpus, train_df['label'].values, vectorizer)
test_tokenized_corpus = [word_tokenize(text) for text in test_df['cleaned_text']]
test_dataset = IMDBDataset(test_tokenized_corpus, test_df['label'].values, vectorizer)

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


Training a Model using GloVe Embeddings with Vanilla RNN

In [17]:
import torch

# Check for GPU and use it if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class RNNClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, n_layers=1):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        out = self.fc(hidden[-1])
        return out

# Initialize the RNN model with GloVe embeddings
hidden_dim = 128
output_dim = 2  # Positive or Negative
rnn_model = RNNClassifier(embedding_matrix, hidden_dim, output_dim)

# Move the model to the GPU
rnn_model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)

# Training the RNN model
def train_model(model, criterion, optimizer, train_loader, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, labels in train_loader:
            # Move inputs and labels to the GPU
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')

train_model(rnn_model, criterion, optimizer, train_loader)


Using device: cuda
Epoch 1/10, Loss: 0.6838019123792648
Epoch 2/10, Loss: 0.6853447044372558
Epoch 3/10, Loss: 0.6501961549520493
Epoch 4/10, Loss: 0.6092361849546433
Epoch 5/10, Loss: 0.594579152417183
Epoch 6/10, Loss: 0.5422507105827331
Epoch 7/10, Loss: 0.512430697774887
Epoch 8/10, Loss: 0.4693398876309395
Epoch 9/10, Loss: 0.4523708668589592
Epoch 10/10, Loss: 0.4720785665273666


Training a Model using GloVe Embeddings with LSTM

In [18]:
import torch

# Check for GPU and use it if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, n_layers=1):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, _) = self.lstm(embedded)
        out = self.fc(hidden[-1])
        return out

# Initialize the LSTM model with GloVe embeddings
hidden_dim = 128
output_dim = 2  # Positive or Negative
lstm_model = LSTMClassifier(embedding_matrix, hidden_dim, output_dim)

# Move the model to the GPU
lstm_model.to(device)

# Define loss function and optimizer for LSTM
lstm_criterion = nn.CrossEntropyLoss()
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Training the LSTM model
def train_model(model, criterion, optimizer, train_loader, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, labels in train_loader:
            # Move inputs and labels to the GPU
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')

train_model(lstm_model, lstm_criterion, lstm_optimizer, train_loader)


Using device: cuda
Epoch 1/10, Loss: 0.5901321661949158
Epoch 2/10, Loss: 0.28092729479670525
Epoch 3/10, Loss: 0.11972991601675749
Epoch 4/10, Loss: 0.038710661563975735
Epoch 5/10, Loss: 0.01279011968499981
Epoch 6/10, Loss: 0.005521329668955878
Epoch 7/10, Loss: 0.002706246008859307
Epoch 8/10, Loss: 0.0015360830800178519
Epoch 9/10, Loss: 0.0016554665244926583
Epoch 10/10, Loss: 0.0006381060961397452


Training Models using On-the-Fly Embeddings with Vanilla RNN and LSTM

In [19]:
import torch

# Check for GPU and use it if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class OnTheFlyRNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=1):
        super(OnTheFlyRNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        out = self.fc(hidden[-1])
        return out

# Initialize the RNN model with on-the-fly embeddings
hidden_dim = 128
output_dim = 2  # Positive or Negative
on_the_fly_rnn_model = OnTheFlyRNNClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

# Move the model to the GPU
on_the_fly_rnn_model.to(device)

# Define loss function and optimizer
on_the_fly_rnn_criterion = nn.CrossEntropyLoss()
on_the_fly_rnn_optimizer = optim.Adam(on_the_fly_rnn_model.parameters(), lr=0.001)

# Training the RNN model
def train_model(model, criterion, optimizer, train_loader, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, labels in train_loader:
            # Move inputs and labels to the GPU
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')

train_model(on_the_fly_rnn_model, on_the_fly_rnn_criterion, on_the_fly_rnn_optimizer, train_loader)


Using device: cuda
Epoch 1/10, Loss: 0.6987871993541718
Epoch 2/10, Loss: 0.6943235644817353
Epoch 3/10, Loss: 0.6839151214122772
Epoch 4/10, Loss: 0.6513142557621002
Epoch 5/10, Loss: 0.6287366610527039
Epoch 6/10, Loss: 0.6362035749673843
Epoch 7/10, Loss: 0.5885704596042634
Epoch 8/10, Loss: 0.5670207745790482
Epoch 9/10, Loss: 0.5839957120895386
Epoch 10/10, Loss: 0.5346727333784104


On-the-Fly Embeddings with LSTM

In [20]:
import torch

# Check for GPU and use it if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class OnTheFlyLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=1):
        super(OnTheFlyLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, _) = self.lstm(embedded)
        out = self.fc(hidden[-1])
        return out

# Initialize the LSTM model with on-the-fly embeddings
hidden_dim = 128
output_dim = 2  # Positive or Negative
on_the_fly_lstm_model = OnTheFlyLSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

# Move the model to the GPU
on_the_fly_lstm_model.to(device)

# Define loss function and optimizer
on_the_fly_lstm_criterion = nn.CrossEntropyLoss()
on_the_fly_lstm_optimizer = optim.Adam(on_the_fly_lstm_model.parameters(), lr=0.001)

# Training the LSTM model
def train_model(model, criterion, optimizer, train_loader, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, labels in train_loader:
            # Move inputs and labels to the GPU
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')

train_model(on_the_fly_lstm_model, on_the_fly_lstm_criterion, on_the_fly_lstm_optimizer, train_loader)


Using device: cuda
Epoch 1/10, Loss: 0.6751812165260315
Epoch 2/10, Loss: 0.4971581143260002
Epoch 3/10, Loss: 0.2973037529051304
Epoch 4/10, Loss: 0.1981917773529887
Epoch 5/10, Loss: 0.11875437894165516
Epoch 6/10, Loss: 0.06724135623089969
Epoch 7/10, Loss: 0.03911319843754172
Epoch 8/10, Loss: 0.025700910528050736
Epoch 9/10, Loss: 0.01734880520249717
Epoch 10/10, Loss: 0.011303001155995298


Testing and Evaluation

In [22]:
def evaluate_model(model, test_loader):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # Disable gradient calculation for evaluation
        for inputs, labels in test_loader:
            # Move inputs and labels to the GPU
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy: {accuracy:.2f}%')
    return accuracy

# Evaluate each model on the test dataset
print("Evaluating RNN Model with GloVe Embeddings")
evaluate_model(rnn_model, test_loader)

print("Evaluating LSTM Model with GloVe Embeddings")
evaluate_model(lstm_model, test_loader)

print("Evaluating On-the-Fly RNN Model")
evaluate_model(on_the_fly_rnn_model, test_loader)

print("Evaluating On-the-Fly LSTM Model")
evaluate_model(on_the_fly_lstm_model, test_loader)



Evaluating RNN Model with GloVe Embeddings
Accuracy: 58.46%
Evaluating LSTM Model with GloVe Embeddings
Accuracy: 84.24%
Evaluating On-the-Fly RNN Model
Accuracy: 57.52%
Evaluating On-the-Fly LSTM Model
Accuracy: 85.21%


85.21