**Import Libraries**

In [1]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # last expression of the cell, so the notebook displays the chosen device


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


device(type='cpu')

**Load and Explore Dataset**

In [2]:
# Load the IMDB reviews CSV; the relative path is resolved against the current working directory.
df = pd.read_csv("IMDB Dataset.csv")
# NOTE(review): this bare expression is not the last line of the cell, so its result is not displayed.
df.head()

# Report the dataset size and the class balance of the 'sentiment' column.
print(f"Total samples: {len(df)}")
print(df['sentiment'].value_counts())


Total samples: 50000
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


**Data Preprocessing**

In [3]:
def preprocess_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase, replace HTML tags with a space, delete every
    character that is not an ASCII letter or whitespace, collapse whitespace
    runs to one space, and trim both ends.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # Tags such as "<br />" must be replaced by a space *before* the
    # letters-only filter; otherwise only the tag name survives and is glued to
    # the surrounding words ("great<br />movie" -> "greatbr movie"). The tag
    # name must start with a letter so that text like "<3" is left alone.
    text = re.sub(r"</?[a-z][^>]*>", ' ', text)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    text = re.sub(r"\s+", ' ', text)
    return text.strip()

# English stop words held in a set, which tokenize() uses for membership tests.
stop_words = set(stopwords.words('english'))

def tokenize(text):
    """Split text on whitespace and return the words that are not stop words.

    Args:
        text: A string, split with ``str.split()``.

    Returns:
        List of the words not found in the module-level ``stop_words`` set,
        in their original order.
    """
    kept = []
    for candidate in text.split():
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Clean the reviews in place (the 'review' column is overwritten with the
# normalized text), then store each review's stop-word-filtered word list
# in a new 'tokens' column.
df['review'] = df['review'].apply(preprocess_text)
df['tokens'] = df['review'].apply(tokenize)


**Vocabulary and Encoding**

In [4]:
from collections import Counter

# Build the vocabulary, ordered from most to least frequent word.
all_words = [word for tokens in df['tokens'] for word in tokens]
word_counts = Counter(all_words)
vocab = [word for word, _count in word_counts.most_common()]
# Ids start at 1 so that 0 stays reserved for padding.
vocab_to_int = {word: rank for rank, word in enumerate(vocab, start=1)}

def encode_tokens(tokens):
    """Map tokens to their integer ids, silently dropping unknown words.

    Args:
        tokens: Iterable of word strings.

    Returns:
        List of ids looked up in the module-level ``vocab_to_int`` mapping,
        in input order; words missing from the mapping are skipped.
    """
    ids = []
    for word in tokens:
        idx = vocab_to_int.get(word)
        if idx is not None:
            ids.append(idx)
    return ids

# Integer-encode every tokenized review into a new 'encoded' column.
df['encoded'] = df['tokens'].apply(encode_tokens)


**Padding**

In [5]:
def pad_features(reviews, seq_length=500):
    """Left-pad or truncate integer sequences into a fixed-width 2-D array.

    Sequences shorter than ``seq_length`` are right-aligned and filled with
    zeros on the left; longer ones keep only their first ``seq_length`` items.
    An empty sequence yields an all-zero row.

    Args:
        reviews: Sized iterable of integer sequences (one per review).
        seq_length: Number of columns in the result (default 500).

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(reviews), seq_length)``.
    """
    features = np.zeros((len(reviews), seq_length), dtype=int)
    for i, row in enumerate(reviews):
        truncated = np.asarray(row)[:seq_length]
        # An empty row must be skipped: a slice starting at -0 selects the
        # whole row, and assigning an empty array to it raises a broadcast
        # ValueError.
        if len(truncated) == 0:
            continue
        features[i, -len(truncated):] = truncated
    return features

# Fixed-width model inputs: one row of 500 token ids per review.
features = pad_features(df['encoded'], seq_length=500)
# Binary targets: 1 where the sentiment is 'positive', 0 otherwise.
labels = (df['sentiment'] == 'positive').to_numpy().astype(int)


**Train/Test Split**

In [6]:
# Hold out 25% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

# Wrap the NumPy arrays as tensor datasets of (inputs, label) pairs.
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

batch_size = 64
# Shuffle only the training batches. Accuracy does not depend on sample order,
# so the evaluation loader keeps a fixed order to make its batches deterministic.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)


**Simple RNN**

In [7]:
class SentimentRNN(nn.Module):
    """Sentiment classifier: embedding -> single-layer RNN -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of output units (default 1).
        padding_idx: Token id treated as padding (default 0). Its embedding is
            initialized to zeros and receives no gradient, so padding is not
            learned as if it were a word.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim=1, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (batch, output_dim) for token ids x of shape (batch, seq)."""
        x = self.embedding(x)
        out, _ = self.rnn(x)
        # Classify from the RNN output at the last time step only.
        out = self.fc(out[:, -1])
        return self.sig(out)


**LSTM**

In [8]:
class SentimentLSTM(nn.Module):
    """Sentiment classifier: embedding -> single-layer LSTM -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units (default 1).
        padding_idx: Token id treated as padding (default 0). Its embedding is
            initialized to zeros and receives no gradient, so padding is not
            learned as if it were a word.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim=1, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (batch, output_dim) for token ids x of shape (batch, seq)."""
        x = self.embedding(x)
        out, _ = self.lstm(x)
        # Classify from the LSTM output at the last time step only.
        out = self.fc(out[:, -1])
        return self.sig(out)


**Training Function**

In [9]:
def train_model(model, train_loader, criterion, optimizer, epochs=3):
    """Train ``model`` on ``train_loader`` and print the mean loss of each epoch.

    The model is moved to the module-level ``device`` and switched to training
    mode before the first epoch.

    Args:
        model: Module mapping a batch of inputs to one output per sample.
        train_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        criterion: Loss called as ``criterion(output, labels)`` with float labels.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``train_loader`` (default 3).

    Returns:
        List with the mean batch loss of each epoch, in order.
    """
    model.to(device)
    model.train()

    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device).float()
            optimizer.zero_grad()
            # reshape(-1) rather than squeeze(): squeeze() would also drop the
            # batch dimension of a one-sample batch, leaving a 0-d output whose
            # shape no longer matches the (1,) label tensor.
            output = model(inputs).reshape(-1)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        mean_loss = total_loss / len(train_loader)
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")
    return epoch_losses


**Evaluation Function**

In [10]:
def evaluate_model(model, test_loader):
    """Print and return the accuracy of ``model`` on ``test_loader``.

    The model is moved to the module-level ``device`` and put in eval mode.
    Each output is rounded to 0 or 1 and compared with its label.

    Args:
        model: Module returning one probability per sample.
        test_loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        Accuracy as a percentage in the range 0-100.
    """
    # Make sure model and batches share a device even when the model was not
    # trained in this session.
    model.to(device)
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device).float()
            # reshape(-1) keeps a 1-D (batch,) tensor even for a one-sample
            # batch, so the comparison below never relies on broadcasting.
            output = model(inputs).reshape(-1)
            preds = torch.round(output)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy


**Train Both Models**

In [None]:
# +1 because token ids start at 1 and id 0 is reserved for padding.
vocab_size = len(vocab_to_int) + 1
embed_dim = 128
hidden_dim = 128

# RNN
rnn_model = SentimentRNN(vocab_size, embed_dim, hidden_dim)
# Binary cross-entropy on probabilities (both models end in a sigmoid);
# the same criterion object is reused for the LSTM below.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(rnn_model.parameters(), lr=0.001)
train_model(rnn_model, train_loader, criterion, optimizer, epochs=3)
evaluate_model(rnn_model, test_loader)

# LSTM
lstm_model = SentimentLSTM(vocab_size, embed_dim, hidden_dim)
# A fresh optimizer is required: the previous one is bound to the RNN's parameters.
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)
train_model(lstm_model, train_loader, criterion, optimizer, epochs=3)
evaluate_model(lstm_model, test_loader)


Epoch 1/3, Loss: 0.6195
Epoch 2/3, Loss: 0.5442
Epoch 3/3, Loss: 0.4579
Test Accuracy: 78.60%


**Predict New Reviews**

In [None]:
def predict_review(model, text):
    """Classify one raw review as "Positive" or "Negative".

    The text goes through the same preprocessing, tokenization, encoding and
    padding as the training data. The model is moved to the module-level
    ``device`` and put in eval mode.

    Args:
        model: Model returning a single probability for a one-review batch.
        text: Raw review string.

    Returns:
        "Positive" if the rounded model output equals 1, otherwise "Negative".

    Raises:
        ValueError: If no in-vocabulary word remains after preprocessing.
    """
    # Keep model and input on the same device even if the model was not
    # trained in this session.
    model.to(device)
    model.eval()
    tokens = tokenize(preprocess_text(text))
    encoded = encode_tokens(tokens)
    if not encoded:
        # Fail with a clear message rather than predicting from padding only.
        raise ValueError(
            "Review contains no in-vocabulary words after preprocessing; cannot predict."
        )
    padded = pad_features([encoded])
    input_tensor = torch.from_numpy(padded).to(device)
    with torch.no_grad():
        output = model(input_tensor)
    pred = torch.round(output).item()
    return "Positive" if pred == 1 else "Negative"

# Try the trained RNN on a hand-written review.
sample = "The movie was really touching and well-acted!"
print(predict_review(rnn_model, sample))
