In [None]:
# Colab-only helper: opens a browser file picker so the dataset CSV can be
# uploaded into the runtime's working directory. The result of the upload
# call is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords
from collections import Counter
import string
import re
import seaborn as sns
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader,TensorDataset
import torch.optim as optim

In [None]:
# Select the compute device once: CUDA when a GPU is visible, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
is_cuda = device.type == "cuda"

print("GPU is available" if is_cuda else "GPU not available, CPU used")

# Load the dataset

In [None]:
# Read the reviews CSV from the current working directory into a DataFrame.
# Later cells use its "review" and "sentiment" columns.
df=pd.read_csv("IMDB Dataset.csv")

In [None]:
# Work on a 10% random subset to keep preprocessing and the dense TF-IDF
# matrix manageable. The seed is fixed so that Restart & Run All always
# selects the same rows (without it every run trains on different data).
df = df.sample(frac=0.10, random_state=0)

In [None]:
# Preview the first two sampled rows.
df.head(2)

# Data Preprocessing

In [None]:
# Lower-case every review so that later token comparisons (stop words,
# TF-IDF vocabulary) are case-insensitive.
df["review"]=df["review"].str.lower()

In [None]:
# REMOVE URL's.
import re
def remove_urls(text):
    """Return `text` with every run of non-whitespace starting at 'http' deleted."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

In [None]:
# Run remove_urls over every review and store the result back in the column.
df["review"] = df["review"].apply(remove_urls)

In [None]:
#REMOVE PUNCTUATIONS AND EMOJI
import re

def remove_punctuations(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Punctuation, emoji and other symbols are dropped without inserting a
    replacement character.
    """
    return re.sub(r"[^A-Za-z0-9\s]", "", text)

In [None]:
# Strip HTML tags *before* removing punctuation. remove_punctuations deletes
# '<', '/' and '>', so once it has run no tag can be recognised any more and
# residue such as "br" (from "<br />") would stay in every review. Tags are
# replaced by a space so the words on either side are not glued together.
df["review"] = (
    df["review"]
    .str.replace(r"<.*?>", " ", regex=True)
    .apply(remove_punctuations)
)

In [None]:
#REMOVE HTML
import re

def remove_html(text):
    """Return `text` with every '<...>' span removed (shortest match, same line)."""
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

In [None]:
# Run remove_html over every review.
# NOTE(review): this cell runs after remove_punctuations, whose regex deletes
# '<', '/' and '>'. Tags can only match here if their angle brackets survived
# that earlier step, so HTML must be stripped before punctuation to be effective.
df["review"] = df["review"].apply(remove_html)

In [None]:
#REMOVE STOPWORDS

# Stop-word sets keyed by language, filled on first use so the corpus file is
# read once instead of once per review.
_STOPWORD_CACHE = {}


def remove_stopword(text, language='english'):
    """Return `text` with stop-word tokens removed.

    The text is tokenised with `word_tokenize`; tokens that appear in NLTK's
    stop-word list for `language` are dropped and the remaining tokens are
    joined with single spaces.

    The previous implementation called ``text.replace(word, "")`` for each
    stop word found, which deletes the word as a *substring* everywhere in
    the text (e.g. removing "a" or "he" from the middle of other words).
    Filtering whole tokens avoids corrupting non-stop words.

    Args:
        text: The review text to filter.
        language: Name of the NLTK stop-word list to use.

    Returns:
        The filtered text as a single space-separated string.
    """
    if language not in _STOPWORD_CACHE:
        # A frozenset gives O(1) membership tests; the NLTK call returns a list.
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    stop_words = _STOPWORD_CACHE[language]

    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return " ".join(kept_tokens)

In [None]:
# Run remove_stopword over every review and store the result back in the column.
df["review"] = df["review"].apply(remove_stopword)

In [None]:

def Stemming(text):
    """Tokenise `text`, Porter-stem each token and re-join them with single spaces."""
    stemmer = PorterStemmer()
    return ' '.join([stemmer.stem(word) for word in word_tokenize(text)])

In [None]:
# Replace every review with its stemmed form (see Stemming above).
df["review"] = df["review"].apply(Stemming)

In [None]:
# Preview three rows after all text-cleaning steps.
df.head(3)

# **Encoding the target labels as numeric values**

In [None]:
# Encode the labels numerically: "positive" -> 0, "negative" -> 1.
# The column is assigned back explicitly instead of calling
# df["sentiment"].replace(..., inplace=True): that chained in-place call acts
# on a temporary object under pandas Copy-on-Write and would leave `df`
# unchanged. Any unexpected label maps to NaN, which makes the int64 cast fail
# loudly instead of silently passing strings downstream.
df["sentiment"] = (
    df["sentiment"]
    .map({"positive": 0, "negative": 1})
    .astype("int64")
)

In [None]:
# Target vector: the encoded sentiment column.
Y=df["sentiment"]

# **Text Vectorization**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features from the cleaned reviews and densify them so they can
# be wrapped in torch tensors later.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so vocabulary and IDF weights also see the test reviews.
# NOTE(review): .toarray() materialises a dense (n_reviews x vocabulary)
# matrix; memory use grows quickly if the sample fraction is increased.
tf = TfidfVectorizer()
X =tf.fit_transform(df['review']).toarray()

# **Split the dataset**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=0)

In [None]:
# Dimensions of the training feature matrix (rows, TF-IDF features).
X_train.shape

In [None]:
# Keep the shape tuple; shape[1] is used later as the RNN input size.
shape=X_train.shape

In [None]:
# Number of feature columns in the training matrix.
shape[1]

In [None]:
# Dimensions of the held-out feature matrix.
X_test.shape

In [None]:
# Container type of the features (X was produced by .toarray(), so a NumPy
# array is expected here).
type(X_train)

In [None]:
# Container type of the labels (split from the "sentiment" column, so this
# should still be a pandas Series at this point).
type(Y_train)

In [None]:
# Convert the label Series to NumPy arrays for torch.from_numpy.
# np.asarray is used instead of Series.to_numpy() so the cell is idempotent:
# re-running it on arrays that were already converted is a no-op rather than
# an AttributeError.
Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)

In [None]:
# Number of array dimensions of the training features.
X_train.ndim

# **Create Tensor Datasets**

In [None]:
# Wrap features and labels as float32 tensors, paired per sample.
train_set = TensorDataset(
    torch.as_tensor(X_train, dtype=torch.float32),
    torch.as_tensor(Y_train, dtype=torch.float32),
)
test_set = TensorDataset(
    torch.as_tensor(X_test, dtype=torch.float32),
    torch.as_tensor(Y_test, dtype=torch.float32),
)

# **Data Loader (Load Data in Batches)**

In [None]:
# Mini-batch iterators. Training batches are reshuffled every epoch; the
# evaluation loader keeps a fixed order because shuffling has no effect on
# accuracy and only makes per-batch debugging non-deterministic.
train_loader = DataLoader(train_set, shuffle=True, batch_size=64)
test_loader = DataLoader(test_set, shuffle=False, batch_size=64)

# **RNN**

In [None]:
class Rnn(nn.Module):
    """Stacked vanilla RNN whose last time-step output feeds a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of values produced by the final linear layer.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # Recurrent encoder; inputs are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        # Linear read-out applied to the final time step.
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return raw (un-activated) outputs of shape (batch, output_size)."""
        # Zero initial hidden state, created directly on the input's device.
        initial_hidden = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size, device=x.device
        )

        sequence_out, _ = self.rnn(x, initial_hidden)

        # Only the last time step is passed to the linear layer.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# **Hyperparameters**

In [None]:
# Model and optimisation settings used by the cells below.
input_dim = shape[1] # One input feature per TF-IDF column of X_train
hidden_dim = 128
output_dim = 1  # Binary classification (positive or negative sentiment)
num_layers = 1
num_epochs = 10
# NOTE(review): the DataLoaders above are built with a literal batch_size=64
# and do not read this variable; changing it here has no effect on them.
batch_size = 64
learning_rate = 0.001

# **Initialize model, criterion, and optimizer**

In [None]:
# Re-select the device (same rule as at the top of the notebook) and build
# the model on it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Rnn(input_dim, hidden_dim, output_dim, num_layers).to(device)
# BCELoss expects probabilities in [0, 1]; the training loop applies a
# sigmoid to the model output before calling it.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# **Training**

In [None]:
# Train for `num_epochs` passes over the training loader.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0   # sum of per-sample losses seen this epoch
    samples_seen = 0

    for X_batch, Y_batch in train_loader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

        # Treat each TF-IDF vector as a sequence of length 1:
        # (batch, features) -> (batch, 1, features).
        X_batch = X_batch.unsqueeze(1)

        outputs = model(X_batch)

        # Drop only the trailing output dimension, (batch, 1) -> (batch,).
        # A bare .squeeze() would also collapse the batch dimension when the
        # final batch holds a single sample, giving a 0-d tensor whose shape
        # no longer matches Y_batch and making BCELoss raise.
        outputs = torch.sigmoid(outputs.squeeze(-1))

        # Compute the loss on probabilities (criterion is BCELoss).
        loss = criterion(outputs, Y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch is not over-counted.
        batch_samples = Y_batch.size(0)
        running_loss += loss.item() * batch_samples
        samples_seen += batch_samples

    # Report the mean loss over the whole epoch rather than the last
    # mini-batch only, which is a noisy indicator of progress.
    epoch_loss = running_loss / max(samples_seen, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')


# **EVALUATION**

In [None]:
# Measure classification accuracy on the held-out set.
correct, total = 0, 0

model.eval()
with torch.no_grad():
    for X_batch, Y_batch in test_loader:
        # Move to the device and add the length-1 sequence axis the RNN expects.
        X_batch = X_batch.to(device).unsqueeze(1)
        Y_batch = Y_batch.to(device)

        outputs = model(X_batch)

        # Probability above 0.5 -> class 1.0, otherwise class 0.0.
        predicted = torch.sigmoid(outputs.squeeze()).gt(0.5).float()

        correct += predicted.eq(Y_batch).sum().item()
        total += Y_batch.size(0)

    accuracy = correct / total
    print(f'Accuracy: {accuracy * 100:.2f}%')
