# Import Libraries

In [None]:
import re
import time
import itertools
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

# Load Data

In [None]:
# Read movie.csv into a DataFrame (later cells use its `text` and `label`
# columns) and preview the first rows.
df = pd.read_csv("/mnt/d/Datasets/movie.csv")
df.head()

In [None]:
# Number of rows per distinct label value.
df['label'].value_counts()

# Preprocess

In [None]:
# Load the stop-word list (one entry per line). The `with` block closes the
# file deterministically, and a set makes the per-token membership test in
# preprocess_text O(1) instead of a linear scan of a list.
with open('/mnt/d/Datasets/SmartStoplist.txt', 'r') as stopword_file:
    stop_words = {entry.strip() for entry in stopword_file.read().split('\n')}

In [None]:
def preprocess_text(text):
    """Normalise raw text and return its tokens with stop words removed.

    The text is lower-cased, runs of digits are deleted, every character that
    is neither a word character nor whitespace is deleted, and the result is
    split on whitespace. Tokens present in the module-level ``stop_words``
    collection are dropped.

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punct = re.sub(r'[^\w\s]', '', without_digits)
    return [token for token in without_punct.split() if token not in stop_words]

In [None]:
# Tokenise every row of the `text` column; each `cleaned` cell holds the token
# list returned by preprocess_text.
df['cleaned'] = df['text'].apply(preprocess_text)

In [None]:
# Pair each token list with its label. Zipping the two columns directly
# avoids iterrows(), which materialises a Series object for every row.
data = list(zip(df['cleaned'], df['label']))

In [None]:
def load_glove_embeddings(file_path):
    """Load whitespace-separated word vectors from a GloVe-style text file.

    Each non-blank line must be ``<word> <v1> <v2> ... <vd>``. Index 0 is
    reserved for the ``"<OOV>"`` token and receives an all-zero vector; the
    words from the file follow in file order starting at index 1.

    The vector dimensionality is inferred from the first vector in the file
    rather than being hard-coded, blank lines are skipped, and a word that
    appears more than once keeps its first vector so that vocabulary indices
    and matrix rows can never drift apart.

    Args:
        file_path: path to the UTF-8 encoded embeddings file.

    Returns:
        A ``(word_to_index, index_to_word, weights)`` tuple where ``weights``
        is a float32 tensor of shape ``(vocab_size, dim)`` and row ``i`` is the
        vector of ``index_to_word[i]``.

    Raises:
        ValueError: if the file contains no vectors, or a line's vector length
            differs from that of the first vector.
    """
    word_to_index = {"<OOV>": 0}
    index_to_word = {0: "<OOV>"}
    vectors = []
    dim = None

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')

            if dim is None:
                dim = vector.shape[0]
            elif vector.shape[0] != dim:
                raise ValueError(
                    f"{file_path}:{line_number}: expected {dim} values, "
                    f"got {vector.shape[0]}"
                )

            if word in word_to_index:
                # Keep the first occurrence; re-adding the word would leave an
                # orphan row and shift every later index.
                continue

            index = len(word_to_index)
            word_to_index[word] = index
            index_to_word[index] = word
            vectors.append(vector)

    if dim is None:
        raise ValueError(f"no embedding vectors found in {file_path}")

    # Build the matrix directly in float32 so the zero OOV row does not upcast
    # the whole array to float64.
    embeddings = [np.zeros(dim, dtype='float32')] + vectors
    return word_to_index, index_to_word, torch.tensor(np.stack(embeddings), dtype=torch.float)

In [None]:
# Location of the GloVe vectors file passed to load_glove_embeddings.
glove_path = "/mnt/d/Datasets/glove.6B.300d.txt"

In [None]:
# Build the vocabulary lookups and the embedding matrix from the GloVe file.
word_to_index, index_to_word, weights_matrix = load_glove_embeddings(glove_path)

# DataLoaders

In [None]:
class IMDBDataset(Dataset):
    """Dataset over ``(tokens, label)`` pairs.

    Tokens are translated to vocabulary indices on item access; the label is
    passed through unchanged.
    """

    def __init__(self, texts, word_to_index):
        self.word_to_index = word_to_index
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        tokens, label = self.texts[index]
        lookup = self.word_to_index
        indices = []
        for token in tokens:
            # Tokens missing from the vocabulary share the "<OOV>" index.
            indices.append(lookup.get(token, lookup["<OOV>"]))
        return torch.tensor(indices, dtype=torch.long), label

In [None]:
# Hold out 10% of the samples for testing; the fixed random_state makes the
# split reproducible.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

In [None]:
# Wrap both splits so tokens are converted to vocabulary indices on access.
train_dataset = IMDBDataset(train_data, word_to_index)
test_dataset = IMDBDataset(test_data, word_to_index)

In [None]:
def collate_fn(batch):
    """Collate ``(index_tensor, label)`` samples into padded batch tensors.

    Returns:
        A tuple ``(padded, lengths, labels)``: a long tensor of shape
        ``(batch, longest_sequence)`` whose unused trailing positions are 0,
        a float tensor with each sample's original length, and a long tensor
        of labels.
    """
    sequences, targets = zip(*batch)
    seq_lengths = [len(seq) for seq in sequences]
    width = max(seq_lengths)

    padded = torch.zeros((len(sequences), width), dtype=torch.long)
    # Each `row` is a view into `padded`, so the slice assignment writes
    # straight into the batch tensor.
    for row, seq, n in zip(padded, sequences, seq_lengths):
        row[:n] = seq

    return (
        padded,
        torch.tensor(seq_lengths, dtype=torch.float),
        torch.tensor(targets, dtype=torch.long),
    )

In [None]:
batch_size = 16
# Reshuffle the training samples every epoch so the batches (and the batch
# statistics seen by BatchNorm) differ between epochs. Evaluation only counts
# predictions, so the test loader keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Model

In [None]:
class DeepAveragingNetwork(nn.Module):
    """Deep averaging network over frozen pretrained word embeddings.

    A sequence is represented by the mean of its word embeddings, which is
    then passed through a small feed-forward classifier.

    Args:
        num_classes: number of output classes.
        weights_matrix: float tensor of shape ``(vocab_size, embedding_dim)``
            holding the pretrained embeddings.
    """

    def __init__(self, num_classes, weights_matrix):
        super().__init__()

        # Use the pretrained vectors as-is; freeze=True keeps them out of
        # gradient updates.
        self.embedding_layer = nn.Embedding.from_pretrained(weights_matrix, freeze=True)

        self.model = nn.Sequential(
            nn.Linear(weights_matrix.size(1), 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),

            # No Softmax here: nn.CrossEntropyLoss applies log-softmax itself,
            # so the network must emit raw logits. argmax over logits gives
            # the same predicted class as argmax over probabilities.
            nn.Linear(64, num_classes),
        )

    def forward(self, inputs, inputs_len):
        """Return class logits for a batch of index sequences.

        Args:
            inputs: long tensor of shape ``(batch, seq_len)`` with word indices.
            inputs_len: tensor of shape ``(batch,)`` with the number of real
                tokens per row, used as the divisor of the embedding sum.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits;
            apply ``softmax(dim=1)`` to obtain probabilities.
        """
        embed = self.embedding_layer(inputs)
        # Clamp the divisor so a sequence with no tokens yields its (summed)
        # embedding unchanged instead of 0/0 = NaN.
        x = embed.sum(dim=1) / inputs_len.view(-1, 1).clamp(min=1)
        return self.model(x)

In [None]:
# Build a two-class network on top of the loaded embedding matrix and move
# its parameters to the selected device.
model = DeepAveragingNetwork(num_classes=2, weights_matrix=weights_matrix)
model.to(device)

# Train

In [None]:
# Adam over the model's parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Cross-entropy between the model outputs and the integer class labels.
criterion = nn.CrossEntropyLoss()

In [None]:
# Number of full passes over the training loader.
epochs = 25

In [None]:
# Average training loss per epoch, appended by the training loop.
losses = [] 

In [None]:
for epoch in range(epochs):
    # Put the model in training mode at the start of every epoch.
    model.train()
    curr_loss = 0.0

    for texts, lengths, labels in train_dataloader:
        texts = texts.to(device)
        lengths = lengths.to(device)
        labels = labels.to(device)

        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(texts, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        curr_loss += loss.item()

    # Mean of the per-batch losses seen during this epoch.
    avg_loss = curr_loss / len(train_dataloader)
    losses.append(avg_loss)
    print(f"Epoch [{epoch + 1}/{epochs}], Average Loss: {avg_loss:.3f}")

In [None]:
# Plot one point per recorded epoch loss. The x axis is derived from `losses`
# itself (1-based, matching the epoch numbers printed during training), so the
# plot still works when training was interrupted or run more than once and
# len(losses) no longer equals `epochs`.
plt.figure(figsize=(8, 6))
plt.title("Loss Curve")
plt.plot(range(1, len(losses) + 1), losses, marker="o")
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.show()

# Test

In [None]:
# Class names, looked up by integer label during evaluation, plus two
# independent per-class counters that both start at zero.
classes = ["negative", "positive"]
correct_pred = dict.fromkeys(classes, 0)
total_pred = dict.fromkeys(classes, 0)

In [None]:
# Count correct and total predictions per class over the test set.
with torch.no_grad():
    model.eval()

    for texts, lengths, labels in test_dataloader:
        texts, lengths, labels = texts.to(device), lengths.to(device), labels.to(device)

        outputs = model(texts, lengths)
        _, predictions = torch.max(outputs, 1)
        # Convert once per batch to plain Python ints: comparing and indexing
        # with 0-dim device tensors would force a device sync per sample.
        for label, prediction in zip(labels.tolist(), predictions.tolist()):
            if label == prediction:
                correct_pred[classes[label]] += 1

            total_pred[classes[label]] += 1

In [None]:
# Report accuracy separately for every class.
for class_name, correct_count in correct_pred.items():
    class_total = total_pred[class_name]
    if class_total == 0:
        # A class with no test samples has no defined accuracy; report that
        # instead of raising ZeroDivisionError.
        print(f"Accuracy for class: {class_name} is n/a (no test samples)")
        continue
    accuracy = 100 * float(correct_count) / class_total
    print(f"Accuracy for class: {class_name} is {accuracy:.2f}%")

In [None]:
# Overall accuracy: all correct predictions over all evaluated samples.
n_correct = np.sum(list(correct_pred.values()))
n_samples = np.sum(list(total_pred.values()))
total_accuracy = 100 * float(n_correct / n_samples)
print(f'Total Accuracy: {total_accuracy:.2f} %')