In [1]:
import zipfile
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import time
import re
import string
import nltk
from nltk.corpus import stopwords
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator



In [2]:
zip_file = 'comments2k.zip'   # Adding our zip file  to extract the information

with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall('/content')


In [3]:

# Downloading stopwords like “the”, “a”, “an”, “so”, “what”.
#nltk.download('stopwords') #downloading them from Natural Language Toolkit
#stop_words = set(stopwords.words('english'))

#def clean_text(text):
    # Convert text to lowercase
    #text = text.lower()
    # Remove URLs
    # Remove punctuation
    #text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    #text = re.sub(r'\d+', '', text)
    # Remove stopwords
    #text = ' '.join(word for word in text.split() if word not in stop_words)
    # Remove extra whitespaces
    #text = ' '.join(text.split())
    #return text

def load_comments(dir_path):
    comments = []
    for filename in os.listdir(dir_path):
        file_path = os.path.join(dir_path, filename)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                comments.extend(f.readlines())
    return comments

pos_dir = '/content/comments1k_pos'
neg_dir = '/content/comments1k_neg'

positive_comments = load_comments(pos_dir)
negative_comments = load_comments(neg_dir)

#positive_comments = [clean_text(comment) for comment in positive_comments]
#negative_comments = [clean_text(comment) for comment in negative_comments]

texts = positive_comments + negative_comments
labels = ['Positive'] * len(positive_comments) + ['Negative'] * len(negative_comments)

data = pd.DataFrame({'text': texts, 'sentiment': labels})


In [4]:
data

Unnamed: 0,text,sentiment
0,This movie is so misunderstood it is not even ...,Positive
1,"Okay, first of all I got this movie as a Chris...",Positive
2,"The performances were superb, the costumes del...",Positive
3,I've been trying to find out about this series...,Positive
4,yeah right. Sammo Hung already acted in the ma...,Positive
...,...,...
1995,It occurs to me that some of the films that ha...,Negative
1996,"Firstly, there are some good things about this...",Negative
1997,"Garson Kanin wrote and directed this look at ""...",Negative
1998,This movie is one of the worst movies I have e...,Negative


In [5]:
train_texts, temp_texts, train_labels, temp_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

# Define and fit the tokenizer
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Tokenize and convert texts to sequences
train_sequences = [torch.tensor([vocab[token] for token in tokenizer(text.lower())]) for text in train_texts]
val_sequences = [torch.tensor([vocab[token] for token in tokenizer(text.lower())]) for text in val_texts]
test_sequences = [torch.tensor([vocab[token] for token in tokenizer(text.lower())]) for text in test_texts]

# Pad the sequences
def pad_sequences(sequences, max_len):
    return pad_sequence(sequences, batch_first=True, padding_value=0)

max_len = max(max(len(seq) for seq in train_sequences), max(len(seq) for seq in val_sequences), max(len(seq) for seq in test_sequences))

train_padded = pad_sequences(train_sequences, max_len)
val_padded = pad_sequences(val_sequences, max_len)
test_padded = pad_sequences(test_sequences, max_len)

# Define the vocabulary size
vocab_size = len(vocab)+1

print("Vocabulary size:", vocab_size)
print("Train padded shape:", train_padded.shape)
print("Validation padded shape:", val_padded.shape)
print("Test padded shape:", test_padded.shape)

Vocabulary size: 20745
Train padded shape: torch.Size([1400, 1212])
Validation padded shape: torch.Size([300, 1149])
Test padded shape: torch.Size([300, 1469])


In [6]:
# Encode the labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Convert labels to torch tensors
train_labels_tensor = torch.tensor(train_labels_encoded, dtype=torch.long)
val_labels_tensor = torch.tensor(val_labels_encoded, dtype=torch.long)
test_labels_tensor = torch.tensor(test_labels_encoded, dtype=torch.long)

# Create TensorDataset
train_data = TensorDataset(train_padded, train_labels_tensor)
val_data = TensorDataset(val_padded, val_labels_tensor)
test_data = TensorDataset(test_padded, test_labels_tensor)

# Create DataLoader
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Hyperparameters
embedding_dim = 200  # Dimension of word embeddings
hidden_size = 256
num_classes = 2
learning_rate = 0.001
num_epochs = 20

In [7]:
# Simple Neural Network Model
class SimpleNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(SimpleNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.mean(dim=1)  # Average embeddings over the sequence
        out = self.fc1(embedded)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# RNN Model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), self.rnn.hidden_size).to(x.device)
        out, _ = self.rnn(embedded, h0)
        out = self.fc(out[:, -1, :])
        return out

# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), self.lstm.hidden_size).to(x.device)
        c0 = torch.zeros(1, x.size(0), self.lstm.hidden_size).to(x.device)
        out, _ = self.lstm(embedded, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# GRU Model
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), self.gru.hidden_size).to(x.device)
        out, _ = self.gru(embedded, h0)
        out = self.fc(out[:, -1, :])
        return out

class BiLSTM3Layers(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes, dropout=0.2):
        super(BiLSTM3Layers, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=3, batch_first=True, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # Multiply by 2 for bidirectional LSTM

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Concatenate the final hidden states from both directions
        lstm_out = torch.cat((lstm_out[:, -1, :hidden_size], lstm_out[:, 0, hidden_size:]), dim=1)
        output = self.fc(lstm_out)
        return output



In [None]:


def train_and_evaluate(model, train_loader, val_loader, test_loader, num_epochs, learning_rate):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    start_time = time.time()
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
        train_losses.append(train_loss / len(train_loader.dataset))
        train_accuracy = train_correct / train_total
        train_accuracies.append(train_accuracy)

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_losses.append(val_loss / len(val_loader.dataset))
        val_accuracy = val_correct / val_total
        val_accuracies.append(val_accuracy)

        print(f'Model: {type(model).__name__}, Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Train Acc: {train_accuracy:.4f}, Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_accuracy:.4f}')

    time_cost = time.time() - start_time

    # Further evaluation on validation and test sets
    model.eval()
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            val_preds.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    val_report = classification_report(val_labels, val_preds, target_names=['Negative', 'Positive'])

    test_preds = []
    test_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            test_preds.extend(predicted.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())
    test_report = classification_report(test_labels, test_preds, target_names=['Negative', 'Positive'])


    return train_losses, val_losses, train_accuracies, val_accuracies, val_report, test_report, time_cost


simple_nn = SimpleNNModel(vocab_size, embedding_dim, hidden_size, num_classes)
simple_nn_train_losses, simple_nn_val_losses, simple_nn_train_accuracies, simple_nn_val_accuracies, simple_nn_val_report, simple_nn_test_report, simple_nn_time_cost = train_and_evaluate(simple_nn, train_loader, val_loader, test_loader, num_epochs, learning_rate)

print("Simple NN Classification Report (Validation):")
print(simple_nn_val_report)
print("Simple NN Classification Report (Test):")
print(simple_nn_test_report)
print(f"Simple NN Time Cost: {simple_nn_time_cost:.2f} seconds")
print("="*60)
# Train and evaluate RNN
rnn = RNNModel(vocab_size, embedding_dim, hidden_size, num_classes)
rnn_train_losses, rnn_val_losses, rnn_train_accuracies, rnn_val_accuracies, rnn_val_report, rnn_test_report, rnn_time_cost = train_and_evaluate(rnn, train_loader, val_loader, test_loader, num_epochs, learning_rate)

print("RNN Classification Report (Validation):")
print(rnn_val_report)
print("RNN Classification Report (Test):")
print(rnn_test_report)
print(f"RNN Time Cost: {rnn_time_cost:.2f} seconds")
print("="*60)
# Train and evaluate LSTM
lstm = LSTMModel(vocab_size, embedding_dim, hidden_size, num_classes)
lstm_train_losses, lstm_val_losses, lstm_train_accuracies, lstm_val_accuracies, lstm_val_report, lstm_test_report, lstm_time_cost = train_and_evaluate(lstm, train_loader, val_loader, test_loader, num_epochs, learning_rate)

print("LSTM Classification Report (Validation):")
print(lstm_val_report)
print("LSTM Classification Report (Test):")
print(lstm_test_report)
print(f"LSTM Time Cost: {lstm_time_cost:.2f} seconds")
print("="*60)
# Train and evaluate GRU
gru = GRUModel(vocab_size, embedding_dim, hidden_size, num_classes)
gru_train_losses, gru_val_losses, gru_train_accuracies, gru_val_accuracies, gru_val_report, gru_test_report, gru_time_cost = train_and_evaluate(gru, train_loader, val_loader, test_loader, num_epochs, learning_rate)

print("GRU Classification Report (Validation):")
print(gru_val_report)
print("GRU Classification Report (Test):")
print(gru_test_report)
print(f"GRU Time Cost: {gru_time_cost:.2f} seconds")
print("="*60)
biLSTM = BiLSTM3Layers(vocab_size, embedding_dim, hidden_size, num_classes,dropout=0.2)
biLSTM_train_losses, biLSTM_val_losses, biLSTM_train_accuracies, biLSTM_val_accuracies, biLSTM_val_report, biLSTM_test_report, biLSTM_time_cost = train_and_evaluate(biLSTM, train_loader, val_loader, test_loader, num_epochs, learning_rate)

print("biLSTM Classification Report (Validation):")
print(biLSTM_val_report)
print("biLSTM Classification Report (Test):")
print(biLSTM_test_report)
print(f"biLSTM Time Cost: {biLSTM_time_cost:.2f} seconds")



Model: SimpleNNModel, Epoch 1/20, Train Loss: 0.7107, Train Acc: 0.4986, Val Loss: 0.6970, Val Acc: 0.5000
Model: SimpleNNModel, Epoch 2/20, Train Loss: 0.6956, Train Acc: 0.5193, Val Loss: 0.6963, Val Acc: 0.4933
Model: SimpleNNModel, Epoch 3/20, Train Loss: 0.6992, Train Acc: 0.5086, Val Loss: 0.6921, Val Acc: 0.5033
Model: SimpleNNModel, Epoch 4/20, Train Loss: 0.6939, Train Acc: 0.5271, Val Loss: 0.6952, Val Acc: 0.5033
Model: SimpleNNModel, Epoch 5/20, Train Loss: 0.6914, Train Acc: 0.5257, Val Loss: 0.6926, Val Acc: 0.4900
Model: SimpleNNModel, Epoch 6/20, Train Loss: 0.6914, Train Acc: 0.5279, Val Loss: 0.6982, Val Acc: 0.4900
Model: SimpleNNModel, Epoch 7/20, Train Loss: 0.6925, Train Acc: 0.5057, Val Loss: 0.6902, Val Acc: 0.5267
Model: SimpleNNModel, Epoch 8/20, Train Loss: 0.6865, Train Acc: 0.5429, Val Loss: 0.6984, Val Acc: 0.4900
Model: SimpleNNModel, Epoch 9/20, Train Loss: 0.6811, Train Acc: 0.5421, Val Loss: 0.6861, Val Acc: 0.5000
Model: SimpleNNModel, Epoch 10/20, Tr