In [1]:
# Import basic libraries to handle data and build the model
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

# **Reading Data (Cloning a GitHub Repository)**

In [14]:
!git clone https://github.com/teach65qualcomm/Flipkart-review-sentiment-analysis.git

Cloning into 'Flipkart-review-sentiment-analysis'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), 136.33 KiB | 708.00 KiB/s, done.


In [17]:
import pandas as pd

# Read the downloaded CSV file
data = pd.read_csv('/content/Flipkart-review-sentiment-analysis/flipkart_reviews.csv')
data.head()
data.shape

(2304, 3)

In [18]:
# Step 2: Prepare the data
data = data[['review', 'sentiment']]  # Keep only review text and sentiment



In [19]:
data.head()

Unnamed: 0,review,sentiment
0,Best under 60k Great performanceI got it for a...,Positive
1,Good perfomence...,Positive
2,Great performance but usually it has also that...,Positive
3,My wife is so happy and best product 👌🏻😘,Positive
4,"Light weight laptop with new amazing features,...",Positive


In [20]:
# Encode labels (positive = 1, negative = 0)
data["sentiment"].unique()

array(['Positive', 'Negative'], dtype=object)

In [21]:
dic={'Positive':1,'Negative':0}

In [22]:
data["sentiment"]=data["sentiment"].map(dic)

In [24]:
data.head(10)

Unnamed: 0,review,sentiment
0,Best under 60k Great performanceI got it for a...,1
1,Good perfomence...,1
2,Great performance but usually it has also that...,1
3,My wife is so happy and best product 👌🏻😘,1
4,"Light weight laptop with new amazing features,...",1
5,"Amazing laptop, am so much happy, thanks for F...",1
6,Over all a good laptop for personal use,1
7,Thank you so much Flipkart,1
8,Amazing product,1
9,"Good for normal work , students, online classe...",0


In [25]:
# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=100)

In [26]:
# Convert text to sequences using CountVectorizer
vectorizer = CountVectorizer(max_features=2000)  # Use top 2000 words
x_train_seq = vectorizer.fit_transform(x_train).toarray()
x_test_seq = vectorizer.transform(x_test).toarray()

In [32]:
x_train_seq[0:2].shape

(2, 2000)

In [33]:
# Get feature names (words used by the CountVectorizer)
feature_names = vectorizer.get_feature_names_out()

In [34]:
feature_names

array(['10', '100', '1000', ..., 'zoom', 'zoom2', 'zooming'], dtype=object)

In [35]:
x_train_seq1=pd.DataFrame(x_train_seq,columns=feature_names)

In [36]:
x_train_seq1

Unnamed: 0,10,100,1000,10000,108,1080p,108mp,10best,10performance,10x,...,youre,yourself,youtube,yr,yrs,yt,zero,zoom,zoom2,zooming
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1838,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1839,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1841,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Convert data to PyTorch tensors (needed for training)
x_train_tensor = torch.tensor(x_train_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
x_test_tensor = torch.tensor(x_test_seq, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [50]:
# Define model parameters
input_size = 2000  # Number of features from CountVectorizer
hidden_size = 128  # Number of hidden units
output_size = 1  # Single neuron for binary classification
num_layers = 1  # Number of layers
batch_size = 32
num_epochs = 5
learning_rate = 0.001

In [41]:
# Initialize weights and biases for RNN
rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
fc_rnn = nn.Sequential(
    nn.Linear(hidden_size, output_size),
    nn.Sigmoid()  # Sigmoid layer for binary output
)

In [51]:
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer_rnn = optim.Adam(list(rnn.parameters()) + list(fc_rnn.parameters()), lr=learning_rate)

In [52]:
# Training loop for RNN
print("Training RNN...")
for epoch in range(num_epochs):
    total_loss = 0
    rnn.train()
    for i in range(0, len(x_train_tensor), batch_size):
        batch_x = x_train_tensor[i:i+batch_size].unsqueeze(1)  # Add sequence dimension
        batch_y = y_train_tensor[i:i+batch_size].unsqueeze(1)  # Convert to float and make target 2D

        # Initialize hidden states
        h0 = torch.zeros(num_layers, batch_x.size(0), hidden_size)

        # Forward pass
        rnn_out, _ = rnn(batch_x, h0)
        last_hidden_state = rnn_out[:, -1, :]  # Take the last hidden state
        outputs = fc_rnn(last_hidden_state)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        optimizer_rnn.zero_grad()
        loss.backward()
        optimizer_rnn.step()

        total_loss += loss.item()

    print(f"RNN Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

Training RNN...
RNN Epoch 1/5, Loss: 25.5817
RNN Epoch 2/5, Loss: 13.7616
RNN Epoch 3/5, Loss: 9.2517
RNN Epoch 4/5, Loss: 7.2606
RNN Epoch 5/5, Loss: 6.1346


In [58]:
# Evaluation for RNN
print("Evaluating RNN...")
rnn.eval()
y_pred_rnn = []
with torch.no_grad():
    for i in range(0, len(x_test_tensor), batch_size):
        batch_x = x_test_tensor[i:i+batch_size].unsqueeze(1)  # Add sequence dimension
        h0 = torch.zeros(num_layers, batch_x.size(0), hidden_size)

        rnn_out, _ = rnn(batch_x, h0)
        last_hidden_state = rnn_out[:, -1, :]  # Take the last hidden state
        outputs = fc_rnn(last_hidden_state)
        predicted = (outputs.squeeze() > 0.5).long()  # Apply threshold and convert to long
        y_pred_rnn.extend(predicted.tolist())

accuracy_rnn = accuracy_score(y_test, y_pred_rnn)
print(f"RNN Test Accuracy: {accuracy_rnn:.4f}")

Evaluating RNN...
RNN Test Accuracy: 0.9436


In [59]:
from sklearn.metrics import classification_report

In [62]:
print(classification_report(y_test,y_pred_rnn))

              precision    recall  f1-score   support

           0       1.00      0.64      0.78        72
           1       0.94      1.00      0.97       389

    accuracy                           0.94       461
   macro avg       0.97      0.82      0.87       461
weighted avg       0.95      0.94      0.94       461



In [None]:
#Early stopping

# Parameters for early stopping
patience = 2  # Number of epochs to wait for improvement
best_loss = float('inf')  # Initialize best loss as infinity
early_stop_counter = 0  # Counter for patience

# Training loop with early stopping
# Training loop for RNN
print("Training RNN...")
for epoch in range(num_epochs):
    total_loss = 0
    rnn.train()
    for i in range(0, len(x_train_tensor), batch_size):
        batch_x = x_train_tensor[i:i+batch_size].unsqueeze(1)  # Add sequence dimension
        batch_y = y_train_tensor[i:i+batch_size].float().unsqueeze(1)  # Convert to float and make target 2D

        # Initialize hidden states
        h0 = torch.zeros(num_layers, batch_x.size(0), hidden_size)

        # Forward pass
        rnn_out, _ = rnn(batch_x, h0)
        last_hidden_state = rnn_out[:, -1, :]  # Take the last hidden state
        outputs = fc_rnn(last_hidden_state)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        optimizer_rnn.zero_grad()
        loss.backward()
        optimizer_rnn.step()

        total_loss += loss.item()

    print(f"RNN Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")


     # Early stopping check
    if total_loss < best_loss:
        best_loss = total_loss
        early_stop_counter = 0  # Reset counter if loss improves
        print("Training loss improved, resetting early stopping counter.")
    else:
        early_stop_counter += 1  # Increment counter if no improvement
        print(f"No improvement for {early_stop_counter} epoch(s).")

    if early_stop_counter >= patience:
        print("Early stopping triggered. Stopping training.")
        break

Training RNN...
RNN Epoch 1/20, Loss: 2.8771
Training loss improved, resetting early stopping counter.
RNN Epoch 2/20, Loss: 2.8402
Training loss improved, resetting early stopping counter.
RNN Epoch 3/20, Loss: 2.8075
Training loss improved, resetting early stopping counter.
RNN Epoch 4/20, Loss: 2.7785
Training loss improved, resetting early stopping counter.
RNN Epoch 5/20, Loss: 2.7525
Training loss improved, resetting early stopping counter.
RNN Epoch 6/20, Loss: 2.7292
Training loss improved, resetting early stopping counter.
RNN Epoch 7/20, Loss: 2.7083
Training loss improved, resetting early stopping counter.
RNN Epoch 8/20, Loss: 2.6893
Training loss improved, resetting early stopping counter.
RNN Epoch 9/20, Loss: 2.6722
Training loss improved, resetting early stopping counter.
RNN Epoch 10/20, Loss: 2.6566
Training loss improved, resetting early stopping counter.
RNN Epoch 11/20, Loss: 2.6424
Training loss improved, resetting early stopping counter.
RNN Epoch 12/20, Loss: 2.

In [None]:
# Evaluation for RNN
print("Evaluating RNN...")
rnn.eval()
y_pred_aft = []
with torch.no_grad():
    for i in range(0, len(x_test_tensor), batch_size):
        batch_x = x_test_tensor[i:i+batch_size].unsqueeze(1)  # Add sequence dimension
        h0 = torch.zeros(num_layers, batch_x.size(0), hidden_size)

        rnn_out, _ = rnn(batch_x, h0)
        last_hidden_state = rnn_out[:, -1, :]  # Take the last hidden state
        outputs = fc_rnn(last_hidden_state)
        predicted = (outputs.squeeze() > 0.5).long()  # Apply threshold and convert to long
        y_pred_aft.extend(predicted.tolist())

accuracy_rnn = accuracy_score(y_test, y_pred_aft)
print(f"RNN after early stopping Test Accuracy: {accuracy_rnn:.4f}")

Evaluating RNN...
RNN after early stopping Test Accuracy: 0.9458


In [None]:
print(classification_report(y_true,y_pred_aft))

              precision    recall  f1-score   support

           0       0.89      0.75      0.81        72
           1       0.95      0.98      0.97       389

    accuracy                           0.95       461
   macro avg       0.92      0.87      0.89       461
weighted avg       0.94      0.95      0.94       461



# **Build LSTM**

In [63]:
# Define model parameters
input_size = 2000  # Number of features from CountVectorizer
hidden_size = 128  # Number of LSTM hidden units
output_size = 1  # Single neuron for binary classification
num_layers = 1  # Number of LSTM layers
batch_size = 32
num_epochs = 20
learning_rate = 0.001

In [64]:
# Initialize weights and biases
lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
fc = nn.Sequential(
    nn.Linear(hidden_size, output_size),
    nn.Sigmoid()  # Sigmoid layer for binary output
)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.Adam(list(lstm.parameters()) + list(fc.parameters()), lr=learning_rate)


In [65]:
# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    lstm.train()
    for i in range(0, len(x_train_tensor), batch_size):
        batch_x = x_train_tensor[i:i+batch_size].unsqueeze(1)  # Add sequence dimension
        batch_y = y_train_tensor[i:i+batch_size].unsqueeze(1)

        # Initialize hidden and cell states
        h0 = torch.zeros(num_layers, batch_x.size(0), hidden_size)
        c0 = torch.zeros(num_layers, batch_x.size(0), hidden_size)

        # Forward pass
        lstm_out, _ = lstm(batch_x, (h0, c0))
        last_hidden_state = lstm_out[:, -1, :]  # Take the last hidden state
        outputs = fc(last_hidden_state)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")



Epoch 1/20, Loss: 29.6119
Epoch 2/20, Loss: 18.6767
Epoch 3/20, Loss: 11.6003
Epoch 4/20, Loss: 8.5203
Epoch 5/20, Loss: 6.9377
Epoch 6/20, Loss: 5.9149
Epoch 7/20, Loss: 5.2149
Epoch 8/20, Loss: 4.7242
Epoch 9/20, Loss: 4.3634
Epoch 10/20, Loss: 4.0841
Epoch 11/20, Loss: 3.8581
Epoch 12/20, Loss: 3.6687
Epoch 13/20, Loss: 3.5071
Epoch 14/20, Loss: 3.3681
Epoch 15/20, Loss: 3.2478
Epoch 16/20, Loss: 3.1430
Epoch 17/20, Loss: 3.0516
Epoch 18/20, Loss: 2.9716
Epoch 19/20, Loss: 2.9013
Epoch 20/20, Loss: 2.8395


In [67]:
# Evaluation
# Evaluation
lstm.eval()
y_pred_lstm = []
with torch.no_grad():
    for i in range(0, len(x_test_tensor), batch_size):
        batch_x = x_test_tensor[i:i+batch_size].unsqueeze(1)  # Add sequence dimension
        h0 = torch.zeros(num_layers, batch_x.size(0), hidden_size)
        c0 = torch.zeros(num_layers, batch_x.size(0), hidden_size)

        lstm_out, _ = lstm(batch_x, (h0, c0))
        last_hidden_state = lstm_out[:, -1, :]  # Take the last hidden state
        outputs = fc(last_hidden_state)
        predicted = (outputs.squeeze() > 0.5).long()  # Apply threshold and convert to long
        y_pred_lstm.extend(predicted.tolist())

accuracy = accuracy_score(y_test, y_pred_lstm)
print(f"Test Accuracy: {accuracy:.4f}")



Test Accuracy: 0.9479


In [69]:
print(classification_report(y_test,y_pred_lstm))

              precision    recall  f1-score   support

           0       0.91      0.74      0.82        72
           1       0.95      0.99      0.97       389

    accuracy                           0.95       461
   macro avg       0.93      0.86      0.89       461
weighted avg       0.95      0.95      0.95       461



# **Bi-Directional RNN**

In [70]:
# Initialize weights and biases for bidirectional RNN
bidirectional_rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
fc_bidirectional_rnn = nn.Sequential(
    nn.Linear(hidden_size * 2, output_size),  # Multiply by 2 for bidirectional hidden states
    nn.Sigmoid()  # Sigmoid layer for binary output
)

In [71]:
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer_bidirectional_rnn = optim.Adam(
    list(bidirectional_rnn.parameters()) + list(fc_bidirectional_rnn.parameters()), lr=learning_rate
)


In [73]:
# Training loop for Bidirectional RNN
print("Training Bidirectional RNN...")


for epoch in range(num_epochs):
    total_loss = 0
    bidirectional_rnn.train()
    for i in range(0, len(x_train_tensor), batch_size):
        batch_x = x_train_tensor[i:i+batch_size].unsqueeze(1)  # Add sequence dimension
        batch_y = y_train_tensor[i:i+batch_size].unsqueeze(1)  # Convert to float and make target 2D

        # Initialize hidden states for both directions
        h0 = torch.zeros(num_layers * 2, batch_x.size(0), hidden_size)  # Multiply by 2 for bidirectional

        # Forward pass
        rnn_out, _ = bidirectional_rnn(batch_x, h0)
        last_hidden_state = rnn_out[:, -1, :]  # Take the last hidden state
        outputs = fc_bidirectional_rnn(last_hidden_state)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        optimizer_bidirectional_rnn.zero_grad()
        loss.backward()
        optimizer_bidirectional_rnn.step()

        total_loss += loss.item()

    print(f"Bidirectional RNN Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")





Training Bidirectional RNN...
Bidirectional RNN Epoch 1/20, Loss: 2.7435
Bidirectional RNN Epoch 2/20, Loss: 2.7190
Bidirectional RNN Epoch 3/20, Loss: 2.6973
Bidirectional RNN Epoch 4/20, Loss: 2.6782
Bidirectional RNN Epoch 5/20, Loss: 2.6612
Bidirectional RNN Epoch 6/20, Loss: 2.6460
Bidirectional RNN Epoch 7/20, Loss: 2.6323
Bidirectional RNN Epoch 8/20, Loss: 2.6199
Bidirectional RNN Epoch 9/20, Loss: 2.6087
Bidirectional RNN Epoch 10/20, Loss: 2.5984
Bidirectional RNN Epoch 11/20, Loss: 2.5890
Bidirectional RNN Epoch 12/20, Loss: 2.5804
Bidirectional RNN Epoch 13/20, Loss: 2.5724
Bidirectional RNN Epoch 14/20, Loss: 2.5649
Bidirectional RNN Epoch 15/20, Loss: 2.5579
Bidirectional RNN Epoch 16/20, Loss: 2.5514
Bidirectional RNN Epoch 17/20, Loss: 2.5453
Bidirectional RNN Epoch 18/20, Loss: 2.5395
Bidirectional RNN Epoch 19/20, Loss: 2.5341
Bidirectional RNN Epoch 20/20, Loss: 2.5289


In [74]:
# Evaluation for Bidirectional RNN
print("Evaluating Bidirectional RNN...")
bidirectional_rnn.eval()
y_pred_bidirectional_rnn = []
with torch.no_grad():
    for i in range(0, len(x_test_tensor), batch_size):
        batch_x = x_test_tensor[i:i+batch_size].unsqueeze(1)  # Add sequence dimension
        h0 = torch.zeros(num_layers * 2, batch_x.size(0), hidden_size)  # Multiply by 2 for bidirectional

        rnn_out, _ = bidirectional_rnn(batch_x, h0)
        last_hidden_state = rnn_out[:, -1, :]  # Take the last hidden state
        outputs = fc_bidirectional_rnn(last_hidden_state)
        predicted = (outputs.squeeze() > 0.5).long()  # Apply threshold and convert to long
        y_pred_bidirectional_rnn.extend(predicted.tolist())

accuracy_bidirectional_rnn = accuracy_score(y_test, y_pred_bidirectional_rnn)
print(f"Bidirectional RNN Test Accuracy: {accuracy_bidirectional_rnn:.4f}")

Evaluating Bidirectional RNN...
Bidirectional RNN Test Accuracy: 0.9436
