## **Amazon Kindle Book Review Sentiment Analysis**

- Team 15
- Sentiment analysis on Amazon Kindle Book reviews
- [Data Set](https://www.kaggle.com/datasets/meetnagadia/amazon-kindle-book-review-for-sentiment-analysis)



In [1]:
# Install required libraries
# !pip install transformers torch

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

In [2]:
# Check for GPU availability and fix the random seed
SEED = 42  # same value the train/test split passes as random_state

# Seed PyTorch's global RNG so that DataLoader shuffling and the randomly
# initialised classifier head repeat across "Restart & Run All" runs
# (as far as the backend's kernels are themselves deterministic).
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# 1. Load and Preprocess the Dataset

# Read the review export into a DataFrame (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='all_kindle_review.csv')

In [4]:
# Narrow the frame to the two columns used below: the review text and its rating
data = data.loc[:, ['reviewText', 'rating']]

In [5]:
# Preview the trimmed frame; the rendered output elides the middle rows.
data

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4
...,...,...
11995,Valentine cupid is a vampire- Jena and Ian ano...,4
11996,I have read all seven books in this series. Ap...,5
11997,This book really just wasn't my cuppa. The si...,3
11998,"tried to use it to charge my kindle, it didn't...",1


In [6]:
# Split the frame into model inputs (review text) and targets (rating)
labels = data['rating']
reviews = data['reviewText'].fillna(value="")  # missing reviews become empty strings

In [7]:
# Convert ratings to binary labels: 1 for positive (3, 4, 5), 0 for negative (1, 2).
# The labels are derived from the original `rating` column rather than from
# `labels` itself, so re-running this cell cannot push already-binarised 0/1
# values back through the membership test (which would turn every label into 0).
labels = data['rating'].isin([3, 4, 5]).astype('int64')

In [8]:
# Train-test split
from sklearn.model_selection import train_test_split
# 80/10/10 split: carve off 20% as a hold-out, then halve that hold-out into
# validation and test sets. The hold-out has its own names so that
# `test_x` / `test_y` only ever refer to the final test set.
train_x, holdout_x, train_y, holdout_y = train_test_split(
    reviews, labels, test_size=0.2, random_state=42
)
val_x, test_x, val_y, test_y = train_test_split(
    holdout_x, holdout_y, test_size=0.5, random_state=42
)

In [9]:
# 2. Tokenize the data with a Pre-trained Tokenizer

# Fetch the tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [10]:
# Tokenize the datasets
def tokenize_data(texts, labels, max_length=128):
    """Tokenize ``texts`` and bundle the result with ``labels``.

    Uses the notebook-level ``tokenizer``, called with padding and truncation
    enabled, ``max_length`` forwarded, and PyTorch tensors requested.

    Args:
        texts: object exposing ``.tolist()`` (the notebook passes a pandas
            Series of review strings).
        labels: object exposing ``.values`` (the notebook passes a pandas
            Series of 0/1 labels).
        max_length: forwarded to the tokenizer as ``max_length``.

    Returns:
        TensorDataset whose items are ``(input_ids, attention_mask, label)``.
    """
    batch = tokenizer(
        texts.tolist(),
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    label_tensor = torch.tensor(labels.values)
    return TensorDataset(batch["input_ids"], batch["attention_mask"], label_tensor)

In [11]:
# Tokenize each split once up front; every dataset pairs encoded reviews with their labels
train_dataset = tokenize_data(texts=train_x, labels=train_y)
val_dataset = tokenize_data(texts=val_x, labels=val_y)
test_dataset = tokenize_data(texts=test_x, labels=test_y)

In [12]:
# Batch every split; only the training loader shuffles, evaluation loaders keep dataset order
batch_size = 16
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

In [13]:
# 3. Load the Pre-trained Model

# Load "bert-base-uncased" with a two-label sequence-classification head,
# then move the model onto the selected device.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    num_labels=2,
)
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
# 4. Define Optimizer and Loss Function

# Cross-entropy objective, and AdamW over every model parameter with learning rate 5e-5
criterion = CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [15]:
# 5. Training and Validation Loops

def train_model(model, train_loader, val_loader, epochs=3):
    """Fine-tune ``model`` for ``epochs`` passes over ``train_loader``.

    After each epoch the mean training loss is printed and ``validate_model``
    is run on ``val_loader``. The function relies on the notebook-level
    ``optimizer`` and ``device`` objects and returns nothing.

    Args:
        model: callable module accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keywords and returning an object with a ``.loss`` tensor.
        train_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        val_loader: passed through unchanged to ``validate_model``.
        epochs: number of passes over ``train_loader``.

    Raises:
        ValueError: if ``train_loader`` yields no samples during an epoch.
    """
    for epoch in range(epochs):
        # validate_model() switches to eval mode, so re-enable training mode each epoch.
        model.train()
        loss_sum, n_samples = 0.0, 0

        for batch in train_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            # Clear gradients left over from the previous step, then forward/backward/update.
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Treat the reported loss as a per-batch mean and weight it by the batch
            # size, so a short final batch does not count as much as a full one.
            batch_n = labels.size(0)
            loss_sum += loss.item() * batch_n
            n_samples += batch_n

        if n_samples == 0:
            raise ValueError("train_loader yielded no samples")

        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {loss_sum / n_samples}")
        validate_model(model, val_loader)

def validate_model(model, val_loader):
    """Print ``model``'s classification accuracy over ``val_loader``.

    Switches ``model`` to evaluation mode (and leaves it there), runs every
    batch with gradient tracking disabled, and counts the argmax predictions
    that equal the labels. Uses the notebook-level ``device``. Returns nothing.

    Args:
        model: callable module accepting ``input_ids`` and ``attention_mask``
            keywords and returning an object with a ``.logits`` tensor.
        val_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
    """
    model.eval()
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for ids, mask, targets in val_loader:
            ids, mask, targets = ids.to(device), mask.to(device), targets.to(device)
            logits = model(input_ids=ids, attention_mask=mask).logits
            n_correct += (logits.argmax(dim=-1) == targets).sum().item()
            n_seen += targets.size(0)

    accuracy_pct = 100 * n_correct / n_seen
    print(f"Validation Accuracy: {accuracy_pct:.2f}%")

In [16]:
# Fine-tune for four epochs; each epoch prints the training loss and validation accuracy.
# NOTE(review): in the recorded run, validation accuracy is highest after epoch 1 while
# training loss keeps falling. Worth checking whether fewer epochs would serve better.
train_model(model=model, train_loader=train_loader, val_loader=val_loader, epochs=4)

Epoch 1/4, Training Loss: 0.3444584746907155
Validation Accuracy: 88.08%
Epoch 2/4, Training Loss: 0.20598705117901167
Validation Accuracy: 86.92%
Epoch 3/4, Training Loss: 0.10953348780574743
Validation Accuracy: 86.58%
Epoch 4/4, Training Loss: 0.06920112775425272
Validation Accuracy: 85.83%


In [17]:
# 6. Test the Model
def test_model(model, test_loader):
    model.eval()
    correct, total = 0, 0
    test_loss = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
            test_loss += criterion(outputs.logits, labels).item()

    print(f"Test Loss: {test_loss / len(test_loader):.4f}")
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

# Report loss and accuracy of the fine-tuned model on the held-out test split
test_model(model=model, test_loader=test_loader)

Test Loss: 0.4700
Test Accuracy: 88.33%
