# Amazon Reviews for Sentiment Analysis

In [32]:
import pandas as pd
import numpy as np

import lightning as L
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
# NOTE: the next import rebinds the bare name `Dataset` to the Hugging Face class,
# shadowing torch.utils.data.Dataset imported just above. Keep this order: code that
# calls `Dataset.from_pandas` relies on the Hugging Face class winning.
from datasets import Dataset, DatasetDict
import torch.nn.functional as F
from torchmetrics import Accuracy
from lightning.pytorch.callbacks import Callback, ModelCheckpoint
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from lightning.pytorch.loggers import CSVLogger

In [34]:
# Read the train and test files into one-column ('text') DataFrames.
# A tab separator with a single column name keeps each line as one value
# (assumes the reviews contain no tab characters - TODO confirm).
READ_OPTIONS = dict(sep="\t", header=None, names=['text'])
train_dataset = pd.read_csv('train.ft.txt', **READ_OPTIONS)
test_dataset = pd.read_csv('test.ft.txt', **READ_OPTIONS)

In [35]:
# Preview the first rows of the raw training frame (the label prefix is still part of 'text')
train_dataset.head()

Unnamed: 0,text
0,__label__2 Stuning even for the non-gamer: Thi...
1,__label__2 The best soundtrack ever to anythin...
2,__label__2 Amazing!: This soundtrack is my fav...
3,__label__2 Excellent Soundtrack: I truly like ...
4,"__label__2 Remember, Pull Your Jaw Off The Flo..."


In [36]:
# Split the leading label prefix off every review in the training and test frames:
#   '__label__1' -> label 0, '__label__2' -> label 1
# The pattern is anchored at the start of the string so that a literal '__label__1'
# inside a review body can neither flip the label nor be stripped from the text.
LABEL_PREFIX = r'^__label__([12])'

for frame in (train_dataset, test_dataset):
    prefix_digit = frame['text'].str.extract(LABEL_PREFIX, expand=False)

    # Fail loudly instead of silently assigning label 1 to every row, which is what
    # happens when this cell is re-run on text whose prefix was already removed.
    if prefix_digit.isna().any():
        raise ValueError("every row must start with '__label__1' or '__label__2'")

    frame['label'] = prefix_digit.map({'1': 0, '2': 1}).astype('int64')

    # Drop the prefix, then the whitespace that separated it from the review
    frame['text'] = frame['text'].str.replace(LABEL_PREFIX, '', regex=True).str.strip()

In [37]:
# Preview the first rows again to check the cleaned 'text' column and the new 'label' column
train_dataset.head()

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1


In [38]:
# Count training rows per class: position i of the result is the number of rows with label i
print("Class distribution:")
np.bincount(train_dataset["label"].to_numpy())

Class distribution:


array([1800000, 1800000])

In [39]:
# Whitespace-separated word count of every training review,
# summarised as (shortest, median, longest)
text_len = train_dataset["text"].map(lambda review: len(review.split()))
(text_len.min(), text_len.median(), text_len.max())

(np.int64(2), np.float64(70.0), np.int64(257))

In [40]:
# Shuffle all rows once with a fixed seed so the split below is repeatable
train_dataset_shuffled = train_dataset.sample(frac=1, random_state=1).reset_index(drop=True)

# Number of rows held out for validation: 20% of the shuffled frame
valid_size = int(train_dataset_shuffled.shape[0] * 0.2)

# The first `valid_size` shuffled rows become validation; the remaining 80% stay for training.
# NOTE(review): `train_dataset` is rebound here, so re-running this cell splits the
# already-reduced training frame again.
valid_dataset, train_dataset = (
    train_dataset_shuffled.iloc[:valid_size],
    train_dataset_shuffled.iloc[valid_size:],
)

In [41]:
# Convert each pandas split to a Hugging Face Dataset and collect them in one DatasetDict
split_frames = {
    "train": train_dataset,
    "validation": valid_dataset,
    "test": test_dataset,
}
amazon_dataset = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames.items()}
)

print(amazon_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2880000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 720000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 400000
    })
})


In [42]:
# Fetch the tokenizer of the pretrained DistilBERT checkpoint and report its limits
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
max_input_length, vocabulary_size = tokenizer.model_max_length, tokenizer.vocab_size
print("Tokenizer input max length:", max_input_length)
print("Tokenizer vocabulary size:", vocabulary_size)

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [45]:
# Tokenization helper applied to batches of the dataset
def tokenize_text(batch):
    """Tokenize the "text" entries of ``batch`` with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 256.

    Args:
        batch: Mapping with a "text" entry holding the raw reviews.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    tokenizer_options = dict(truncation=True, padding="max_length", max_length=256)
    return tokenizer(batch["text"], **tokenizer_options)

In [46]:
# Tokenize every split of the DatasetDict, passing tokenize_text 5000 rows per call
amazon_tokenized = amazon_dataset.map(
    function=tokenize_text,
    batched=True,
    batch_size=5000,
)

Map:   0%|          | 0/2880000 [00:00<?, ? examples/s]

Map:   0%|          | 0/720000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

In [47]:
# Return torch tensors on indexing and expose only the three columns listed here
amazon_tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [48]:
# Define a custom PyTorch Dataset
class AmazonDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset exposing one split of a mapping of splits.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    the bare name ``Dataset`` is imported from both ``torch.utils.data`` and
    ``datasets`` at the top of the notebook and the later import wins, so
    ``class AmazonDataset(Dataset)`` subclassed the Hugging Face class (without
    ever initialising it) instead of the PyTorch one.

    Args:
        dataset_dict: Mapping from split name to an indexable, sized collection
            (in this notebook, the tokenized ``DatasetDict``).
        partition_key: Name of the split to expose. Defaults to ``"train"``.
    """

    def __init__(self, dataset_dict, partition_key="train"):
        self.partition = dataset_dict[partition_key]

    def __getitem__(self, index):
        """Return the example stored at ``index`` in the wrapped split."""
        return self.partition[index]

    def __len__(self):
        """Return the number of examples in the wrapped split."""
        # len() instead of .num_rows so that any sized collection can be wrapped
        return len(self.partition)

In [88]:
# Number of examples per batch, shared by all three loaders
BATCH_SIZE = 32

# One AmazonDataset wrapper per split of the tokenized DatasetDict
train_dataset, val_dataset, test_dataset = (
    AmazonDataset(amazon_tokenized, partition_key=split_name)
    for split_name in ("train", "validation", "test")
)

# DataLoaders: only the training loader is asked to shuffle
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [73]:
# Load pretrained DistilBERT with a sequence-classification head sized for 2 labels
pretrained_name, label_count = "distilbert-base-uncased", 2
bert_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_name, num_labels=label_count)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
# Freeze the whole network: switch off gradient tracking for every parameter in place.
# The trailing semicolon stops the notebook from echoing the returned module.
bert_model.requires_grad_(False);

In [75]:
# Display the model's module structure
bert_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [76]:
# Re-enable gradients for the two head modules only:
# the pre-classifier layer and the final classification layer
for head_module in (bert_model.pre_classifier, bert_model.classifier):
    head_module.requires_grad_(True)

In [79]:
# Define a PyTorch Lightning module to wrap the BERT classifier
class LightningModel(L.LightningModule):
    """Lightning wrapper that trains and evaluates a sequence classifier.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output can be indexed with ``"loss"`` and ``"logits"``.
        learning_rate: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, model, learning_rate):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        # The wrapped network itself is excluded from the saved hyperparameters
        self.save_hyperparameters(ignore=["model"])

        # A separate metric object per stage (train / validation / test)
        self.train_acc = Accuracy(task="multiclass", num_classes=2)
        self.val_acc = Accuracy(task="multiclass", num_classes=2)
        self.test_acc = Accuracy(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output unchanged.

        Args:
            input_ids: Token ids forwarded to the wrapped model.
            attention_mask: Attention mask forwarded to the wrapped model.
            labels: Optional targets. Defaults to ``None`` so the module can also
                be called without labels; in that case ``labels=None`` is what the
                wrapped model receives (presumably no loss is produced then -
                confirm against the wrapped model's documentation).

        Returns:
            The output object of the wrapped model.
        """
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _shared_step(self, batch, batch_idx):
        """Forward one batch and derive the predicted class indices.

        Args:
            batch: Mapping with "input_ids", "attention_mask" and "label" entries.
            batch_idx: Index of the batch (unused).

        Returns:
            Tuple ``(outputs, predicted_labels)`` where ``predicted_labels`` is the
            argmax of the logits along dimension 1.
        """
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["label"])
        logits = outputs["logits"]
        predicted_labels = torch.argmax(logits, 1)
        return outputs, predicted_labels

    def training_step(self, batch, batch_idx):
        """Log the training loss and accuracy; return the loss."""
        outputs, predicted_labels = self._shared_step(batch, batch_idx)

        self.log("train_loss", outputs['loss'])

        # Accuracy is logged once per epoch rather than at every step
        self.train_acc(predicted_labels, batch["label"])
        self.log("train_acc", self.train_acc, prog_bar=True, on_epoch=True, on_step=False)
        return outputs["loss"]

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and accuracy under "val_loss" and "val_acc"."""
        outputs, predicted_labels = self._shared_step(batch, batch_idx)

        self.log("val_loss", outputs["loss"], prog_bar=True)

        self.val_acc(predicted_labels, batch["label"])
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Log the test accuracy under "test_acc"."""
        outputs, predicted_labels = self._shared_step(batch, batch_idx)

        self.test_acc(predicted_labels, batch["label"])
        self.log("test_acc", self.test_acc, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over this module's parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

In [80]:
# Wrap the DistilBERT classifier in the Lightning module, using a learning rate of 0.01
lightning_model = LightningModel(model=bert_model, learning_rate=0.01)

# Keep a single checkpoint: the one with the highest logged "val_acc"
best_checkpoint = ModelCheckpoint(monitor="val_acc", mode="max", save_top_k=1)
callbacks = [best_checkpoint]

# CSV logger configured with save_dir "logs/" and run name "bert-model"
logger = CSVLogger("logs/", name="bert-model")


In [82]:
# Select the 'medium' internal precision setting for float32 matrix multiplications
torch.set_float32_matmul_precision('medium')

In [89]:
# Seed the random number generators before training starts.
# NOTE(review): the model (including its newly initialised head layers) was created in
# an earlier cell, before this seed is set, so its initial weights are not covered by it.
L.seed_everything(123)

# Trainer settings: 3 epochs on the GPU with 16-bit mixed precision,
# deterministic mode, the checkpoint callback and the CSV logger
trainer_options = dict(
    max_epochs=3,
    callbacks=callbacks,
    accelerator='gpu',
    precision="16-mixed",
    deterministic=True,
    logger=logger,
)
trainer = L.Trainer(**trainer_options)

# Fit on the training loader, validating on the validation loader
trainer.fit(lightning_model, train_loader, val_loader)

Seed set to 123
Using 16bit Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Seed set to 123
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                                | Params | Mode 
--------------------------------------------------------------------------
0 | model     | DistilBertForSequenceClassification | 67.0 M | eval 
1 | train_acc | MulticlassAccuracy                  | 0      | train
2 | val_acc   | MulticlassAccuracy                  | 0      | train
3 | test_acc  | MulticlassAccuracy                  | 0      | train
--------------------------------------------------------------------------
592 K     Trainable params
66.4 M    Non-trainable params
67.0 M    Total params
267.820   To

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


In [90]:
# Evaluate on the test loader with the weights of the best saved checkpoint
trainer.test(
    model=lightning_model,
    dataloaders=test_loader,
    ckpt_path="best",
)

Restoring states from the checkpoint path at logs/bert-model\version_5\checkpoints\epoch=2-step=270000.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/bert-model\version_5\checkpoints\epoch=2-step=270000.ckpt
C:\Users\pc\anaconda3\envs\new2\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:433: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_acc': 0.9036549925804138}]