## Himanshu, MDS202327

#### Importing packages

In [17]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification

from sklearn.metrics import classification_report

#### Preprocessing

In [3]:
df_train = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/train.csv", encoding="latin1")
df_test = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/test.csv", encoding="latin1")

# Select necessary columns and remove the null values
df_train = pd.DataFrame(df_train[['text', 'sentiment']])
df_train = df_train.dropna()

df_test = pd.DataFrame(df_test[['text', 'sentiment']])
df_test = df_test.dropna()

In [4]:
df_train.shape, df_test.shape

((27480, 2), (3534, 2))

In [5]:
df_train.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [6]:
df_train['sentiment'].value_counts()

sentiment
neutral     11117
positive     8582
negative     7781
Name: count, dtype: int64

In [7]:
df_test['sentiment'].value_counts()

sentiment
neutral     1430
positive    1103
negative    1001
Name: count, dtype: int64

In [8]:
# Mp the labels
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
df_train['label'] = df_train['sentiment'].map(label_map)
df_test['label'] = df_test['sentiment'].map(label_map)

#### Custom Data Set

In [9]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

#### Model Building

In [10]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [18]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
train_dataset = CustomDataset(df_train['text'].tolist(), df_train['label'].tolist(), tokenizer)
test_dataset = CustomDataset(df_test['text'].tolist(), df_test['label'].tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [13]:
epochs = 5
learning_rate = 2e-5
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

#### Training model

In [14]:
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    total_batches = 0
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        total_batches += 1

        # Print loss for batch (only the multiples of 100)
        if batch_idx %100 == 99:
          print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.6f}")

    avg_loss = total_loss / total_batches
    print()
    # Print average loss for each epoch
    print(f"Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_loss:.6f}", end = "\n")
    print(50*'-')

Epoch 1/5, Batch 100/859, Loss: 0.598016
Epoch 1/5, Batch 200/859, Loss: 0.565505
Epoch 1/5, Batch 300/859, Loss: 0.685714
Epoch 1/5, Batch 400/859, Loss: 0.507706
Epoch 1/5, Batch 500/859, Loss: 0.838196
Epoch 1/5, Batch 600/859, Loss: 0.508803
Epoch 1/5, Batch 700/859, Loss: 0.436642
Epoch 1/5, Batch 800/859, Loss: 0.521657

Epoch 1/5, Average Training Loss: 0.584671
--------------------------------------------------
Epoch 2/5, Batch 100/859, Loss: 0.520245
Epoch 2/5, Batch 200/859, Loss: 0.318612
Epoch 2/5, Batch 300/859, Loss: 0.473606
Epoch 2/5, Batch 400/859, Loss: 0.602207
Epoch 2/5, Batch 500/859, Loss: 0.170384
Epoch 2/5, Batch 600/859, Loss: 0.430223
Epoch 2/5, Batch 700/859, Loss: 0.258519
Epoch 2/5, Batch 800/859, Loss: 0.489507

Epoch 2/5, Average Training Loss: 0.421391
--------------------------------------------------
Epoch 3/5, Batch 100/859, Loss: 0.153960
Epoch 3/5, Batch 200/859, Loss: 0.277719
Epoch 3/5, Batch 300/859, Loss: 0.677034
Epoch 3/5, Batch 400/859, Loss:

In [15]:
model.eval()
true_labels = []
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"]
        true_labels.extend(labels.tolist())
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        predictions.extend(predicted.tolist())

#### Report

In [16]:
print(classification_report(true_labels, predictions, target_names=['negative', 'neutral', 'positive'], digits = 6))

              precision    recall  f1-score   support

    negative   0.749773  0.826174  0.786122      1001
     neutral   0.769946  0.695105  0.730614      1430
    positive   0.814912  0.842248  0.828355      1103

    accuracy                       0.778155      3534
   macro avg   0.778210  0.787842  0.781697      3534
weighted avg   0.778267  0.778155  0.776842      3534

