# Sentiment Analysis with Deep Learning using BERT

## Exploratory Data Analysis and Preprocessing

In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-team/Text-Datasets/refs/heads/main/Reddit_Data.csv')
df.dropna(inplace=True)
df.index.name = 'id'

In [3]:
df.rename(columns={'clean_comment':'text'},inplace=True)

In [4]:
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [5]:
df.text.iloc[0]

' family mormon have never tried explain them they still stare puzzled from time time like some kind strange creature nonetheless they have come admire for the patience calmness equanimity acceptance and compassion have developed all the things buddhism teaches '

In [6]:
df.category.value_counts()

category
 1    15830
 0    13042
-1     8277
Name: count, dtype: int64

In [7]:
df.category.value_counts()

category
 1    15830
 0    13042
-1     8277
Name: count, dtype: int64

In [8]:
possible_labels = df.category.unique()

In [9]:
possible_labels.sort()

In [10]:
# -1:0, 0:1, 1:2
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [11]:
df['label'] = df.category.replace(label_dict)

In [12]:
df.head()

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,family mormon have never tried explain them t...,1,2
1,buddhism has very much lot compatible with chr...,1,2
2,seriously don say thing first all they won get...,-1,0
3,what you have learned yours and only yours wha...,0,1
4,for your own benefit you may want read living ...,1,2


In [13]:
df['label'].value_counts()

label
2    15830
1    13042
0     8277
Name: count, dtype: int64

## Training/Validation Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, df.label.values, test_size = 0.15, random_state=17, stratify = df.label.values)

In [16]:
df['data_type'] = ['not_set']*df.shape[0]

In [17]:
df.head()

Unnamed: 0_level_0,text,category,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,family mormon have never tried explain them t...,1,2,not_set
1,buddhism has very much lot compatible with chr...,1,2,not_set
2,seriously don say thing first all they won get...,-1,0,not_set
3,what you have learned yours and only yours wha...,0,1,not_set
4,for your own benefit you may want read living ...,1,2,not_set


In [18]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [19]:
df.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
-1,0,train,7035
-1,0,val,1242
0,1,train,11086
0,1,val,1956
1,2,train,13455
1,2,val,2375


##  Loading Tokenizer and Encoding our Data

In [20]:
# !pip install transformers

In [21]:
import sys
print(sys.executable)


/opt/conda/bin/python


In [22]:
# !python -m pip install transformers

In [23]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [25]:
encoded_data_train = tokenizer.batch_encode_plus(df[df.data_type=='train'].text.values, add_special_tokens=True,return_attention_mask=True,pad_to_max_length=True,max_length=256,return_tensors='pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [26]:
encoded_data_val= tokenizer.batch_encode_plus(df[df.data_type=='val'].text.values, add_special_tokens=True,return_attention_mask=True,pad_to_max_length=True,max_length=256,return_tensors='pt')

In [27]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

In [28]:
input_ids_val = encoded_data_val['input_ids']
attention_masks_val= encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

In [29]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [30]:
len(dataset_train)

31576

In [31]:
len(dataset_val)

5573

##  Setting up BERT Pretrained Model

In [32]:
from transformers import BertForSequenceClassification

In [33]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = len(label_dict), output_attentions=False,output_hidden_states=False)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##  Creating Data Loaders

In [34]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [35]:
batch_size = 16 #32

dataloader_train = DataLoader(dataset_train,sampler=RandomSampler(dataset_train),batch_size=batch_size)

In [36]:

dataloader_val = DataLoader(dataset_val,sampler=RandomSampler(dataset_val),batch_size=32)

##  Setting Up Optimizer and Scheduler

In [37]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [38]:
optimizer = AdamW(model.parameters(),lr=1e-5, #2e-5 > 5e-5
                 eps=1e-8)



In [39]:
epochs=10

scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=len(dataloader_train)*epochs)

##  Defining our Performance Metrics

In [40]:
import numpy as np
from sklearn.metrics import f1_score

In [41]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [42]:
def accuracy_per_class(preds, labels):
    labels_dict_inverse = {v: k for k,v in label_dict.items()}

    preds_flat = np.argmax(preds,axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {labels_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

## Creating our Training Loop

In [43]:
import random

seed_val = 8
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [44]:
# !python -m pip install --upgrade pip
# %pip install torch --index-url https://download.pytorch.org/whl/cu125


In [45]:
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

In [46]:
import torch
print(torch.__version__)  # Check PyTorch version
print(torch.version.cuda)  # Check the CUDA version PyTorch is using


2.4.0
12.3


In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [48]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals


In [49]:
import torch
import os
from sklearn.metrics import accuracy_score


# Define checkpoint directory
checkpoint_dir = '/kaggle/working/checkpoints'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Initialize variables to track the best validation loss
best_val_loss = float('inf')

for epoch in tqdm(range(1, epochs + 1)):
    
    model.train()
    
    loss_train_total = 0
    correct_predictions = 0  # To keep track of correct predictions
    total_predictions = 0     # To keep track of total predictions
    
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    
    for batch in progress_bar:
        
        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        
        # Get predictions
        preds = torch.argmax(outputs[1], dim=1)  # Assuming the logits are in outputs[1]
        
        # Update correct and total counts
        correct_predictions += (preds == batch[2]).sum().item()
        total_predictions += batch[2].size(0)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total / len(dataloader_train)
    training_accuracy = correct_predictions / total_predictions  # Calculate training accuracy
    
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Training Accuracy: {training_accuracy:.3f}')  # Print training accuracy
    
    # Evaluate on the validation set
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    
    # Calculate validation accuracy
    predictions = np.argmax(predictions, axis=1).flatten()
    val_accuracy = accuracy_score(true_vals.flatten(), predictions)
    
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')
    tqdm.write(f'Validation Accuracy: {val_accuracy:.3f}')
    
    # Save checkpoint if the validation loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': val_loss
        }, checkpoint_path)
        tqdm.write(f'Checkpoint saved at {checkpoint_path}')

# Save the final model after training
final_model_path = './final_model.pth'
torch.save(model.state_dict(), final_model_path)
tqdm.write(f'Final model saved at {final_model_path}')


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.4779462190769545
Training Accuracy: 0.811
Validation loss: 0.2879312475025654
F1 Score (weighted): 0.9032234747376024
Validation Accuracy: 0.903
Checkpoint saved at /kaggle/working/checkpoints/checkpoint_epoch_1.pth


Epoch 2:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.22503609461884422
Training Accuracy: 0.931
Validation loss: 0.23646268107263105
F1 Score (weighted): 0.9288731503580463
Validation Accuracy: 0.930
Checkpoint saved at /kaggle/working/checkpoints/checkpoint_epoch_2.pth


Epoch 3:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.15129001614219661
Training Accuracy: 0.959
Validation loss: 0.20150324574006456
F1 Score (weighted): 0.9454248551490277
Validation Accuracy: 0.945
Checkpoint saved at /kaggle/working/checkpoints/checkpoint_epoch_3.pth


Epoch 4:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.1100906263880317
Training Accuracy: 0.973
Validation loss: 0.22383990299887954
F1 Score (weighted): 0.9505480205397512
Validation Accuracy: 0.950


Epoch 5:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.07873798537725574
Training Accuracy: 0.981
Validation loss: 0.23612968459326242
F1 Score (weighted): 0.9565604060342867
Validation Accuracy: 0.957


Epoch 6:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.057735040192465475
Training Accuracy: 0.987
Validation loss: 0.2471530908549903
F1 Score (weighted): 0.9540071232635922
Validation Accuracy: 0.954


Epoch 7:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.04003260894114001
Training Accuracy: 0.991
Validation loss: 0.2712369043621168
F1 Score (weighted): 0.9529113506807391
Validation Accuracy: 0.953


Epoch 8:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.02979317224786905
Training Accuracy: 0.993
Validation loss: 0.2767969179465061
F1 Score (weighted): 0.9590055221894286
Validation Accuracy: 0.959


Epoch 9:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.02045490874422636
Training Accuracy: 0.995
Validation loss: 0.305193204352233
F1 Score (weighted): 0.954467597157204
Validation Accuracy: 0.954


Epoch 10:   0%|          | 0/1974 [00:00<?, ?it/s]


Epoch 10
Training loss: 0.01615429047543213
Training Accuracy: 0.996
Validation loss: 0.31359534815007856
F1 Score (weighted): 0.9556063755241423
Validation Accuracy: 0.956
Final model saved at ./final_model.pth


##  Loading and Evaluating our Model

In [50]:
_, predictions, true_vals = evaluate(dataloader_val)

In [51]:
accuracy_per_class(predictions, true_vals )

Class: -1
Accuracy: 1139/1242

Class: 0
Accuracy: 1916/1956

Class: 1
Accuracy: 2271/2375



In [52]:
from sklearn.metrics import classification_report

# Calculate validation accuracy
predictions = np.argmax(predictions, axis=1).flatten()
true_vals = true_vals.flatten()

# Generate classification report
report = classification_report(true_vals, predictions, target_names=['Class 0', 'Class 1', 'Class 2'])  # Change class names as per your dataset
print(report)

              precision    recall  f1-score   support

     Class 0       0.93      0.92      0.92      1242
     Class 1       0.97      0.98      0.98      1956
     Class 2       0.96      0.96      0.96      2375

    accuracy                           0.96      5573
   macro avg       0.95      0.95      0.95      5573
weighted avg       0.96      0.96      0.96      5573

