<a href="https://colab.research.google.com/github/wahid028/Sentiment-Analysis/blob/main/BERT_with_PyTorch_Lightening_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pytorch-lightning transformers torchmetrics

In [2]:
! pip install -q nltk spacy beautifulsoup4 regex

In [3]:
#install kaggle
!pip install -q kaggle

#upload the kaggle.json file
from google.colab import files
files.upload()

#create a kaggle directory
!mkdir ~/.kaggle

#copy the kaggle.json to kaggle directory
!cp kaggle.json ~/.kaggle/

#permission for the json to act
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle (2).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [4]:
!kaggle competitions download -c tweet-sentiment-extraction
!unzip tweet-sentiment-extraction.zip

tweet-sentiment-extraction.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  tweet-sentiment-extraction.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [5]:
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.optim as optim

# nltk.download("all")

from bs4 import BeautifulSoup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
# from transformers import BertModel, BertTokenizer
from transformers import BertTokenizer, BertForSequenceClassification
from torchmetrics import Accuracy, Precision, Recall
from pytorch_lightning import Trainer

In [6]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [7]:
# Read the training split from the working directory, which is where the
# competition archive was unzipped (the previous '../content/' prefix only
# resolved when the working directory happened to be /content).
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [8]:
# Keep only positive/negative tweets (neutral rows are out of scope for this
# binary task) and renumber the surviving rows from zero.
df_train = train.loc[train['sentiment'] != 'neutral'].reset_index(drop=True)

In [9]:
# Narrow the frame to the tweet text and its sentiment string.
df_train = df_train.loc[:, ['text', 'sentiment']]

In [10]:
#sentiment converter
def sentiment_ts(sentiment):
    """Map a sentiment string to its integer class id.

    Returns 0 for 'negative' and 1 for 'positive'; any other value yields None.
    """
    # The position in the tuple is the class id.
    for class_id, name in enumerate(('negative', 'positive')):
        if sentiment == name:
            return class_id
    return None
    
# Attach the integer class id derived from each row's sentiment string.
df_train['label'] = df_train['sentiment'].map(sentiment_ts)

In [11]:
# Final training frame: tweet text plus its integer label.
train_new = df_train.loc[:, ['text', 'label']]
train_new.head(3)

Unnamed: 0,text,label
0,Sooo SAD I will miss you here in San Diego!!!,0
1,my boss is bullying me...,0
2,what interview! leave me alone,0


In [12]:
# Count missing values per column in the training frame
train_new.isna().sum()

text     0
label    0
dtype: int64

In [13]:
# Drop the intermediate training frames; only train_new is used from here on.
del train, df_train

In [14]:
# Read the test split from the working directory, which is where the
# competition archive was unzipped.
test = pd.read_csv('test.csv')

# Drop the rows with neutral sentiment, as we are only interested in positive
# and negative sentiment, then renumber the remaining rows.
df_test = test[test['sentiment'] != 'neutral'].reset_index(drop=True)

df_test = df_test[['text', 'sentiment']]

# Reuse the sentiment_ts converter defined in the training-data cell earlier
# in the notebook rather than redefining an identical copy here.
df_test['label'] = df_test['sentiment'].apply(sentiment_ts)

test_data = df_test[['text', 'label']]
test_data.head(3)

Unnamed: 0,text,label
0,Shanghai is also really exciting (precisely -...,1
1,"Recession hit Veronique Branquinho, she has to...",0
2,happy bday!,1


In [15]:
# Drop the intermediate test frames; only test_data is used from here on.
del test, df_test

In [16]:
# Count missing values per column in the training frame
train_new.isna().sum()

text     0
label    0
dtype: int64

In [17]:
# Count missing values per column in the test frame
test_data.isna().sum()

text     0
label    0
dtype: int64

**Note:** For this experiment we skip the text pre-processing step to save time.

In [18]:
# Split the labelled training frame into train and validation parts:
# 20% of the rows are held out for validation, and the fixed random_state
# makes the split reproducible across runs.
train_data, val_data = train_test_split(train_new, test_size=0.2, random_state=42)

In [19]:
class MyDataset(Dataset):
    """Dataset that tokenizes one row of a tabular text/label collection per item.

    Args:
        data: Object supporting ``len()`` and ``.iloc[index]`` whose rows expose
            ``'text'`` and ``'label'`` entries (the notebook passes DataFrames).
        tokenizer: Object providing ``encode_plus``; its result must expose
            ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length forwarded to the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Return the number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for the row at ``index``."""
        row = self.data.iloc[index]
        tokenizer_options = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(row['text'], **tokenizer_options)
        # The tokenizer output carries a leading dimension; take its first entry.
        input_ids = encoded['input_ids'][0]
        attention_mask = encoded['attention_mask'][0]
        return input_ids, attention_mask, row['label']

In [20]:
# Define LightningModule class
class MyModel(pl.LightningModule):
    """BERT sequence classifier ('bert-base-uncased') wrapped as a LightningModule.

    Args:
        num_labels: Number of output classes for the classification head.
        learning_rate: Optional learning rate for the Adam optimizer. When left
            as ``None`` the notebook-level ``learning_rate`` variable is read at
            optimizer-configuration time, which keeps existing
            ``MyModel(num_labels)`` calls working unchanged.
    """

    def __init__(self, num_labels, learning_rate=None):
        super().__init__()
        self.num_labels = num_labels
        self.learning_rate = learning_rate
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
        self.loss_function = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask):
        """Run the wrapped BERT classifier and return its logits."""
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return output.logits

    def _shared_step(self, batch):
        """Unpack a batch, run the forward pass and return (loss, logits, label)."""
        input_ids, attention_mask, label = batch
        logits = self(input_ids, attention_mask)
        loss = self.loss_function(logits, label)
        return loss, logits, label

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        loss, _, _ = self._shared_step(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one validation batch."""
        loss, _, _ = self._shared_step(batch)
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Log the loss and the batch accuracy for one test batch."""
        loss, logits, label = self._shared_step(batch)
        self.log('test_loss', loss)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == label).float().mean()
        self.log('test_acc', acc)

    def configure_optimizers(self):
        """Build the Adam optimizer over all model parameters."""
        # Prefer the per-instance rate; otherwise fall back to the notebook-level
        # `learning_rate` variable so models built without the argument train
        # exactly as before.
        lr = self.learning_rate if self.learning_rate is not None else learning_rate
        optimizer = optim.Adam(self.parameters(), lr=lr)
        return optimizer

In [21]:
# Hyperparameters shared by the dataset, model and trainer cells
num_labels = 2         # negative / positive
max_length = 128       # length passed to the tokenizer for padding/truncation
batch_size = 16        # examples per DataLoader batch
epochs = 3             # passed to the trainer as max_epochs
learning_rate = 2e-5   # Adam learning rate

In [22]:
# Initialize the tokenizer from the same 'bert-base-uncased' checkpoint name
# that MyModel loads its weights from
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [23]:
# Create datasets and data loaders for training, validation and testing.
# Training batches are shuffled, and the last incomplete batch is dropped so
# every optimisation step sees a full batch.
train_dataset = MyDataset(train_data, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Evaluation loaders keep every example: dropping the last partial batch would
# silently exclude up to batch_size - 1 samples from the reported metrics.
val_dataset = MyDataset(val_data, tokenizer, max_length)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

test_dataset = MyDataset(test_data, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

In [24]:
# Inspect one encoded training example: (input_ids, attention_mask, label)
train_dataset[5]

(tensor([  101,  1045,  2293,  2115,  6045,   999,  2061,  4658,  1012,  2008,
          3504,  2066,  2009,  2001,  1037,  4569,  2154,  1012,  1998,  1045,
          2293,  2008,  2017,  2109,  1996,  2773,  1005, 13675, 17339,  2100,
          1005,  2074,  2085,   999,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [26]:
# Initialize the model
model = MyModel(num_labels)

# Checkpoint callback: keep the weights with the lowest validation loss
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    dirpath='./',
    filename='best_model'
)

# Select hardware through `accelerator`/`devices`; the older `gpus=` argument is
# no longer accepted by recent PyTorch Lightning releases, and the install cell
# does not pin a version.
trainer = pl.Trainer(
    max_epochs=epochs,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=torch.cuda.device_count() or 1,  # all visible GPUs, or one CPU process
    callbacks=[checkpoint_callback],
)
trainer.fit(model, train_loader, val_loader)

# Load the best model checkpoint for testing
best_model = MyModel.load_from_checkpoint(checkpoint_callback.best_model_path, num_labels=num_labels)

# Test the model on the full test set (no shuffling, no dropped batches)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=2)
trainer.test(best_model, test_loader)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.9401140809059143
        test_loss           0.16429728269577026
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.16429728269577026, 'test_acc': 0.9401140809059143}]