<a href="https://colab.research.google.com/github/wahid028/Sentiment-Analysis/blob/main/BERT_with_PyTorch_Lightening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pytorch-lightning transformers torchmetrics

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m826.4/826.4 KB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 KB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
! pip install -q nltk spacy beautifulsoup4 regex

In [2]:
#install kaggle
!pip install -q kaggle

#upload the kaggle.json file
from google.colab import files
files.upload()

#create a kaggle directory
!mkdir ~/.kaggle

#copy the kaggle.json to kaggle directory
!cp kaggle.json ~/.kaggle/

#permission for the json to act
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [3]:
!kaggle competitions download -c tweet-sentiment-extraction
!unzip tweet-sentiment-extraction.zip

Downloading tweet-sentiment-extraction.zip to /content
 72% 1.00M/1.39M [00:01<00:00, 960kB/s]
100% 1.39M/1.39M [00:01<00:00, 1.25MB/s]
Archive:  tweet-sentiment-extraction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import pytorch_lightning as pl
import torch

# NOTE(review): "all" fetches every NLTK corpus and model, which is a large
# download. No cell visible in this notebook uses an NLTK resource — confirm
# whether this is still needed, or narrow it to the specific packages required.
nltk.download("all")

from bs4 import BeautifulSoup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
from torchmetrics import Accuracy, Precision, Recall
from pytorch_lightning import Trainer

In [46]:
import torch.nn as nn
import torch.optim as optim

In [48]:
from transformers import BertTokenizer, BertForSequenceClassification

In [5]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was selected.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [7]:
# Load the competition training set from the working directory, which is where
# the unzip step extracts train.csv. The previous '../content/train.csv' path
# only resolved when the working directory itself was named `content`.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [8]:
# Only positive and negative tweets are of interest here, so neutral rows are
# filtered out and the remaining rows are renumbered from zero.
df_train = (
    train
    .loc[train['sentiment'].ne('neutral')]
    .reset_index(drop=True)
)

In [9]:
# Keep only the tweet text and its sentiment. The explicit copy makes df_train
# an independent frame, so the column assigned to it afterwards is written to
# its own data and cannot trigger pandas' SettingWithCopyWarning.
df_train = df_train[['text','sentiment']].copy()

In [10]:
#sentiment converter
def sentiment_ts(sentiment):
    """Translate a sentiment name into its integer class id.

    Returns 0 for 'negative', 1 for 'positive', and None for any other value.
    """
    for name, class_id in (('negative', 0), ('positive', 1)):
        if sentiment == name:
            return class_id
    return None
    
# Encode each sentiment string as its integer class id via sentiment_ts.
df_train['label'] = df_train['sentiment'].map(sentiment_ts)

In [11]:
# Model input frame: the raw tweet text alongside its integer label.
train_new = df_train.loc[:, ['text', 'label']]
train_new.head(3)

Unnamed: 0,text,label
0,Sooo SAD I will miss you here in San Diego!!!,0
1,my boss is bullying me...,0
2,what interview! leave me alone,0


In [12]:

# Count missing values per column
train_new.isna().sum()

text     0
label    0
dtype: int64

***For this experiment we are going to skip the pre-processing step to save time.***

In [13]:
# X = train_new["text"]
# y = train_new["label"]

In [14]:
# # Split data into training and validation datasets
# train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
class MyDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per lookup.

    Args:
        data: frame indexed positionally (via ``.iloc``) that exposes
            'text' and 'label' columns.
        tokenizer: object providing ``encode_plus``.
        max_length: length every example is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for the row at ``index``."""
        row = self.data.iloc[index]
        encoded = self.tokenizer.encode_plus(
            row['text'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Index 0 takes the first (only) entry along the leading axis of each
        # returned tensor, giving one flat sequence per example.
        return encoded['input_ids'][0], encoded['attention_mask'][0], row['label']

In [49]:
# Define LightningModule class
class MyModel(pl.LightningModule):
    """LightningModule around ``BertForSequenceClassification`` ('bert-base-uncased').

    Args:
        num_labels: number of classes passed to the classification head.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
        self.loss_function = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only its logits."""
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

    def _batch_loss(self, batch):
        """Unpack ``(input_ids, attention_mask, label)`` and return the cross-entropy loss."""
        input_ids, attention_mask, targets = batch
        logits = self(input_ids, attention_mask)
        return self.loss_function(logits, targets)

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log it as 'train_loss'."""
        step_loss = self._batch_loss(batch)
        self.log('train_loss', step_loss)
        return step_loss

    def validation_step(self, batch, batch_idx):
        """Compute the loss for one validation batch and log it as 'val_loss'."""
        step_loss = self._batch_loss(batch)
        self.log('val_loss', step_loss)
        return step_loss

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 2e-5."""
        return optim.Adam(self.parameters(), lr=2e-5)

In [50]:
# Hyperparameters shared by the dataset, data loader, model and trainer
num_labels = 2     # classes: negative (0) and positive (1)
max_length = 128   # length each tweet is padded/truncated to by the dataset
batch_size = 32    # examples per batch in the data loader
epochs = 3         # upper bound on training epochs

In [51]:
# Initialize tokenizer
# Loads the pretrained 'bert-base-uncased' tokenizer — the same checkpoint name
# that MyModel passes to BertForSequenceClassification.from_pretrained.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [53]:
# Wrap the labelled frame in a tokenizing dataset, then batch it for training:
# reshuffled by the loader, with the final incomplete batch dropped.
train_dataset = MyDataset(data=train_new, tokenizer=tokenizer, max_length=max_length)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    drop_last=True,
    shuffle=True,
)

# val_dataset = MyDataset(val_texts, val_labels, tokenizer, max_length)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

In [54]:
# Spot-check one encoded example: MyDataset.__getitem__ returns
# (input_ids, attention_mask, label).
train_dataset[5]

(tensor([  101,  4990,   999,  1029, 10166,  1012,  1012,  1012,  1057,  2074,
          2150, 14976,  1012,  2002,  5369,  1012,  1012,  1012,  1006,  2003,
          2008,  2825,   999,  1029,  1007,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [55]:
# Initialize model and trainer.
# The `gpus=` Trainer argument was deprecated in PyTorch Lightning 1.7 and
# removed in 2.0, so with the unpinned install above it raises TypeError on a
# fresh run. accelerator/devices is the supported replacement: use every
# visible CUDA GPU when one is available, otherwise run on the CPU.
model = MyModel(num_labels=num_labels)
trainer = pl.Trainer(
    max_epochs=epochs,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices="auto",
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [56]:
# Fine-tune on the training loader only. No validation loader is passed to
# fit(), so MyModel.validation_step is not exercised by this run.
trainer.fit(model, train_dataloaders=train_loader)

  rank_zero_warn(
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type                          | Params
----------------------------------------------------------------
0 | bert          | BertForSequenceClassification | 109 M 
1 | loss_function | CrossEntropyLoss              | 0     
----------------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
