<a href="https://colab.research.google.com/github/tekliyetamiru/Fine-Tuning-BERT-for-Multi-Class-Sentiment-Classification-for-Twitter-Tweets/blob/main/Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Multi-Class Sentiment Recognition for Twitter Tweets using HuggingFace Transformers**

In [None]:
!pip install -U transformers
!pip install -U accelerates
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn
!pip install seaborn --upgrade

In [None]:
# Upgrade pyarrow, datasets and pandas together in a single pip invocation.
# NOTE(review): `datasets` is already upgraded by the previous install cell;
# presumably this cell exists to keep pyarrow/pandas at compatible versions — confirm.
!pip install -U pyarrow datasets pandas

## **Loading Data from GitHub**

In [37]:
import pandas as pd

# Location of the raw multi-class sentiment CSV.
DATA_URL = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/twitter_multi_class_sentiment.csv"

# Read the whole file into memory and show it (pandas truncates the display).
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger
...,...,...,...
15995,i just had a very brief time in the beanbag an...,0,sadness
15996,i am now turning and i feel pathetic that i am...,0,sadness
15997,i feel strong and good overall,1,joy
15998,i feel like this was such a rude comment and i...,3,anger


In [38]:
# Count missing values per column before any modelling.
df.isna().sum()

text          0
label         0
label_name    0
dtype: int64

In [39]:
# Number of tweets per label id; the classes are imbalanced, which is why the
# train/test/validation split further down is stratified.
df['label'].value_counts()

label
1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: count, dtype: int64

In [40]:
# Summary statistics for the numeric columns of the frame.
df.describe()

Unnamed: 0,label
count,16000.0
mean,1.565937
std,1.50143
min,0.0
25%,0.0
50%,1.0
75%,3.0
max,5.0


## **Tokenization**

In [41]:
from transformers import AutoTokenizer

# Example sentence used only to inspect what the tokenizer produces.
text = "I love machine learning! Tokenization is awesome!!"

# Checkpoint identifier; the same name is reused when the models are loaded later.
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt,
)

# The encoding holds input_ids, token_type_ids and attention_mask for the sentence.
encoded_text = tokenizer(text)
print(encoded_text)

{'input_ids': [101, 1045, 2293, 3698, 4083, 999, 19204, 3989, 2003, 12476, 999, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [42]:
from sklearn.model_selection import train_test_split

# Fixed seed so that train/test/validation membership is identical on every
# run; without it each execution produces a different split and the results
# below cannot be reproduced.
SEED = 42

# 70% of the rows go to training; the remaining 30% are held out.
# Stratifying on the label keeps the class proportions equal across splits.
train, test = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label_name'],
    random_state=SEED,
)

# One third of the held-out rows become the validation set, giving a
# 70 / 20 / 10 train / test / validation partition overall.
test, validation = train_test_split(
    test,
    test_size=1/3,
    stratify=test['label_name'],
    random_state=SEED,
)
train.shape, test.shape, validation.shape

((11200, 3), (3200, 3), (1600, 3))

In [43]:
from datasets import Dataset, DatasetDict

# Pair each split name with its pandas frame, then convert them all the same way.
# preserve_index=False is passed so the DataFrame index is not carried over.
split_frames = {'train': train, 'test': test, 'validation': validation}
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame, preserve_index=False)
        for split_name, frame in split_frames.items()
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name'],
        num_rows: 11200
    })
    test: Dataset({
        features: ['text', 'label', 'label_name'],
        num_rows: 3200
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name'],
        num_rows: 1600
    })
})

In [44]:
# Inspect the first training example (a dict with text, label and label_name).
dataset['train'][0]

{'text': 'i mean i know how it feels that a person is valued by the family if s he gives money or food to the table',
 'label': 1,
 'label_name': 'joy'}

In [45]:
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` key holding the strings to encode.

    Returns:
        The tokenizer output for ``batch['text']``, produced with padding and
        truncation enabled (shorter sequences in the batch are padded).
    """
    return tokenizer(batch['text'], padding=True, truncation=True)


# Try the function on the first two training examples.
sample_batch = dataset['train'][:2]
tokenize(sample_batch)

{'input_ids': [[101, 1045, 2812, 1045, 2113, 2129, 2009, 5683, 2008, 1037, 2711, 2003, 11126, 2011, 1996, 2155, 2065, 1055, 2002, 3957, 2769, 2030, 2833, 2000, 1996, 2795, 102], [101, 1045, 2064, 2102, 2022, 1037, 17220, 2005, 2017, 1999, 1996, 2126, 1045, 2514, 1045, 2323, 1045, 2572, 2205, 5591, 2870, 102, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}

In [46]:
# Tokenize every split. Per the `datasets` documentation, batch_size=None
# hands each split to `tokenize` as one single batch, so padding=True pads all
# examples of a split to that split's longest sequence.
emotion_encoded = dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/11200 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

In [47]:
# Derive the label-name <-> label-id lookup tables from the training split.
# Whole columns are read instead of iterating row by row, so the tweet text of
# every example is not materialised just to collect the distinct labels.
train_split = dataset['train']
label_pairs = set(zip(train_split['label_name'], train_split['label']))

# Sort by id so the mapping order does not depend on how the split was shuffled.
label2id = dict(sorted(label_pairs, key=lambda pair: pair[1]))
id2label = {label_id: name for name, label_id in label2id.items()}

# Each name must map to exactly one id and vice versa; otherwise one of the
# dicts above has silently dropped an entry.
assert len(label2id) == len(id2label) == len(label_pairs), \
    "label names and label ids are not one-to-one"

label2id, id2label

({'joy': 1, 'sadness': 0, 'fear': 4, 'love': 2, 'anger': 3, 'surprise': 5},
 {1: 'joy', 0: 'sadness', 4: 'fear', 2: 'love', 3: 'anger', 5: 'surprise'})

In [48]:
from transformers import AutoModel
import torch

In [49]:
# Load the checkpoint through the generic AutoModel class.
# NOTE(review): `model` is reassigned to an AutoModelForSequenceClassification
# instance in the next cell, so this object is discarded without being used in
# between — consider removing this cell or using a distinct variable name.
model = AutoModel.from_pretrained(model_ckpt)

In [50]:
from transformers import AutoModelForSequenceClassification,AutoConfig

# Number of target classes, taken from the label lookup table.
num_labels = len(label2id)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Attach the label maps to the checkpoint's configuration so the classifier
# carries the name <-> id tables with it.
config = AutoConfig.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
)

# Build the sequence-classification model from that config, then move it to
# the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
