# Sentiment Analysis

## Fine-tuned Sentiment Analysis Model with Training Data

In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import torch

from datasets import load_dataset, Dataset, ClassLabel, load_from_disk, DatasetDict, load_metric
from huggingface_hub import notebook_login
from transformers import pipeline
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the "emotion" training split and keep its first 10,000 rows as a DataFrame
dataset = load_dataset("emotion", "default", split="train")
full_df = pd.DataFrame(dataset, columns=["text", "label"]).iloc[:10000]
# full_df = pd.read_csv('sample.csv')

Using custom data configuration default
Reusing dataset emotion (/root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


In [3]:
# names of the DataFrame columns holding the raw text and the class label
text_col = 'text'
label_col = 'label'
# pretrained checkpoint used for both the tokenizer and the classification model
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# padding collator built around the same tokenizer; handed to the Trainer later on
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [4]:
# wrap the frame in a Dataset, encode the label column as class labels,
# then hold out 40% of the rows as the test split (fixed seed)
alldata_ds = Dataset.from_pandas(full_df).class_encode_column(label_col)
data_ds = alldata_ds.train_test_split(seed=1, test_size=0.4)

Stringifying the column:   0%|          | 0/10 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/10 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
# define tokenizing function
def tokenize_inputs(text):
    """Tokenize one batch of examples with truncation enabled.

    Args:
        text: batch handed over by `map(..., batched=True)`; the raw strings
            are read from `text[text_col]`.

    Returns:
        The tokenizer output for the batch.
    """
    # reuse the module-level `tokenizer` instead of calling
    # AutoTokenizer.from_pretrained(checkpoint) again for every batch
    return tokenizer(text[text_col], truncation=True)

# do the tokenizing using map function; drop every column except the text and
# label columns (an ordered list keeps the argument deterministic, unlike a set difference)
tokenized_ds = data_ds.map(tokenize_inputs, batched=True,
                           remove_columns = [col for col in full_df.columns if col not in (text_col, label_col)])

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [6]:
# number of classes plus id <-> name lookups taken from the encoded label feature
no_classes = data_ds["train"].features[label_col].num_classes
id2label = dict(enumerate(data_ds["train"].features[label_col].names))
label2id = dict(zip(id2label.values(), id2label.keys()))

In [7]:
# sequence-classification model loaded from the checkpoint; the number of labels
# and the id/label mappings come from no_classes, id2label and label2id
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=no_classes,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [21]:
# log, evaluate and checkpoint once per epoch; at the end reload the checkpoint
# with the highest 'fscore' reported by compute_metrics
training_arg = TrainingArguments(
    output_dir="test-trainer",
    # schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    seed=42,
    # per-epoch bookkeeping
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="all",
    # best-checkpoint selection
    load_best_model_at_end=True,
    metric_for_best_model="fscore",
    greater_is_better=True,
)

PyTorch: setting up devices


In [22]:
def compute_metrics(eval_pred):
    """Macro-averaged precision, recall and F-score for one evaluation pass.

    Args:
        eval_pred: pair of (logits, labels).

    Returns:
        dict with the keys 'precision', 'recall' and 'fscore'.
    """
    logits, labels = eval_pred

    # the predicted class is the position of the largest logit on the last axis
    predicted_ids = np.argmax(logits, axis=-1)

    # the fourth element (support) is not reported
    precision, recall, fscore, _support = precision_recall_fscore_support(
        labels, predicted_ids, average='macro', zero_division=0
    )
    return {'precision': precision, 'recall': recall, 'fscore': fscore}

In [23]:
# bundle the model, training arguments, tokenized splits and metric function
trainer = Trainer(
    model=model,
    args=training_arg,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [24]:
# run fine-tuning; the returned TrainOutput is shown as the cell result
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 282


Epoch,Training Loss,Validation Loss,Precision,Recall,Fscore
1,0.1216,0.293881,0.893833,0.869976,0.880484
2,0.0587,0.31774,0.898773,0.878058,0.886347
3,0.0349,0.291555,0.888525,0.89235,0.890235


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 64
Saving model checkpoint to test-trainer/checkpoint-94
Configuration saved in test-trainer/checkpoint-94/config.json
Model weights saved in test-trainer/checkpoint-94/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-94/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-94/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 6

TrainOutput(global_step=282, training_loss=0.07175415035680677, metrics={'train_runtime': 90.901, 'train_samples_per_second': 198.018, 'train_steps_per_second': 3.102, 'total_flos': 525204972676224.0, 'train_loss': 0.07175415035680677, 'epoch': 3.0})

In [25]:
# evaluate on the held-out split; despite its name, eval_ds holds the returned
# metrics dictionary (loss, precision, recall, fscore, runtime), not a dataset
eval_ds = trainer.evaluate(tokenized_ds['test'])
print(eval_ds)

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 64


{'eval_loss': 0.29155483841896057, 'eval_precision': 0.8885248590841356, 'eval_recall': 0.8923499028752908, 'eval_fscore': 0.8902352553671221, 'eval_runtime': 6.5057, 'eval_samples_per_second': 614.847, 'eval_steps_per_second': 9.684, 'epoch': 3.0}


In [None]:
# model.save_pretrained('./model/')

## Using Specified Pipeline without Training Data

In [81]:
# load the first 100 rows of the validation split for the zero-shot comparison
dataset = load_dataset("emotion", "default", split = 'validation')
full_df = pd.DataFrame(dataset, columns = ['text', 'label'])
# take an explicit copy so the new column is written to an independent frame
# rather than to a slice of the full validation frame
full_df = full_df.iloc[0:100].copy()
# integer class id -> emotion name (same pairs as the original per-label assignments);
# a single map replaces six near-identical .loc writes, unmapped ids still become NaN
emotion_names = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
full_df['label_str'] = full_df['label'].map(emotion_names)
full_df.info()
# full_df = pd.read_csv('sample.csv')

W0907 00:31:14.453507 140547989231424 builder.py:412] Using custom data configuration default
W0907 00:31:14.456440 140547989231424 builder.py:577] Reusing dataset emotion (/root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       100 non-null    object
 1   label      100 non-null    int64 
 2   label_str  100 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


In [84]:
# share of each emotion among the sampled validation rows
full_df['label_str'].value_counts(normalize=True)

sadness    0.39
joy        0.28
anger      0.16
love       0.13
fear       0.04
Name: label_str, dtype: float64

In [85]:
def run_sentiment_analysis(df, mypipeline:str, mylabels:list, text_col:str, device=None):
    """
    run sentiment analysis on text using specified pipeline

    Args:
        df: data frame
        mypipeline (str): specified pipeline
        mylabels (list): list of specified labels
        text_col (str): name of text column
        device: device passed on to the pipeline; when None (default) GPU 0 is
            used if CUDA is available, otherwise the CPU (-1)

    Returns:
        predictions (list): predicted labels with the highest scores
        scores (list): sentiment analysis scores for the predictions
    """
    # previously hard-coded to device 0, which requires a CUDA device;
    # fall back to the CPU so the function also runs on machines without a GPU
    if device is None:
        device = 0 if torch.cuda.is_available() else -1

    start_time = time.time()
    classifier = pipeline(mypipeline, device = device)

    # one call per text: each result is indexed as a dict below
    preds = [classifier(sequence, mylabels) for sequence in df[text_col].tolist()]

    # the first entry of 'labels'/'scores' is taken as the top prediction
    # (assumes the pipeline returns them sorted by descending score - TODO confirm)
    predictions = [pred['labels'][0] for pred in preds]
    scores = [pred['scores'][0] for pred in preds]
    print(time.time() - start_time, 'seconds')
    return predictions, scores

In [86]:
# zero-shot classification of the sampled texts against the six emotion names
predictions, scores = run_sentiment_analysis(
    df=full_df,
    mypipeline="zero-shot-classification",
    mylabels=["sadness", "joy", "love", "anger", "fear", "surprise"],
    text_col="text",
)

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)
loading configuration file https://huggingface.co/facebook/bart-large-mnli/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/980f2be6bd282c5079e99199d7554cfd13000433ed0fdc527e7def799e5738fe.4fdc7ce6768977d347b32986aff152e26fcebbda34ef89ac9b114971d0342e09
Model config BartConfig {
  "_name_or_path": "facebook/bart-large-mnli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_la

18.893474340438843 seconds


In [87]:
# per-class precision/recall/F1 of the zero-shot predictions against the true label names
print(classification_report(full_df['label_str'], predictions))

              precision    recall  f1-score   support

       anger       0.90      0.56      0.69        16
        fear       0.00      0.00      0.00         4
         joy       0.89      0.29      0.43        28
        love       0.50      0.31      0.38        13
     sadness       0.82      0.46      0.59        39
    surprise       0.00      0.00      0.00         0

    accuracy                           0.39       100
   macro avg       0.52      0.27      0.35       100
weighted avg       0.78      0.39      0.51       100



In [89]:
# new frame with the predicted label and its score next to the true label
df = full_df.assign(sentiment=predictions, score=scores)
df.head(10)

Unnamed: 0,text,label,label_str,sentiment,score
0,im feeling quite sad and sorry for myself but ...,0,sadness,sadness,0.942836
1,i feel like i am still looking at a blank canv...,0,sadness,surprise,0.771841
2,i feel like a faithful servant,2,love,love,0.752808
3,i am just feeling cranky and blue,3,anger,sadness,0.552044
4,i can have for a treat or if i am feeling festive,1,joy,joy,0.549333
5,i start to feel more appreciative of what god ...,1,joy,surprise,0.435614
6,i am feeling more confident that we will be ab...,1,joy,joy,0.328114
7,i feel incredibly lucky just to be able to tal...,1,joy,surprise,0.391188
8,i feel less keen about the army every day,1,joy,surprise,0.376403
9,i feel dirty and ashamed for saying that,0,sadness,surprise,0.336696
