# Sentiment Analysis

## Fine-tuned Sentiment Analysis Model with Training Data

In [2]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import torch

from datasets import load_dataset, Dataset, ClassLabel, load_from_disk, DatasetDict, load_metric
from huggingface_hub import notebook_login
from transformers import pipeline
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

import warnings
warnings.filterwarnings('ignore')

2022-09-13 03:56:39.868427: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-13 03:56:40.040300: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-13 03:56:40.573190: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/compat/lib.real:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-09-13 03:56:40.573267: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load 

In [57]:
# read the full data
# dataset = load_dataset("emotion", "default", split = 'train')
# full_df = pd.DataFrame(dataset, columns = ['text', 'label'])
# full_df = full_df[0:10000]
# Upper bound on the number of transcripts kept for fine-tuning.
# NOTE(review): presumably chosen to keep training time manageable — confirm.
MAX_ROWS = 10000

# Load the collated transcripts, rename the raw columns to the `text` / `label`
# names used by the rest of the notebook, and drop incomplete rows.
full_df = (
    pd.read_csv('transcript_may_collated_encoded.csv')
    .drop(columns='Unnamed: 0')
    .rename(columns={'accepted_flg': 'label', 'encrypted_collated_transcription': 'text'})
    .dropna()
    # Labels become strings so they can later be encoded as named classes.
    .astype({'label': str})
    .iloc[:MAX_ROWS]
    # dropna() leaves gaps in the index; a non-default index is carried into
    # Dataset.from_pandas as an extra `__index_level_0__` column, so reset it.
    .reset_index(drop=True)
)
full_df.head()

Unnamed: 0,ID,sales_offer_date,label,text
0,e87397307d1da391a6a5cdce07d36615e212dd2112249e...,2022-05-31,False,I am Is Protech Contacting Support to technica...
1,1d7b2a81926ca34e042635eb664007b4bd64c7853b529f...,2022-05-20,False,Hi there. Thank you for calling [CLIENT] Tech ...
2,f9366cba76b738bf36e28d11398bc4d40e34be1ec2ca88...,2022-05-02,False,Thank you for calling [CLIENT]. My name is [NA...
3,ac3f93e0220b4b416d63f3b5859386ca77f7862a8afbb7...,2022-05-06,False,Thank you for calling [CLIENT] Tech Coach. My ...
4,18adf38a3b21b9c0736af624a18749b95b8896282876a2...,2022-05-02,False,"Hi, Thank you for calling [CLIENT] Tech Coach...."


In [46]:
# Summarise column dtypes and non-null counts of the loaded frame.
# NOTE(review): the saved output for this cell lists 266230 rows and its execution
# count (46) is lower than the load cell's (57), so it appears to predate the
# 10000-row cap — re-run to refresh.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266230 entries, 0 to 266496
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   ID                266230 non-null  object
 1   sales_offer_date  266230 non-null  object
 2   label             266230 non-null  object
 3   text              266230 non-null  object
dtypes: object(4)
memory usage: 10.2+ MB


In [58]:
# Shared configuration for the fine-tuning run.
text_col = 'text'
label_col = 'label'
checkpoint = 'bert-base-uncased'

# The tokenizer has to exist before the collator (and later the Trainer) can use it;
# leaving it commented out makes this cell fail with NameError on a fresh kernel.
# `is_split_into_words` is an argument of the tokenizer call, not of from_pretrained,
# and the transcripts are raw strings, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Pads each batch dynamically to the longest sequence in that batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [59]:
# Wrap the frame in a Hugging Face Dataset and turn the string labels into a ClassLabel feature.
alldata_ds = Dataset.from_pandas(full_df).class_encode_column(label_col)

# 60/40 train/test split with a fixed seed for reproducibility.
data_ds = alldata_ds.train_test_split(seed=1, test_size=0.4)

Casting to class labels:   0%|          | 0/10 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [60]:
# Load the tokenizer once for the whole map() call instead of once per batch.
# `is_split_into_words` belongs to the tokenizer call (for pre-tokenized input),
# not to from_pretrained, and the transcripts are raw strings, so it is omitted.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_inputs(text):
    """Tokenize one batch of examples.

    Args:
        text: batch mapping supplied by `Dataset.map(batched=True)`; the raw
            strings are read from its `text_col` entry.

    Returns:
        The tokenizer output for the batch, truncated to the model's maximum length.
    """
    return tokenizer(text[text_col], truncation=True)


# Drop every source column except the text and label columns (order-stable list).
unused_columns = [col for col in full_df.columns if col not in (text_col, label_col)]

# do the tokenizing using map function
tokenized_ds = data_ds.map(tokenize_inputs, batched=True, remove_columns=unused_columns)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [61]:
# Derive the class count and the id <-> name lookups from the encoded label feature.
label_feature = data_ds['train'].features[label_col]
no_classes = label_feature.num_classes
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

In [68]:
# Load the pretrained checkpoint with a classification head sized to the label set;
# the id/label maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=no_classes,
    id2label=id2label,
    label2id=label2id,
)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "False",
    "1": "True"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "False": 0,
    "True": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "u

In [69]:
# Log, evaluate and checkpoint once per epoch; after training, reload the
# checkpoint with the highest macro F-score (the 'fscore' key from compute_metrics).
training_arg = TrainingArguments(
    output_dir="test-trainer",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='fscore',
    greater_is_better=True,
    report_to='all',
    seed=42,
)

PyTorch: setting up devices


In [70]:
def compute_metrics(eval_pred):
    """Compute macro-averaged precision, recall and F-score for an evaluation pass.

    Args:
        eval_pred: pair of (logits, labels).

    Returns:
        dict with the keys 'precision', 'recall' and 'fscore'.
    """
    logits, labels = eval_pred

    # The predicted class is the position of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)

    # The fourth element (support) is not reported.
    precision, recall, fscore, _ = precision_recall_fscore_support(
        labels, predicted_ids, average='macro', zero_division=0
    )
    return {'precision': precision, 'recall': recall, 'fscore': fscore}

In [71]:
# Bundle the model, data splits, collator and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_arg,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [72]:
# Fine-tune the classifier; logging, evaluation and checkpointing happen once per
# epoch, as configured in the training arguments.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 141


Epoch,Training Loss,Validation Loss,Precision,Recall,Fscore
1,0.2576,0.245592,0.466625,0.5,0.482736
2,0.2457,0.245033,0.466625,0.5,0.482736
3,0.2432,0.244739,0.466625,0.5,0.482736


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 128
Saving model checkpoint to test-trainer/checkpoint-47
Configuration saved in test-trainer/checkpoint-47/config.json
Model weights saved in test-trainer/checkpoint-47/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-47/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-47/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this m

TrainOutput(global_step=141, training_loss=0.24879091682163537, metrics={'train_runtime': 87.1411, 'train_samples_per_second': 206.562, 'train_steps_per_second': 1.618, 'total_flos': 4735998996480000.0, 'train_loss': 0.24879091682163537, 'epoch': 3.0})

In [73]:
# Evaluate on the held-out split; despite its name, `eval_ds` holds the returned
# metrics dictionary (printed below), not a dataset.
# NOTE(review): the saved eval_loss equals the epoch-1 validation loss, which suggests
# the epoch-1 checkpoint was restored by load_best_model_at_end (fscore is tied across
# epochs) — confirm.
eval_ds = trainer.evaluate(tokenized_ds['test'])
print(eval_ds)

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 128


{'eval_loss': 0.24559175968170166, 'eval_precision': 0.466625, 'eval_recall': 0.5, 'eval_fscore': 0.48273632484158796, 'eval_runtime': 6.1037, 'eval_samples_per_second': 655.339, 'eval_steps_per_second': 5.243, 'epoch': 3.0}


In [None]:
# model.save_pretrained('./model/')

## Using Specified Pipeline without Training Data

In [3]:
# dataset = load_dataset("emotion", "default", split = 'validation')
# full_df = pd.DataFrame(dataset, columns = ['text', 'label'])
# full_df = full_df[0:100]
# full_df.loc[full_df['label'] == 0, 'label_str'] = 'sadness'
# full_df.loc[full_df['label'] == 1, 'label_str'] = 'joy'
# full_df.loc[full_df['label'] == 2, 'label_str'] = 'love'
# full_df.loc[full_df['label'] == 3, 'label_str'] = 'anger'
# full_df.loc[full_df['label'] == 4, 'label_str'] = 'fear'
# full_df.loc[full_df['label'] == 5, 'label_str'] = 'surprise'
# full_df.info()
# full_df = pd.read_csv('sample.csv')
# Read the sample transcript file (its unnamed first column becomes the index) and
# rename the raw columns to the `label` / `text` names used below.
# Note: this rebinds `full_df`, replacing the frame used in the fine-tuning section.
full_df = pd.read_csv(
    'transcript_may_encoded_100.csv', index_col='Unnamed: 0'
).rename(columns={'accepted_flg': 'label', 'encrypted_collated_transcription': 'text'})

In [4]:
# Keep only the two columns needed here and add a readable name for each boolean label.
full_df = full_df[['label', 'text']]
for flag, readable_name in ((False, 'Negative'), (True, 'Positive')):
    full_df.loc[full_df['label'] == flag, 'label_str'] = readable_name

In [5]:
# Preview the first rows with the derived `label_str` column.
full_df.head()

Unnamed: 0,label,text,label_str
0,False,Thank you for calling [CLIENT] Tech for speaki...,Negative
1,False,[CLIENT] tech My name is [NAME]. May I have yo...,Negative
4,False,Hi. Thank you for call. tech you're speaking w...,Negative
5,False,Thank you for calling [CLIENT] Tech Coach. My ...,Negative
6,False,Hi. Thank you for calling [CLIENT] Barcode Coa...,Negative


In [6]:
# Relative frequency of each class; the saved output shows roughly 91% Negative,
# so the classes are heavily imbalanced.
full_df['label_str'].value_counts(normalize=True)

Negative    0.913043
Positive    0.086957
Name: label_str, dtype: float64

In [7]:
def run_sentiment_analysis(df, mypipeline:str, mylabels:list, text_col:str, device=None):
    """
    Classify every row of a text column with a Hugging Face pipeline.

    The classifier is invoked as ``classifier(sequence, candidate_labels)``, so
    ``mypipeline`` must name a task that accepts candidate labels
    (e.g. "zero-shot-classification").

    Args:
        df: data frame holding the texts
        mypipeline (str): task name passed to ``pipeline``
        mylabels (list): candidate labels each text is scored against
        text_col (str): name of text column
        device (int, optional): device index handed to the pipeline (-1 for CPU).
            When None, GPU 0 is used if CUDA is available, otherwise the CPU,
            so the function also runs on hosts without a GPU.

    Returns:
        predictions (list): first label returned for each text (the
            zero-shot pipeline orders labels by descending score)
        scores (list): score paired with that first label
    """
    if device is None:
        # Previously hard-coded to GPU 0, which fails on CPU-only machines.
        device = 0 if torch.cuda.is_available() else -1

    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
    start_time = time.perf_counter()
    classifier = pipeline(mypipeline, device=device)

    # Texts are classified one at a time; each result is a dict with
    # parallel 'labels' and 'scores' lists.
    preds = [classifier(sequence, mylabels) for sequence in df[text_col].tolist()]
    predictions = [pred['labels'][0] for pred in preds]
    scores = [pred['scores'][0] for pred in preds]

    print(time.perf_counter() - start_time, 'seconds')
    return predictions, scores

In [9]:
# Score every transcript against the two sentiment labels with the zero-shot pipeline.
# Alternative label set for the emotion dataset:
# ["sadness", "joy", "love", "anger", "fear", "surprise"]
predictions, scores = run_sentiment_analysis(
    df=full_df,
    mypipeline="zero-shot-classification",
    mylabels=['Positive', 'Negative'],
    text_col='text',
)

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)


17.83779811859131 seconds


In [10]:
# Compare the ground-truth label strings with the zero-shot predictions.
# The saved report shows Negative recall of 0.02, i.e. nearly every transcript
# was predicted Positive.
print(classification_report(full_df['label_str'], predictions))

              precision    recall  f1-score   support

    Negative       1.00      0.02      0.05        42
    Positive       0.09      1.00      0.16         4

    accuracy                           0.11        46
   macro avg       0.54      0.51      0.10        46
weighted avg       0.92      0.11      0.06        46



In [11]:
# Build an annotated copy holding the predicted label and its score; `full_df` is left unchanged.
df = full_df.assign(sentiment=predictions, score=scores)
df.head(10)

Unnamed: 0,label,text,label_str,sentiment,score
0,False,Thank you for calling [CLIENT] Tech for speaki...,Negative,Positive,0.74694
1,False,[CLIENT] tech My name is [NAME]. May I have yo...,Negative,Positive,0.582443
4,False,Hi. Thank you for call. tech you're speaking w...,Negative,Positive,0.573833
5,False,Thank you for calling [CLIENT] Tech Coach. My ...,Negative,Positive,0.757156
6,False,Hi. Thank you for calling [CLIENT] Barcode Coa...,Negative,Positive,0.703341
7,False,Thank you for calling Coach. This is Sean spea...,Negative,Positive,0.584938
9,True,Thank you for calling [CLIENT] Tech Coach. Ben...,Positive,Positive,0.582232
10,False,Morning. Thanks for [CLIENT] Tech Coach. My na...,Negative,Positive,0.776038
11,False,thank Hello? Hi. Thank you for calling [CLIENT...,Negative,Positive,0.552441
12,False,Hi. Thank you for [CLIENT] Tech Coach. My name...,Negative,Positive,0.590711


In [14]:
# Persist the annotated frame; the index is written too, since `index=False` is not passed.
df.to_csv('sentiment_output.csv')