# Objective

+ Build a sentiment analysis model with huggingface (HF) models (Includes pre-trained & fine-tuning)
+ Using HF api's instead of native pytorch


In [1]:
# !pip install transformers
# !pip install torch

In [2]:
import transformers
import gc
import torch
# import mlflow.pytorch
import random
import numpy as np
import pandas as pd
# import seaborn as sns
# from pylab import rcParams
import matplotlib.pyplot as plt
import torch.nn.functional as F

from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import (BertModel, \
BertTokenizer, \
BertConfig,\
AdamW, \
get_linear_schedule_with_warmup, \
pipeline, \
BertForSequenceClassification, \
Trainer, \
TrainingArguments
)

get_ipython().run_line_magic('matplotlib', 'inline')

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [4]:
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [5]:
classifier('I’ve been signed up for a year and haven’t received a single sample for anything. \
I’d never complain about “free” unless “free” meant nothing at all. lol')

[{'label': 'NEGATIVE', 'score': 0.9647619724273682}]

In [3]:
SEED = 2021

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
df_org = pd.read_csv("./complaints_small.csv", nrows=5000)
print(df_org.shape)
df_org.head()

(5000, 18)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2019-06-13,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,,CAPITAL ONE FINANCIAL CORPORATION,PA,186XX,,Consent not provided,Web,2019-06-13,Closed with explanation,Yes,,3274605
1,2019-11-01,Vehicle loan or lease,Loan,Struggling to pay your loan,Denied request to lower payments,I contacted Ally on Friday XX/XX/XXXX after fa...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,NJ,088XX,,Consent provided,Web,2019-11-01,Closed with explanation,Yes,,3425257
2,2019-04-01,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,19067,,Consent not provided,Web,2019-04-01,Closed with explanation,Yes,,3198225
3,2021-10-06,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Was not notified of investigation status or re...,,,"EQUIFAX, INC.",MD,21207,,,Web,2021-10-06,Closed with explanation,Yes,,4783820
4,2019-08-08,Mortgage,Conventional home mortgage,Trouble during payment process,,,Company has responded to the consumer and the ...,"FLAGSTAR BANK, FSB",ID,83706,,,Referral,2019-08-15,Closed with explanation,Yes,,3342290


In [5]:
df = df_org.loc[(df_org['Consumer complaint narrative'].notnull()), ['Consumer complaint narrative', 'Product']].copy().reset_index().drop('index', axis = 1)
df.columns = ['text', 'target']
print('Dataframe size - ', df.shape)
df.head()
        

Dataframe size -  (2593, 2)


Unnamed: 0,text,target
0,I contacted Ally on Friday XX/XX/XXXX after fa...,Vehicle loan or lease
1,Hello This complaint is against the three cred...,"Credit reporting, credit repair services, or o..."
2,I am a victim of Identity Theft & currently ha...,"Credit reporting, credit repair services, or o..."
3,Two accounts are still on my credit history af...,"Credit reporting, credit repair services, or o..."
4,Receiving daily telephone call ( s ) from XXXX...,"Credit reporting, credit repair services, or o..."


In [6]:
df.target.value_counts()

Credit reporting, credit repair services, or other personal consumer reports    1118
Debt collection                                                                  474
Credit card or prepaid card                                                      285
Mortgage                                                                         242
Checking or savings account                                                      181
Student loan                                                                     116
Vehicle loan or lease                                                             63
Money transfer, virtual currency, or money service                                57
Payday loan, title loan, or personal loan                                         44
Credit reporting                                                                   4
Consumer Loan                                                                      3
Money transfers                                                  

In [28]:
unique_labels = list(df.target.value_counts().index)
unique_labels

['Credit reporting, credit repair services, or other personal consumer reports',
 'Debt collection',
 'Credit card or prepaid card',
 'Mortgage',
 'Checking or savings account',
 'Student loan',
 'Vehicle loan or lease',
 'Money transfer, virtual currency, or money service',
 'Payday loan, title loan, or personal loan',
 'Credit reporting',
 'Consumer Loan',
 'Money transfers',
 'Credit card',
 'Bank account or service']

In [42]:
label_dict = {}
for i in range(len(unique_labels)):
    label_dict[unique_labels[i]] = i

label_dict

{'Credit reporting, credit repair services, or other personal consumer reports': 0,
 'Debt collection': 1,
 'Credit card or prepaid card': 2,
 'Mortgage': 3,
 'Checking or savings account': 4,
 'Student loan': 5,
 'Vehicle loan or lease': 6,
 'Money transfer, virtual currency, or money service': 7,
 'Payday loan, title loan, or personal loan': 8,
 'Credit reporting': 9,
 'Consumer Loan': 10,
 'Money transfers': 11,
 'Credit card': 12,
 'Bank account or service': 13}

In [44]:
df.target = df.target.replace(label_dict)
df.head()

Unnamed: 0,text,target
0,I contacted Ally on Friday XX/XX/XXXX after fa...,6
1,Hello This complaint is against the three cred...,0
2,I am a victim of Identity Theft & currently ha...,0
3,Two accounts are still on my credit history af...,0
4,Receiving daily telephone call ( s ) from XXXX...,0


In [45]:
X = list(df['text'])
y = list(df['target'])

In [46]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)


In [47]:
class_names = np.unique(df.target)
class_names

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
      dtype=int64)

In [48]:
N_CLASSES = len(class_names)
MAX_LEN = 160

TEST_SIZE = 0.1
VAL_SIZE = 0.5

BATCH_SIZE = 8
DROPOUT = 0.3
EPOCHS = 1


MODEL_CLASSES = {'bert': (BertConfig, BertForSequenceClassification, BertTokenizer, 'bert-base-cased')}

SELECTED_MODEL_CLASS = 'bert'

In [49]:
config_class, model_class, tokenizer_class, pretrained_model = MODEL_CLASSES[SELECTED_MODEL_CLASS]

In [50]:
tokenizer = tokenizer_class.from_pretrained(pretrained_model, return_dict=False)

loading file https://huggingface.co/bert-base-cased/resolve/main/vocab.txt from cache at C:\Users\Karthik/.cache\huggingface\transformers\6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/bert-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json from cache at C:\Users\Karthik/.cache\huggingface\transformers\ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json from cache at C:\Users\Karthik/.cache\huggingface\transformers\226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16

In [51]:
model = model_class.from_pretrained(\
pretrained_model, # Use the 12-layer BERT model, with an cased vocab.\
num_labels = N_CLASSES, # The number of output labels--2 for binary classification.\
output_attentions = False, # Whether the model returns attentions weights.\
output_hidden_states = False, # Whether the model returns all hidden-states.\
)


loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at C:\Users\Karthik/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    

In [None]:
# Tell pytorch to run this model on the GPU.
# model.cuda()

In [52]:
train_encodings = tokenizer(X_train, padding=True, truncation=True, max_length=MAX_LEN)
val_encodings = tokenizer(X_val, padding=True, truncation=True, max_length=MAX_LEN)

In [53]:
len(val_encodings['input_ids'][0])

160

In [54]:
class PrepareDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

In [55]:
# convert our tokenized data into a torch Dataset
train_dataset = PrepareDataset(train_encodings, y_train)
val_dataset = PrepareDataset(val_encodings, y_val)

In [56]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1) # Gets the rowwise maximum value position
    # calculate accuracy using sklearn's function
    accuracy = accuracy_score(y_true=labels, y_pred=preds)
    recall = recall_score(y_true=labels, y_pred=preds, average='macro')
    precision = precision_score(y_true=labels, y_pred=preds, average='macro')
    f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1} 


In [57]:
training_args = TrainingArguments( \
	output_dir='./results', \
	num_train_epochs=EPOCHS,\
	per_device_train_batch_size=BATCH_SIZE, \
	per_device_eval_batch_size=BATCH_SIZE, \
	warmup_steps=100, \
	weight_decay=0.01, \
	logging_dir='./logs', \
	load_best_model_at_end=True, \
	evaluation_strategy="steps", \
	logging_steps=100, \
	save_steps=100 \
)
# warmup_steps: Number of steps used for a linear warmup from 0 to learning_rate
# evaluation_strategy: Number of update steps between two evaluations

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [58]:
trainer = Trainer(
model=model, \
args=training_args, \
train_dataset=train_dataset, \
eval_dataset=val_dataset,\
compute_metrics=compute_metrics, \
)


In [59]:
gc.collect()
torch.cuda.empty_cache()


In [60]:
# mlflow.end_run()
trainer.train()

# The loss function here is CrossEntropyLoss = -summation(ti * log(pi)) where ti is the binary truth label and pi is the softmax probability 

***** Running training *****
  Num examples = 2074
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 260


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,2.0388,1.424462,0.589595,0.297917,0.279778,0.246077
200,1.2049,1.030066,0.693642,0.346009,0.371583,0.355845


***** Running Evaluation *****
  Num examples = 519
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results\checkpoint-100
Configuration saved in ./results\checkpoint-100\config.json
Model weights saved in ./results\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 519
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results\checkpoint-200
Configuration saved in ./results\checkpoint-200\config.json
Model weights saved in ./results\checkpoint-200\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-200 (score: 1.030065655708313).


TrainOutput(global_step=260, training_loss=1.4783022953913763, metrics={'train_runtime': 3427.7384, 'train_samples_per_second': 0.605, 'train_steps_per_second': 0.076, 'total_flos': 170547226072320.0, 'train_loss': 1.4783022953913763, 'epoch': 1.0})

In [61]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 519
  Batch size = 8


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.030065655708313,
 'eval_accuracy': 0.6936416184971098,
 'eval_precision': 0.3460093604668385,
 'eval_recall': 0.371582907127797,
 'eval_f1': 0.3558451522345688,
 'eval_runtime': 223.1035,
 'eval_samples_per_second': 2.326,
 'eval_steps_per_second': 0.291,
 'epoch': 1.0}

## Save model

In [63]:
# saving the fine tuned model & tokenizer

model_path = "sentiment-"+pretrained_model
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


Configuration saved in sentiment-bert-base-cased\config.json
Model weights saved in sentiment-bert-base-cased\pytorch_model.bin
tokenizer config file saved in sentiment-bert-base-cased\tokenizer_config.json
Special tokens file saved in sentiment-bert-base-cased\special_tokens_map.json


('sentiment-bert-base-cased\\tokenizer_config.json',
 'sentiment-bert-base-cased\\special_tokens_map.json',
 'sentiment-bert-base-cased\\vocab.txt',
 'sentiment-bert-base-cased\\added_tokens.json')

## Evaluation

In [64]:
model_load = model_class.from_pretrained(model_path, num_labels=N_CLASSES)

loading configuration file sentiment-bert-base-cased\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "

## Predicting on Raw Text

In [65]:
config = {
'MAX_LEN':MAX_LEN,
'TARGETS':class_names
}

In [67]:
def get_prediction(text, model, config):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=config['MAX_LEN'], return_tensors="pt")
    tokenizer(X_train, padding=True, truncation=True, max_length=MAX_LEN)
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return config['TARGETS'][probs.argmax()], probs



In [68]:
review_text = "I love completing my todos! Best app ever!!!"

In [69]:
get_prediction(review_text, model_load, config)

(0,
 tensor([[0.4695, 0.1020, 0.1665, 0.0459, 0.0616, 0.0516, 0.0276, 0.0164, 0.0144,
          0.0071, 0.0168, 0.0064, 0.0079, 0.0061]], grad_fn=<SoftmaxBackward>))