276

**Install Libraries**

In [0]:
#!pip install transformers # transformers


**Import Modules**

In [0]:
# import modules

# Torch, Sklearn imports
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, RandomSampler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing # label encoder





## PyTorch Transformer
import transformers

## DistilBert
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import DistilBertForSequenceClassification, DistilBertConfig


## others
import pandas as pd
import numpy as np
import json, re
import random
import uuid
from tqdm import tqdm_notebook


print(torch.__version__)
print(transformers.__version__)

1.3.1
2.3.0


In [0]:
# Install latest Tensorflow build
!pip install -q tf-nightly-2.0-preview

[K     |████████████████████████████████| 95.2MB 106kB/s 
[K     |████████████████████████████████| 450kB 77.4MB/s 
[K     |████████████████████████████████| 3.8MB 56.9MB/s 
[K     |████████████████████████████████| 81kB 14.3MB/s 
[31mERROR: google-colab 1.0.0 has requirement google-auth~=1.4.0, but you'll have google-auth 1.10.0 which is incompatible.[0m
[31mERROR: tb-nightly 2.1.0a20191206 has requirement grpcio>=1.24.3, but you'll have grpcio 1.15.0 which is incompatible.[0m
[?25h

In [0]:
# cuda advantage of the massive parallel computing, 
torch.cuda.is_available() # check cuda is available or not

True

In [0]:
# set seeding
def random_seeding(seed_value, use_cuda):
  random.seed(seed_value) # pyrhon random
  np.random.seed(seed_value) # numpy
  torch.manual_seed(seed_value) # torch
  if use_cuda: torch.cuda.manual_seed_all(seed_value) # cuda


use_cuda = torch.cuda.is_available()
random_seeding(350, use_cuda)

**Uploading csv file**

In [0]:
# upload csv file
from google.colab import files
uploaded = files.upload()
# agent_clf file name

MessageError: ignored

In [0]:
# pandas dataframe
import io
dataset = pd.read_csv(io.BytesIO(uploaded['Agent_Clf.csv']))
dataset.head()

Unnamed: 0,text,source_labs
0,hello yeah hello very good evening ma'am,others-greeting
1,customer is not responding,none
2,a customer is not responding forcefully i have...,none
3,forcefully i have to end the call a customer i...,none
4,as customer is not responding forcefully i hav...,none


In [0]:
len(dataset), dataset.groupby('labels').size()

(1078, labels
 <Agent>       421
 <Customer>    414
 <IVR>         243
 dtype: int64)

**Labels text to numbers: Label encoder**


In [0]:
# label_encoder object knows how to understand word labels 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels
dataset['labels']= label_encoder.fit_transform(dataset['labels']) 

In [0]:
dataset.head()

Unnamed: 0,texts,labels
0,to provide you with a world customer experienc...,2
1,thanks for being online priya,0
2,to provide you the worlds large customer exper...,2
3,to provide you with a world class customer exp...,2
4,with constantly engather we provide you with t...,2


In [0]:
len(dataset), dataset.groupby('labels').size() # 0-agent, 1-customer, 2 ivr

(1078, labels
 0    421
 1    414
 2    243
 dtype: int64)

In [0]:
len(list(set(dataset.labels)))

3

Model Configurations

Transformers, each model architecture is associated with 3 main types of classes:

**Model class**- to load a particular pre-train model

**Tokenizer class**- to pre-process the data and make it comptible with a particular model 

**Configuration class**- to load the configuration of a particular moel

these classes share a common class method **from_pretrained()**

Example: Bert architecture for text classification

model class - **BertForSequenceClassification**,
tokenizer class- **BertTokenizer**,
configuration class- **BertConfig**


In [0]:
model_type = 'distilbert' # from HuggingFace
## Distilbert
if model_type == 'distilbert':
    print("DistilBERT")
    config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
    config.num_labels = len(list(set(dataset.labels))) # number of classes in a problem varies
    config.n_layers = 2
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification(config)
    
print(config)

DistilBERT
{
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": null,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "max_position_embeddings": 512,
  "n_heads": 12,
  "n_layers": 2,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torchscript": false,
  "use_bfloat16": false,
  "vocab_size": 30522
}



**Data pre-processing**

To match pretraining model, we have to format the model input sequence in a specific format, to do we have to tokenize the texts correctly

**Add Special Tokens**

Token [CLS] means the start of a sentence, stands for class [SEP] is for separating sentences for the next sentence prediction task.

The first token of every input sequence is the special classification token – [CLS]. This token is used in classification tasks as an aggregate of the entire sequence representation. It is ignored in non-classification tasks

BERT: [CLS] + tokens + [SEP] + padding
##
DistilBERT: [CLS] + tokens + [SEP] + padding
##
RoBERTa: [CLS] + prefix_space + tokens + [SEP] + padding
##
XLM: [CLS] + tokens + [SEP] + padding
##
XLNet: padding + [CLS] + tokens + [SEP]

**Add special tokens and Zero padding**

In [0]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


In [0]:
def prepare_features(seq_1, zero_pad = False, max_seq_length = 128):
    enc_text = tokenizer.encode_plus(seq_1, add_special_tokens=True, max_length=300) # add tokens
    if zero_pad:
        while len(enc_text['input_ids']) < max_seq_length:
            enc_text['input_ids'].append(0)
            enc_text['token_type_ids'].append(0)
    return enc_text

In [0]:
for i in range(len(dataset)):
  if len(dataset['text'][i]) > 340:
    print(i)

9876


**Prepare features on texts and labels**

In [1]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.texts[index]
        X = prepare_features(utterance, zero_pad = True)
        y =  int(self.data.labels[index])
        return np.array(X['input_ids']), np.array(X['token_type_ids']), np.array(y)
    
    def __len__(self):
        return self.len

NameError: ignored

Splitting the data

In [0]:
train_size = 0.8
train_dataset = dataset.sample(frac=train_size, random_state=200).reset_index(drop=True)
test_dataset = dataset.drop(train_dataset.index).reset_index(drop=True)

In [0]:
dataset.index, train_dataset.index, test_dataset.index

(RangeIndex(start=0, stop=1078, step=1),
 RangeIndex(start=0, stop=862, step=1),
 RangeIndex(start=0, stop=216, step=1))

In [0]:
dataset.shape, train_dataset.shape, test_dataset.shape

((1078, 2), (862, 2), (216, 2))

In [0]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

**Dataloaders and Parameters**

In [0]:
### Dataloaders Parameters
params = {'batch_size': 8,
          'shuffle': True,
          'drop_last': True,
          'num_workers': 0}
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)
loss_function = nn.CrossEntropyLoss()
learning_rate = 5e-06 
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)
if torch.cuda.is_available():
    print("GPU is AVAILABLE!")
    model = model.cuda()

GPU is AVAILABLE!


In [0]:
ids, tokens, labels = next(iter(training_loader)) # iterated one element at a time
ids.shape, tokens.shape, labels

(torch.Size([8, 120]), torch.Size([8, 120]), tensor([1, 1, 2, 1, 2, 0, 0, 2]))

In [0]:
#ids

In [0]:
if model_type == 'distilbert':
    print(model_type)
    out = model.forward(ids.cuda())[0]

print(loss_function(out, labels.cuda()))
print(out.shape)

distilbert
tensor(1.1552, device='cuda:0', grad_fn=<NllLossBackward>)
torch.Size([8, 3])


**Training the model**

In [0]:
def train(model, epochs):
  max_epochs = epochs
  model = model.train()
  for epoch in tqdm_notebook(range(max_epochs)):
      print("EPOCH -- {}".format(epoch))
      correct = 0
      total = 0
      for i, (ids, tokens, labels) in enumerate(training_loader):
          optimizer.zero_grad()
          if torch.cuda.is_available():
              ids = ids.cuda()
              tokens = tokens.cuda()
              labels = labels.cuda()
      
          if model_type == 'distilbert':
              output = model.forward(ids)[0]

          loss = loss_function(output, labels)
          loss.backward()
          optimizer.step()

          _, predicted = torch.max(output.data, 1)
          total += labels.size(0)
          correct += (predicted.cpu() == labels.cpu()).sum()
      train_accuracy = 100.00 * correct.numpy() / total
      print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), train_accuracy))
  return "Training Finished"

In [0]:
train(model, 20) # 20 epochs

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

EPOCH -- 0
Iteration: 106. Loss: 1.0620448589324951. Accuracy: 38.3177570093458%
EPOCH -- 1
Iteration: 106. Loss: 1.082703948020935. Accuracy: 40.53738317757009%
EPOCH -- 2
Iteration: 106. Loss: 1.1414753198623657. Accuracy: 38.43457943925234%
EPOCH -- 3
Iteration: 106. Loss: 1.1016850471496582. Accuracy: 40.42056074766355%
EPOCH -- 4
Iteration: 106. Loss: 1.1040409803390503. Accuracy: 45.09345794392523%
EPOCH -- 5
Iteration: 106. Loss: 0.8610507249832153. Accuracy: 54.78971962616822%
EPOCH -- 6
Iteration: 106. Loss: 0.5304890871047974. Accuracy: 61.44859813084112%
EPOCH -- 7
Iteration: 106. Loss: 0.6868308782577515. Accuracy: 65.88785046728972%
EPOCH -- 8
Iteration: 106. Loss: 0.47892075777053833. Accuracy: 70.67757009345794%
EPOCH -- 9
Iteration: 106. Loss: 0.41788530349731445. Accuracy: 71.14485981308411%
EPOCH -- 10
Iteration: 106. Loss: 0.7538765668869019. Accuracy: 75.11682242990655%
EPOCH -- 11
Iteration: 106. Loss: 0.7186815738677979. Accuracy: 78.15420560747664%
EPOCH -- 12
It

'Training Finished'

In [0]:
def evaluate_accuracy(model):
  correct = 0
  total = 0
  num_classes = 3
  # Initialize the prediction and label lists(tensors)
  pred_list = torch.zeros(0,dtype=torch.long)
  label_list = torch.zeros(0,dtype=torch.long)
  for (ids, tokens, labels) in testing_loader:
      if torch.cuda.is_available():
          ids = ids.cuda()
          tokens = tokens.cuda()
          labels = labels.cuda()

      if model_type == 'distilbert':
          output = model.forward(ids)[0]

      _, predicted = torch.max(output.data, 1)

      # Append batch prediction results
      # torch.cat- concatenates along an existing dimension. and so the number of dimensions of the output is the same as the inputs.
      pred_list = torch.cat([pred_list, predicted.view(-1).cpu()])
      label_list = torch.cat([label_list, labels.view(-1).cpu()])

     

      total += labels.size(0)
      correct += (predicted.cpu() == labels.cpu()).sum()
  # Accuracy
  accuracy = 100.00 * correct.numpy() / total

  # Confusion matrix
  conf_matrix = confusion_matrix(label_list.numpy(), pred_list.numpy())

  # Classification_report
  classify_report = classification_report(label_list.numpy(), pred_list.numpy())
  print('classification report:', classify_report)

  return accuracy, conf_matrix

In [0]:
evaluate_accuracy(model) # seed-350

classification report:               precision    recall  f1-score   support

           0       0.90      0.89      0.89        81
           1       0.72      0.85      0.78        33
           2       0.99      0.94      0.96       102

    accuracy                           0.91       216
   macro avg       0.87      0.89      0.88       216
weighted avg       0.91      0.91      0.91       216



(90.74074074074075, array([[72,  9,  0],
        [ 4, 28,  1],
        [ 4,  2, 96]]))

In [0]:
evaluate_accuracy(model) # seed-30

classification report:               precision    recall  f1-score   support

           0       0.95      0.88      0.91        81
           1       0.71      0.91      0.80        33
           2       0.99      0.96      0.98       102

    accuracy                           0.92       216
   macro avg       0.88      0.92      0.90       216
weighted avg       0.93      0.92      0.92       216



(92.12962962962963, array([[71, 10,  0],
        [ 2, 30,  1],
        [ 2,  2, 98]]))

In [0]:
evaluate_accuracy(model) # seed-10

classification report:               precision    recall  f1-score   support

           0       0.93      0.93      0.93        81
           1       0.79      0.91      0.85        33
           2       0.98      0.93      0.95       102

    accuracy                           0.93       216
   macro avg       0.90      0.92      0.91       216
weighted avg       0.93      0.93      0.93       216



(92.5925925925926, array([[75,  5,  1],
        [ 2, 30,  1],
        [ 4,  3, 95]]))

In [0]:
# evaluate_accuracy(model) # seed-0

classification report:               precision    recall  f1-score   support

           0       0.94      0.89      0.91        81
           1       0.73      0.91      0.81        33
           2       0.99      0.95      0.97       102

    accuracy                           0.92       216
   macro avg       0.89      0.92      0.90       216
weighted avg       0.93      0.92      0.92       216



(92.12962962962963, array([[72,  9,  0],
        [ 2, 30,  1],
        [ 3,  2, 97]]))

In [0]:
#evaluate_accuracy(model) # seed-42

classification report:               precision    recall  f1-score   support

           0       0.88      0.93      0.90        81
           1       0.82      0.82      0.82        33
           2       0.99      0.95      0.97       102

    accuracy                           0.92       216
   macro avg       0.90      0.90      0.90       216
weighted avg       0.92      0.92      0.92       216



(92.12962962962963, array([[75,  6,  0],
        [ 5, 27,  1],
        [ 5,  0, 97]]))

In [0]:
#evaluate_accuracy(model) # seed-7

classification report:               precision    recall  f1-score   support

           0       0.90      0.85      0.87        81
           1       0.70      0.85      0.77        33
           2       0.98      0.95      0.97       102

    accuracy                           0.90       216
   macro avg       0.86      0.88      0.87       216
weighted avg       0.91      0.90      0.90       216



(89.81481481481481, array([[69, 12,  0],
        [ 3, 28,  2],
        [ 5,  0, 97]]))

In [0]:
#SVC Model used before
''' test accuracy 0.8287037037037037
train accuracy 0.9825986078886311
              precision    recall  f1-score   support

     <Agent>       0.85      0.76      0.80        93
  <Customer>       0.76      0.89      0.82        80
       <IVR>       0.97      0.86      0.91        43

   micro avg       0.83      0.83      0.83       216
   macro avg       0.86      0.84      0.84       216
weighted avg       0.84      0.83      0.83       216

Confusion matrix
[[71 22  0]
 [ 8 71  1]
 [ 5  1 37]] '''

In [0]:
label_to_ix = {'<Agent>': 0, '<Customer>': 1, '<IVR>': 2}

In [0]:
label_to_ix.keys(), label_to_ix.values()

(dict_keys(['<Agent>', '<Customer>', '<IVR>']), dict_values([0, 1, 2]))

**Predictions**

In [0]:
  def predict(text, language = 'en'):
      model.eval()
      features = prepare_features(text, zero_pad = True)
      ids = torch.tensor(features['input_ids']).unsqueeze(0)
      tokens = torch.tensor(features['token_type_ids']).unsqueeze(0)
      if torch.cuda.is_available():
          ids = ids.cuda()
          tokens = tokens.cuda()
      if model_type == 'distilbert':
        logits_out = model.forward(ids)[0].squeeze(0)

      softmax_out = F.softmax(logits_out, dim=0)
      _, pred_label = torch.max(softmax_out.data, 0)

      prediction=list(label_to_ix.keys())[list(label_to_ix.values()).index(pred_label.data.cpu())]
 
      return prediction

In [0]:
# predictions
sent1 = 'welcome to automatic feedback system'
predict(sent1)


'<IVR>'

**Model and Hyper parameters **

Model | LR | Batch Size | Epochs | Accuracy | Hiddden Layers | Training Time
:---: | :---: | :---: | :---:  | :---: | :---: | :---:
DistilBERT | 1e-06 | 8 | 10 |   % |  2  |   min 
DistilBERT | 5e-06   | 8 | 10 |   % |  2  |  - 
DistilBERT | 5e-06   | 8 | 20 |   % |  2  |  - 

In [0]:
x = ('I have just met with an accident and I need the car repaired immediately.',
     'I am sorry to hear about the accident. Hope you are OK.',
     'I am going on a road trip with my family next week and I need my car fixed before then.',
     'There are a few steps which you need to complete before I can take a claim request.',
     'You don\'t understand, I am going on a vacation next week and my car needs to be ready by then.',
     'please understand sir but we have to follow the process laid by our company. It will take us atleast 3 working days for giving you an approval.',
     'I took the insurance to help me when I need it. I want this approved today',
     'so that the garage can take two days to fix and then I can leave for my vacation next week.',
     'welcome to reliance',
     'our executives will call you back within fifteen minutes',
     'sir right now i am transferring your call ok',

    )

for a in x:
    print(a)
    if predict(a)=='<Customer>':
        print('Customer')
    
    elif predict(a)=='<Agent>':
        print('Agent')

    else:
        print('<IVR>')

I have just met with an accident and I need the car repaired immediately.
Customer
I am sorry to hear about the accident. Hope you are OK.
Customer
I am going on a road trip with my family next week and I need my car fixed before then.
Customer
There are a few steps which you need to complete before I can take a claim request.
Agent
You don't understand, I am going on a vacation next week and my car needs to be ready by then.
Customer
please understand sir but we have to follow the process laid by our company. It will take us atleast 3 working days for giving you an approval.
Agent
I took the insurance to help me when I need it. I want this approved today
Customer
so that the garage can take two days to fix and then I can leave for my vacation next week.
Customer
welcome to reliance
Agent
our executives will call you back within fifteen minutes
<IVR>
sir right now i am transferring your call ok
Agent


In [0]:
predict(x[1])

'<Customer>'

In [0]:
     '''     if i % 200 == 0:
              correct = 0
              total = 0
              for (ids, tokens, labels) in testing_loader:
                  if torch.cuda.is_available():
                      ids = ids.cuda()
                      tokens = tokens.cuda()
                      labels = labels.cuda()
 
                  if model_type == 'distilbert':
                      output = model.forward(ids)[0]

                  _, predicted = torch.max(output.data, 1)
                  total += labels.size(0)
                  correct += (predicted.cpu() == labels.cpu()).sum()
              accuracy = 100.00 * correct.numpy() / total
              print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))'''

"     if i % 200 == 0:\n         correct = 0\n         total = 0\n         for (ids, tokens, labels) in testing_loader:\n             if torch.cuda.is_available():\n                 ids = ids.cuda()\n                 tokens = tokens.cuda()\n                 labels = labels.cuda()\n \n             if model_type == 'distilbert':\n                 output = model.forward(ids)[0]\n\n             _, predicted = torch.max(output.data, 1)\n             total += labels.size(0)\n             correct += (predicted.cpu() == labels.cpu()).sum()\n         accuracy = 100.00 * correct.numpy() / total\n         print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))"

In [0]:
''' def evaluate_accuracy(model):
  correct = 0
  total = 0
  num_classes = 3
  confusion_matrix = torch.zeros(num_classes, num_classes)
  for (ids, tokens, labels) in testing_loader:
      if torch.cuda.is_available():
          ids = ids.cuda()
          tokens = tokens.cuda()
          labels = labels.cuda()

      if model_type == 'distilbert':
          output = model.forward(ids)[0]

      _, predicted = torch.max(output.data, 1)

      for t, p in zip(labels.view(-1), predicted.view(-1)):
        confusion_matrix[t.long(), p.long()] += 1

      total += labels.size(0)
      correct += (predicted.cpu() == labels.cpu()).sum()
  accuracy = 100.00 * correct.numpy() / total
  print('accuracy on test set:', accuracy)
  return confusion_matrix '''