# Import Packages

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
import re
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from src.data.feature_engineering import FeatureEngineer
from src.data.preprocess import Preprocessor
# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Load Saved Model
from transformers import BertForSequenceClassification, BertConfig

# Misc.
import warnings
warnings.filterwarnings('ignore')
from transformers import logging
logging.set_verbosity_error()

In [2]:
# Set initial variables and constants
%config InlineBackend.figure_format='retina'

# Plot styling: seaborn theme, custom colour palette and default figure size
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Random seed for reproducibility (seeds both NumPy and PyTorch)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Select the compute device: CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Preprocessing

## Tokeniser

In [3]:
# Name of the pretrained checkpoint; reused below when loading the BERT models
MODEL_NAME = 'bert-base-cased'

# Build the BERT tokenizer that matches the checkpoint above
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [4]:
# Print the special BERT tokens together with their vocabulary ids
print(tokenizer.sep_token, tokenizer.sep_token_id) # separator: marks the end of a sentence
print(tokenizer.cls_token, tokenizer.cls_token_id) # classification token placed at the start of each input
print(tokenizer.pad_token, tokenizer.pad_token_id) # special token used for padding
print(tokenizer.unk_token, tokenizer.unk_token_id) # stands in for tokens missing from the vocabulary

[SEP] 102
[CLS] 101
[PAD] 0
[UNK] 100


Since BERT works with fixed-length sequences, we need to look at the token length of each review and choose a maximum sequence length (in tokens) to pad or truncate to.

In [5]:
# Load the per-call metadata table.
df = pd.read_csv('../../data/raw/call_data.csv')

# Build a {SID: [line, ...]} mapping with one entry per file in the transcripts
# directory. The SID is the file name without its extension.
transcripts = {}
for filename in os.listdir('../../data/raw/transcripts'):
    sid, _extension = os.path.splitext(filename)
    with open(f'../../data/raw/transcripts/{filename}', 'r') as handle:
        # Strip surrounding whitespace, then drop a leading
        # "<digits><one character><whitespace>" prefix (presumably a line number
        # such as "12. "). NOTE(review): the "." in the pattern is unescaped, so it
        # matches any character, not only a literal dot -- confirm this is intended.
        transcripts[sid] = [
            re.sub(r'^\d+.\s', '', raw_line.strip()) for raw_line in handle
        ]

# Attach each call's transcript lines by SID (SIDs with no transcript file get NaN).
df['transcript'] = df['SID'].map(transcripts)
df

Unnamed: 0,SID,Had Timing Objection,Timing Objection Index,transcript
0,CA77596bc061516d795f6a60fbd274cb0e,False,,"[[Prospect] Hello?, [Sales Rep] Hey, John. H..."
1,CAd8395ea9fec545909e633bba6a8eb643,False,,"[[Prospect] Hello?, [Sales Rep] Hey, Ivan. S..."
2,CAf15429ca373443cd6a6a88573fe16f98,True,9,"[[Prospect] Hello?, [Sales Rep] Drake, this ..."
3,CA631c8faf6571f057e34bc12073da9f9c,False,,"[[Prospect] Hello?, [Sales Rep] File perform..."
4,CAbb4454527ef392d377ffd37a5bb00669,True,35,"[[Prospect] Hello?, [Sales Rep] Hey, Sean. I..."
...,...,...,...,...
193,CAb5a575591cfbaf24ac936657c5befd47,False,,"[[Sales Rep] Hey, Mark. Skyler with NUCs. I w..."
194,CA84e33cc6a9aac5c8ecf41fe25e8be7ce,True,9,"[[Prospect] Hello?, [Sales Rep] Maddie, hey...."
195,CAbfde17aa013e9896ece8102c8813f700,False,,"[[Sales Rep] Hello?, [Prospect] Hello?]"
196,CA16f2e7ff1215c1b52a509abd7c373732,False,,"[[Prospect] Hi. This is Brandon, [Sales Rep] ..."


In [7]:
# Preview the transcript column (one list of cleaned transcript lines per call)
df.transcript

0      [[Prospect]  Hello?, [Sales Rep]  Hey, John. H...
1      [[Prospect]  Hello?, [Sales Rep]  Hey, Ivan. S...
2      [[Prospect]  Hello?, [Sales Rep]  Drake, this ...
3      [[Prospect]  Hello?, [Sales Rep]  File perform...
4      [[Prospect]  Hello?, [Sales Rep]  Hey, Sean. I...
                             ...                        
193    [[Sales Rep]  Hey, Mark. Skyler with NUCs. I w...
194    [[Prospect]  Hello?, [Sales Rep]  Maddie, hey....
195            [[Sales Rep]  Hello?, [Prospect]  Hello?]
196    [[Prospect]  Hi. This is Brandon, [Sales Rep] ...
197    [[Prospect]  Paul speaking., [Sales Rep]  Hey,...
Name: transcript, Length: 198, dtype: object

In [6]:
# Clean the merged call data with the project Preprocessor and keep the cleaned frame.
# NOTE(review): the recorded output of this cell is
# "TypeError: expected string or bytes-like object", and its execution count (In [6])
# is lower than that of the preview cell above (In [7]). Re-check this step with
# Restart Kernel -> Run All. Possibly the list-valued `transcript` column is the
# cause -- confirm against the Preprocessor implementation.
preprocessor = Preprocessor(df)
preprocessor.clean_csv()
pre_processed_df = preprocessor.clean_df

TypeError: expected string or bytes-like object

In [None]:
# Fixed sequence length (in tokens) passed to the tokenizer as `max_length`
MAX_LEN = 300

## Torch Dataset Preparation

In [None]:
class GPReviewDataset(Dataset):
    """Torch dataset that tokenizes one text per item for BERT classification.

    Args:
        reviews: Indexable collection of texts; each item is converted with ``str``.
        targets: Indexable collection of class labels aligned with ``reviews``.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_len: Fixed sequence length; every encoding is padded and truncated to it.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the text, its encoding and its label for index ``item``.

        Returns:
            dict with keys ``review_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer) and
            ``targets`` (``torch.long`` tensor holding the label).
        """
        review = str(self.reviews[item])
        target = self.targets[item]

        # Request padding and truncation explicitly. The previous
        # `pad_to_max_length=True` flag is deprecated, and truncation was only
        # enabled implicitly through a backward-compatibility fallback.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # The tokenizer returns a leading batch dimension; flatten it away so
            # the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

# Split data into training, test, validation sets

In [None]:
# df_train , df_test = train_test_split(df,
#                                         test_size = 0.2,
#                                         random_state = 4263)
#
# df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=4263)
#
# print(df_train.shape, df_val.shape, df_test.shape)

In [None]:
# Read the train / test splits written by the training pipeline
df_train = pd.read_csv("../../data/processed/train_final_processed_reviews.csv", index_col="Unnamed: 0")
df_test = pd.read_csv("../../data/processed/test_final_processed_reviews.csv", index_col="Unnamed: 0")
# NOTE(review): the validation frame is an exact copy of the test frame, so the
# "validation" metrics and the best-checkpoint selection in the training loop are
# computed on the test data. Consider carving a separate validation split out of
# the training data instead.
df_val = df_test.copy()

In [None]:
# Inspect the validation frame
df_val

## Data Loader to release data in batches

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a DataFrame in a ``GPReviewDataset`` and return a batched ``DataLoader``.

    Args:
        df: DataFrame with a ``text`` column (inputs) and a ``sentiment`` column
            (labels).
        tokenizer: Tokenizer forwarded to ``GPReviewDataset``.
        max_len: Fixed sequence length forwarded to ``GPReviewDataset``.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples on every pass.
            Defaults to ``False``, which keeps the previous fixed-order behaviour
            (appropriate for validation and test data); pass ``True`` for a
            training split.

    Returns:
        A ``DataLoader`` over the dataset, loading in the main process
        (``num_workers=0``).
    """
    ds = GPReviewDataset(
        reviews=df.text.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0
    )

In [None]:
# Create train, test and val data loaders (same batch size and MAX_LEN for all three)
# NOTE(review): the training loader below does not request shuffling, so training
# batches arrive in the same order every epoch -- confirm that this is intended.
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Sanity check: pull one batch from the training loader and inspect its contents
data = next(iter(train_data_loader))
print(data.keys())

# Shapes of the batched tensors produced by the dataset / default collation
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

# Sentiment Classification with BERT and Hugging Face

In [None]:
# Load the basic BERT model
# NOTE: in the cells shown in this notebook this standalone instance is only used to
# print the hidden size; SentimentClassifier loads its own copy of the weights.
bert_model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
# Build the Sentiment Classifier class 
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Args:
        n_classes: Number of output classes (width of the returned logits).
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw output of the linear layer for a batch of encoded inputs."""
        # With return_dict=False the encoder returns a tuple; the second element
        # (the pooled output) feeds the classification head.
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        _, pooled_output = encoder_outputs
        # Dropout for regularisation, then project to the class scores.
        logits = self.out(self.drop(pooled_output))
        return logits

We use a dropout layer for some regularization and a fully-connected layer for our output. We return the raw output (logits) of the last layer, since that is what the cross-entropy loss function in PyTorch expects. Next, we create an instance and move it to the GPU (or the CPU when no GPU is available).

In [None]:
# Instantiate the classifier and move it to the selected device
# 1: positive, 0: negative
class_names = ['0','1']
model = SentimentClassifier(len(class_names))
model = model.to(device)

In [None]:
# Number of hidden units of the BERT encoder (input width of the classifier's linear layer)
print(bert_model.config.hidden_size)

## Training Phase

The AdamW optimizer provided by Hugging Face is used; it implements Adam with the weight-decay fix (decoupled weight decay). We'll also use a linear learning-rate scheduler with no warmup steps.

In [None]:
# Number of training epochs (full passes over the training loader)
EPOCHS = 4

# AdamW optimizer imported from transformers; correct_bias=False switches off its bias correction
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so the schedule spans batches-per-epoch * epochs
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule without warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Cross-entropy loss, applied to the raw model outputs
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
# Function for a single training iteration
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns one row of class scores per example.
        data_loader: Iterable of batches (dicts) with keys ``input_ids``,
            ``attention_mask`` and ``targets``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a float64 tensor
        (correct predictions / ``n_examples``); ``mean_loss`` is the mean of the
        per-batch losses, or ``nan`` when the loader yields no batches.
    """
    model = model.train()
    losses = []
    # Tensor accumulator: `.double()` below then also works for an empty loader
    # (a plain int 0 has no `.double()` method).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        # Clear gradients before the backward pass so that gradients left over
        # from an earlier (e.g. interrupted) run cannot leak into this update.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Backward prop
        loss.backward()

        # Clip the gradient norm, then update the weights and the learning rate
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

## Model Evaluation Function

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns one row of class scores per example.
        data_loader: Iterable of batches (dicts) with keys ``input_ids``,
            ``attention_mask`` and ``targets``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a float64 tensor
        (correct predictions / ``n_examples``); ``mean_loss`` is the mean of the
        per-batch losses, or ``nan`` when the loader yields no batches.
    """
    model = model.eval()

    losses = []
    # Tensor accumulator: `.double()` below then also works for an empty loader
    # (a plain int 0 has no `.double()` method).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # No gradients are needed for evaluation
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get model outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted class = index of the largest score in each row
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

## Training Loop

In [None]:
%%time

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'
history = defaultdict(list)
# Best validation accuracy seen so far; decides when a checkpoint is written
best_accuracy = 0

for epoch in range(EPOCHS):
    
    # Show details 
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)
    
    # One optimisation pass over the training loader
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    
    print(f"Train loss {train_loss} accuracy {train_acc}")
    
    # Get model performance (accuracy and loss) on the validation loader
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    
    print(f"Val   loss {val_loss} accuracy {val_acc}")
    print()
    
    # The accuracies are stored as returned by train_epoch / eval_model (tensors)
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # Checkpoint the weights whenever validation accuracy improves.
    # NOTE(review): the cells shown below evaluate and save the in-memory `model`
    # (state after the last epoch); 'best_model_state.bin' is not loaded again in
    # any active cell -- confirm which weights are meant to be reported.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

In [None]:
# Plot training and validation accuracy per epoch
# plt.plot(history['train_acc'], label='train accuracy')
# plt.plot(history['val_acc'], label='validation accuracy')

# The stored accuracies are tensors; .cpu() moves them to host memory before plotting
plt.plot(list(map(lambda x: x.cpu(),history['train_acc'])), label='train accuracy')
plt.plot(list(map(lambda x: x.cpu(),history['val_acc'])), label='validation accuracy')

# Title, axis labels, legend and a fixed 0-1 accuracy range
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

# Model Evaluation (For Training)

In [None]:
# Accuracy of the in-memory model on the test loader (the loss is discarded)
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

# Display the accuracy as a plain Python float
test_acc.item()

## Predictions (store text + predicted probabilities)

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect texts, predictions and probabilities.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns one row of class scores (logits) per example.
        data_loader: Iterable of batches (dicts) with keys ``review_text``,
            ``input_ids``, ``attention_mask`` and ``targets``.
        device: Device the batch tensors are moved to. When omitted, the device
            of the model's first parameter is used (CPU if the model has none).

    Returns:
        Tuple ``(review_texts, predictions, prediction_probs, real_values)``:
        the list of input texts, the predicted class indices, the softmax class
        probabilities (one row per example) and the true labels. All tensors are
        on the CPU; for an empty loader they are empty.
    """
    model = model.eval()

    if device is None:
        # Run on whatever device the model already lives on instead of relying
        # on a notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            # The model returns raw logits; convert them to probabilities so the
            # returned `prediction_probs` really are probabilities.
            probs = torch.softmax(outputs, dim=1)

            review_texts.extend(texts)
            predictions.append(preds.cpu())
            prediction_probs.append(probs.cpu())
            real_values.append(targets.cpu())

    if not predictions:
        # Nothing to concatenate: return empty tensors instead of raising.
        return (
            review_texts,
            torch.empty(0, dtype=torch.long),
            torch.empty(0, 0),
            torch.empty(0, dtype=torch.long),
        )

    predictions = torch.cat(predictions)
    prediction_probs = torch.cat(prediction_probs)
    real_values = torch.cat(real_values)

    return review_texts, predictions, prediction_probs, real_values

In [None]:
# Collect texts, predicted classes, per-class scores and true labels for the test loader
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)

In [None]:
# Class labels used in the report and confusion matrix (same values as set when the model was instantiated)
class_names = ['0','1']

In [None]:
# Per-class precision / recall / F1 for the in-memory model on the test loader
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated heatmap on the current figure.

    Args:
        confusion_matrix: Matrix of integer counts passed straight to
            ``sns.heatmap`` (cells are annotated with integer formatting).
    """
    heatmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Keep the y tick labels upright and tilt the x tick labels by 30 degrees.
    for axis, angle in ((heatmap.yaxis, 0), (heatmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')

# Confusion matrix of true vs. predicted labels, labelled with the class names and plotted
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

# Load Saved Model

In [None]:
# Load Saved Model
# from transformers import BertForSequenceClassification, BertConfig
# MODEL_NAME = "bert-base-cased"
# config = BertConfig.from_pretrained(MODEL_NAME)
# model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
# model = model.to(device)
# model.load_state_dict(torch.load("best_model_state.bin"),strict=False)

In [None]:
# File the trained weights (state dict) are saved to and loaded from
PATH = 'bert_state_dict_new_clean.pt'

In [None]:
# Save the state dict of the in-memory trained model to PATH
torch.save(model.state_dict(), PATH)

In [None]:
# Rebuild the classifier, load the saved weights (mapped onto the current device)
# and move the restored model to that device
saved_model = SentimentClassifier(len(class_names))
saved_model.load_state_dict(torch.load(PATH, map_location=device))
saved_model = saved_model.to(device)

In [None]:
# Accuracy of the reloaded model on the test loader (the loss is discarded)
test_acc, _ = eval_model(
  saved_model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

# Display the accuracy as a plain Python float
test_acc.item()

# Model Evaluation (For saved model)

In [None]:
# NOTE: this cell redefines get_predictions, which is already defined earlier in
# the notebook; the later definition is the one in effect from here on.
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect texts, predictions and probabilities.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns one row of class scores (logits) per example.
        data_loader: Iterable of batches (dicts) with keys ``review_text``,
            ``input_ids``, ``attention_mask`` and ``targets``.
        device: Device the batch tensors are moved to. When omitted, the device
            of the model's first parameter is used (CPU if the model has none).

    Returns:
        Tuple ``(review_texts, predictions, prediction_probs, real_values)``:
        the list of input texts, the predicted class indices, the softmax class
        probabilities (one row per example) and the true labels. All tensors are
        on the CPU; for an empty loader they are empty.
    """
    model = model.eval()

    if device is None:
        # Run on whatever device the model already lives on instead of relying
        # on a notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            # The model returns raw logits; convert them to probabilities so the
            # returned `prediction_probs` really are probabilities.
            probs = torch.softmax(outputs, dim=1)

            review_texts.extend(texts)
            predictions.append(preds.cpu())
            prediction_probs.append(probs.cpu())
            real_values.append(targets.cpu())

    if not predictions:
        # Nothing to concatenate: return empty tensors instead of raising.
        return (
            review_texts,
            torch.empty(0, dtype=torch.long),
            torch.empty(0, 0),
            torch.empty(0, dtype=torch.long),
        )

    predictions = torch.cat(predictions)
    prediction_probs = torch.cat(prediction_probs)
    real_values = torch.cat(real_values)

    return review_texts, predictions, prediction_probs, real_values

In [None]:
# Collect texts, predicted classes, per-class scores and true labels for the reloaded model
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    saved_model,
    test_data_loader
)

In [None]:
# Per-class precision / recall / F1 for the reloaded model on the test loader
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# NOTE: this cell redefines show_confusion_matrix, which is already defined
# earlier in the notebook with the same body.
def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated heatmap on the current figure.

    Args:
        confusion_matrix: Matrix of integer counts passed straight to
            ``sns.heatmap`` (cells are annotated with integer formatting).
    """
    heatmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Keep the y tick labels upright and tilt the x tick labels by 30 degrees.
    for axis, angle in ((heatmap.yaxis, 0), (heatmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')

# Confusion matrix for the reloaded model's predictions, labelled with the class names and plotted
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

# Prediction on raw text

In [None]:
# Example raw text to classify
review_text = "I  like the dog food I bought,as it is healthy!"

In [None]:
# Tokenize the raw text: add the special tokens, pad / truncate to MAX_LEN and
# return PyTorch tensors. Padding and truncation are requested explicitly; the
# former `pad_to_max_length=True` flag is deprecated.
encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

In [None]:
# Move the encoded text to the same device as the model
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Put the model in eval mode explicitly (disables dropout) rather than relying on
# an earlier cell having done so, and skip gradient tracking for inference.
model.eval()
with torch.no_grad():
    output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
# .item() turns the single-element tensor into a plain int for list indexing
print(f'Sentiment  : {class_names[prediction.item()]}')