In [59]:
import pandas as pd
import os
import re
import openai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import xgboost as xgb
import time
import contractions
import nltk
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import logging
import torch
import torch.nn.functional as F
import transformers.utils.logging
from torch import nn
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer

logging.basicConfig(level=logging.ERROR)
transformers.utils.logging.set_verbosity_error()
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
# Read the call data from the CSV file
call_data = pd.read_csv('../../data/raw/call_data.csv')

transcripts = {}

# Iterate over the files in the transcripts directory
for filename in os.listdir('../../data/raw/transcripts'):
    # Get the SID from the filename
    sid = os.path.splitext(filename)[0]
    
    with open(f'../../data/raw/transcripts/{filename}', 'r') as f:
        lines = f.readlines()
        
    lines = [re.sub(r'^\d+.\s', '', line.strip()) for line in lines]
    transcripts[sid] = lines

# Add the transcript data as a column to the call data
call_data['transcript'] = call_data['SID'].map(transcripts)

In [47]:
call_data.head()

Unnamed: 0,SID,Had Timing Objection,Timing Objection Index,transcript
0,CA77596bc061516d795f6a60fbd274cb0e,False,,"[[Prospect] Hello?, [Sales Rep] Hey, John. H..."
1,CAd8395ea9fec545909e633bba6a8eb643,False,,"[[Prospect] Hello?, [Sales Rep] Hey, Ivan. S..."
2,CAf15429ca373443cd6a6a88573fe16f98,True,9.0,"[[Prospect] Hello?, [Sales Rep] Drake, this ..."
3,CA631c8faf6571f057e34bc12073da9f9c,False,,"[[Prospect] Hello?, [Sales Rep] File perform..."
4,CAbb4454527ef392d377ffd37a5bb00669,True,35.0,"[[Prospect] Hello?, [Sales Rep] Hey, Sean. I..."


In [52]:
call_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   SID                     198 non-null    object
 1   Had Timing Objection    198 non-null    bool  
 2   Timing Objection Index  198 non-null    object
 3   transcript              198 non-null    object
 4   flattened_transcript    198 non-null    object
 5   cleaned_transcript      198 non-null    object
dtypes: bool(1), object(5)
memory usage: 8.1+ KB


In [53]:
# distribution of the target variable
class_distribution = call_data['Had Timing Objection'].value_counts()
class_distribution

False    164
True      34
Name: Had Timing Objection, dtype: int64

# Cleaning

In [54]:
# necessary package downloads
nltk.download("averaged_perceptron_tagger", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)

stopwords = [
    "a",
    "an",
    "and",
    "are",
    "as",
    "at",
    "be",
    "by",
    "can",
    "could",
    "did",
    "do",
    "does",
    "for",
    "from",
    "had",
    "has",
    "have",
    "he",
    "hence",
    "her",
    "hers",
    "him",
    "his",
    "if",
    "in",
    "is",
    "it",
    "its",
    # "may",
    # "might",
    # "must",
    "of",
    "on",
    "or",
    "shall",
    "should",
    "since",
    "so",
    "some",
    "such",
    "that",
    "the",
    "their",
    "them",
    "then",
    "there",
    "these",
    "they",
    "this",
    "those",
    "to",
    "was",
    "we",
    "were",
    "when",
    "where",
    "which",
    "while",
    "who",
    "whom",
    "whose",
    "will",
    "with",
    "would",
    "yet",
    "you",
    "your",
    "yours",
    "about",
    "above",
    "across",
    "after",
    "against",
    "along",
    "among",
    "around",
    "before",
    "behind",
    "below",
    "beneath",
    "beside",
    "between",
    "beyond",
    "during",
    "inside",
    "into",
    "near",
    "outside",
    "over",
    "through",
    "under",
    "upon",
    "within",
    "without",
    "been",
    "having",
    "once",
    "other",
    "until",
    "more",
    "less",
    "own",
    "also",
    "each",
    "every",
    "any",
    "all",
    "some",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "many",
    "several",
    "few",
    "less",
    "more",
    "most",
    "several",
    "how",
    "anyway",
    "however",
    "just",
    "quite",
    "i",
]

stopwords = list(stopwords)

In [55]:
def clean_sentence(sentence, stop_words):
    """
    Clean a single string by removing tags, resolving contractions, removing digits and special characters,
    tokenizing, changing to lower case and removing punctuations, removing stop words, finding the POS tag
    for each token, and lemmatizing each token.

    Parameters:
        sentence (str):A single string to be cleaned.

        stop_words (list): A list of stop words to be removed from the string.

    Returns:
        str: The cleaned string.
    """
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith("J"):
            return wordnet.ADJ
        elif nltk_tag.startswith("V"):
            return wordnet.VERB
        elif nltk_tag.startswith("N"):
            return wordnet.NOUN
        elif nltk_tag.startswith("R"):
            return wordnet.ADV
        else:
            return None

    lemmatizer = WordNetLemmatizer()
    sentence = re.sub(r"<.*?>|Length::\d+:\d+Mins", " ", sentence)
    sentence = contractions.fix(sentence)
    sentence = re.sub(r"[^a-zA-Z\s]", " ", sentence)
    words = word_tokenize(sentence)
    words = [word.lower() for word in words if word.isalpha()]
    words = [word for word in words if word not in stop_words]
    pos_tagged = nltk.pos_tag(words)
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_sentence.append(word)
        else:
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    lemmatized_sentence = " ".join(lemmatized_sentence)
    return lemmatized_sentence

def truncate_to_512(sentence):
    words = word_tokenize(sentence)
    pos_tagged = nltk.pos_tag(words)
    nouns_adjectives = [word for word, tag in pos_tagged if tag.startswith("N") or tag.startswith("J")]
    remaining = [word for word in words if word not in nouns_adjectives]
    if len(nouns_adjectives) <= 512:
        combined = nouns_adjectives + remaining[: 512 - len(nouns_adjectives)]
    else:
        combined = nouns_adjectives[:512]
    return " ".join(combined)

def preprocess_text_from_list(text_list):
    # Join the list elements into a single string
    combined_text = ' '.join(text_list)
    # Remove special characters and numbers
    combined_text = re.sub(r'\W+|\d+', ' ', combined_text)
    return combined_text
    
call_data['flattened_transcript'] = call_data['transcript'].apply(preprocess_text_from_list)
call_data['cleaned_transcript'] = call_data['flattened_transcript'].apply(lambda x: clean_sentence(x, stopwords))
#call_data['truncated_transcript'] = call_data['cleaned_transcript'].apply(truncate_to_512)

call_data.head()

Unnamed: 0,SID,Had Timing Objection,Timing Objection Index,transcript,flattened_transcript,cleaned_transcript
0,CA77596bc061516d795f6a60fbd274cb0e,False,,"[[Prospect] Hello?, [Sales Rep] Hey, John. H...",Prospect Hello Sales Rep Hey John Hey John It...,prospect hello sale rep hey john hey john s s ...
1,CAd8395ea9fec545909e633bba6a8eb643,False,,"[[Prospect] Hello?, [Sales Rep] Hey, Ivan. S...",Prospect Hello Sales Rep Hey Ivan Skyler with...,prospect hello sale rep hey ivan skyler nook h...
2,CAf15429ca373443cd6a6a88573fe16f98,True,9.0,"[[Prospect] Hello?, [Sales Rep] Drake, this ...",Prospect Hello Sales Rep Drake this is Josh w...,prospect hello sale rep drake josh nook s s tu...
3,CA631c8faf6571f057e34bc12073da9f9c,False,,"[[Prospect] Hello?, [Sales Rep] File perform...",Prospect Hello Sales Rep File performance Hey...,prospect hello sale rep file performance hey a...
4,CAbb4454527ef392d377ffd37a5bb00669,True,35.0,"[[Prospect] Hello?, [Sales Rep] Hey, Sean. I...",Prospect Hello Sales Rep Hey Sean It s Patric...,prospect hello sale rep hey sean s patrick cal...


# Feature engineering

In [42]:
def compound_polarity_score(sentence):
    """
    Calculate the compound polarity score of a sentence using VaderSentiment.

    Parameters
    ----------
    sentence : str
        The sentence to be analyzed.

    Returns
    -------
    float
        The compound polarity score of the sentence.
    """
    sid_obj = SentimentIntensityAnalyzer()
    score = sid_obj.polarity_scores(sentence)["compound"]
    return score


def get_subjectivity(sentence):
    """
    Calculate the subjectivity score of a sentence using TextBlob.

    Parameters
    ----------
    sentence : str
        The sentence to be analyzed.

    Returns
    -------
    float
        The subjectivity score of the sentence.
    """
    return round(TextBlob(sentence).sentiment.subjectivity, 6)


def count_pos_neg_neutral(sentence):
    """
    Count the number of positive, negative, and neutral words in a sentence using VaderSentiment.

    Parameters
    ----------
    sentence : str
        The sentence to be analyzed.

    Returns
    -------
    list
        A list of three integers representing the count of positive, negative, and neutral words in the sentence.
    """

    text_split = sentence.split()
    sid = SentimentIntensityAnalyzer()
    pos_word_list = []
    neu_word_list = []
    neg_word_list = []

    for word in text_split:
        if (sid.polarity_scores(word)["compound"]) >= 0.5:
            pos_word_list.append(word)
        elif (sid.polarity_scores(word)["compound"]) <= -0.5:
            neg_word_list.append(word)
        else:
            neu_word_list.append(word)
    return [len(pos_word_list), len(neg_word_list), len(neu_word_list)]

    # Define function to count number of lowercase


def count_lower(sentence):
    """
    Count the number of lowercase words in a sentence.

    Parameters
    ----------
    sentence : str
        The sentence to be analyzed.

    Returns
    -------
    int
        The number of lowercase words in the sentence.
    """
    words = nltk.word_tokenize(sentence)
    count = 0
    for word in words:
        if not word.isupper():  # eg Real is considered lowercase
            count += 1
    return count


# Define function to count number of uppercase
def count_upper(sentence):
    """
    Count the number of uppercase words in a sentence.

    Parameters
    ----------
    sentence : str
        The sentence to be analyzed.

    Returns
    -------
    int
        The number of uppercase words in the sentence.
    """
    words = nltk.word_tokenize(sentence)
    count = 0
    for word in words:
        if word.isupper():
            if len(word) > 1:  # exclude one letter words eg 'I'
                count += 1
    return count


# Define function to list uppercase words
def uppercase_list(sentence):
    """
    Get a list of uppercase words in a sentence.

    Parameters
    ----------
    sentence : str
        Input sentence.

    Returns
    -------
    str
        A comma-separated string of uppercase words in the sentence.
    """
    words = nltk.word_tokenize(sentence)
    uppercase = []
    for word in words:
        if word.isupper():
            if len(word) > 1:  # exclude one letter words eg 'I'
                uppercase.append(word)
    uppercase = ", ".join(uppercase)
    return uppercase


# Define function to get uppercase:total tokens ratio
def uppercase_ratio(sentence):
    """
    Get the ratio of uppercase words to total tokens in a sentence.

    Parameters
    ----------
    sentence : str
        Input sentence.

    Returns
    -------
    float
        The ratio of uppercase words to total tokens, rounded to 6 decimal places.
    """
    words = nltk.word_tokenize(sentence)
    count = 0
    for word in words:
        if word.isupper():
            if len(word) > 1:  # exclude 'I'
                count += 1
    ratio = count / len(words)
    ratio = round(ratio, 6)
    return ratio


# Define function to count number of punctuations
def count_punc(sentence):
    """
    Count the number of punctuations in a sentence.

    Parameters
    ----------
    sentence : str
        Input sentence.

    Returns
    -------
    int
        The number of punctuations in the sentence.
    """
    words = nltk.word_tokenize(sentence)
    count = 0
    for word in words:
        if word in string.punctuation:
            count += 1
    return count


def pos_tags(sentence):
    """
    Get the part-of-speech (POS) tags of the words in a sentence.

    Parameters
    ----------
    sentence : str
        Input sentence.

    Returns
    -------
    list of tuples
        A list of tuples, where each tuple contains a word and its corresponding POS tag.
    """
    tokenized_sentence = nltk.word_tokenize(sentence.lower())
    tagged = nltk.pos_tag(tokenized_sentence)
    return tagged

In [56]:
# Apply the feature engineering functions to your DataFrame
call_data['compound_polarity'] = call_data['flattened_transcript'].apply(compound_polarity_score)
call_data['subjectivity'] = call_data['flattened_transcript'].apply(get_subjectivity)
call_data[['num_positive', 'num_negative', 'num_neutral']] = call_data['flattened_transcript'].apply(count_pos_neg_neutral).apply(pd.Series)
call_data['lowercase_count'] = call_data['flattened_transcript'].apply(count_lower)
call_data['uppercase_count'] = call_data['flattened_transcript'].apply(count_upper)
call_data['pos_tag'] = call_data['flattened_transcript'].apply(pos_tags)


In [57]:
call_data.head()

Unnamed: 0,SID,Had Timing Objection,Timing Objection Index,transcript,flattened_transcript,cleaned_transcript,compound_polarity,subjectivity,num_positive,num_negative,num_neutral,lowercase_count,uppercase_count,pos_tag
0,CA77596bc061516d795f6a60fbd274cb0e,False,,"[[Prospect] Hello?, [Sales Rep] Hey, John. H...",Prospect Hello Sales Rep Hey John Hey John It...,prospect hello sale rep hey john hey john s s ...,0.9999,0.501447,9,0,1605,1555,9,"[(prospect, JJ), (hello, NN), (sales, NNS), (r..."
1,CAd8395ea9fec545909e633bba6a8eb643,False,,"[[Prospect] Hello?, [Sales Rep] Hey, Ivan. S...",Prospect Hello Sales Rep Hey Ivan Skyler with...,prospect hello sale rep hey ivan skyler nook h...,0.9999,0.525811,8,3,1575,1476,13,"[(prospect, JJ), (hello, NN), (sales, NNS), (r..."
2,CAf15429ca373443cd6a6a88573fe16f98,True,9.0,"[[Prospect] Hello?, [Sales Rep] Drake, this ...",Prospect Hello Sales Rep Drake this is Josh w...,prospect hello sale rep drake josh nook s s tu...,0.9996,0.611293,5,1,586,555,0,"[(prospect, JJ), (hello, NN), (sales, NNS), (r..."
3,CA631c8faf6571f057e34bc12073da9f9c,False,,"[[Prospect] Hello?, [Sales Rep] File perform...",Prospect Hello Sales Rep File performance Hey...,prospect hello sale rep file performance hey a...,0.992,0.517547,3,2,277,284,0,"[(prospect, JJ), (hello, NN), (sales, NNS), (r..."
4,CAbb4454527ef392d377ffd37a5bb00669,True,35.0,"[[Prospect] Hello?, [Sales Rep] Hey, Sean. I...",Prospect Hello Sales Rep Hey Sean It s Patric...,prospect hello sale rep hey sean s patrick cal...,0.9999,0.490979,18,0,1955,1900,6,"[(prospect, JJ), (hello, NN), (sales, NNS), (r..."


# Modelling

In [60]:
X = call_data.drop(["Had Timing Objection"], axis=1)
y = call_data['Had Timing Objection']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(max_features=5000) # Limiting to 5000 features for simplicity and speed

X_train_bow = tfidf.fit_transform(X_train['cleaned_transcript'])
X_train_bow = pd.DataFrame(X_train_bow.toarray(), columns=self.vectorizer.get_feature_names_out())

X_train_clean = X_train.drop(["flattened_transcript", "cleaned_transcript",], axis=1)
X_train_concat = pd.concat([X_train_clean, X_train_bow], axis=1)
X_train_concat = X_train_concat.loc[:, ~X_train_concat.columns.duplicated()].copy()

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_concat)

KeyError: 'clean_transcript'

In [None]:
X_train_scaled

## Baseline model using tf-idf and log reg

In [22]:
# log reg with class weights as data is imbalanced
log_reg_balanced = LogisticRegression(class_weight='balanced')

# train
log_reg_balanced.fit(X_train, y_train)

# predict
start_time_balanced = time.time()
y_pred_balanced = log_reg_balanced.predict(X_test)
end_time_balanced = time.time()

# time taken for prediction
prediction_time_balanced = end_time_balanced - start_time_balanced

# precision and recall
precision_balanced = precision_score(y_test, y_pred_balanced)
recall_balanced = recall_score(y_test, y_pred_balanced)

precision_balanced, recall_balanced, prediction_time_balanced

(0.25, 1.0, 0.00019407272338867188)

### Log reg with grid search 

In [24]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # regularization strength
    'solver': ['liblinear', 'newton-cg', 'lbfgs']  # optimizationa algo
}

grid_search = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, 
                           scoring='precision', cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

log_reg_tuned = LogisticRegression(class_weight='balanced', **best_params)
log_reg_tuned.fit(X_train, y_train)

start_time_tuned = time.time()
y_pred_tuned = log_reg_tuned.predict(X_test)
end_time_tuned = time.time()

prediction_time_tuned = end_time_tuned - start_time_tuned

precision_tuned = precision_score(y_test, y_pred_tuned)
recall_tuned = recall_score(y_test, y_pred_tuned)

best_params, precision_tuned, recall_tuned, prediction_time_tuned

({'C': 100, 'solver': 'liblinear'},
 0.2,
 0.3333333333333333,
 0.0001049041748046875)

## SVM

In [26]:
# SVM
svm_model = SVC(class_weight='balanced', probability=True)

# Train
svm_model.fit(X_train, y_train)

# Predict
start_time_svm = time.time()
y_pred_svm = svm_model.predict(X_test)
end_time_svm = time.time()

# time taken for prediction
prediction_time_svm = end_time_svm - start_time_svm

# precision and recall
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)

precision_svm, recall_svm, prediction_time_svm

(0.2222222222222222, 0.6666666666666666, 0.004116058349609375)

## Using Ngrams

In [32]:
tfidf_ngram = TfidfVectorizer(max_features=5000, ngram_range=(1, 3)) # unigrams, bigrams, and trigrams

X_ngram = tfidf_ngram.fit_transform(call_data['clean_transcript'])

X_train_ngram, X_test_ngram, y_train, y_test = train_test_split(X_ngram, y, test_size=0.2, random_state=42)

log_reg_ngram = LogisticRegression(class_weight='balanced')
log_reg_ngram.fit(X_train_ngram, y_train)

svm_model_ngram = SVC(class_weight='balanced', probability=True)
svm_model_ngram.fit(X_train_ngram, y_train)

y_pred_log_reg_ngram = log_reg_ngram.predict(X_test_ngram)
precision_log_reg_ngram = precision_score(y_test, y_pred_log_reg_ngram)
recall_log_reg_ngram = recall_score(y_test, y_pred_log_reg_ngram)

y_pred_svm_ngram = svm_model_ngram.predict(X_test_ngram)
precision_svm_ngram = precision_score(y_test, y_pred_svm_ngram)
recall_svm_ngram = recall_score(y_test, y_pred_svm_ngram)

(precision_log_reg_ngram, recall_log_reg_ngram), (precision_svm_ngram, recall_svm_ngram)

((0.18181818181818182, 0.6666666666666666), (0.25, 0.6666666666666666))

## XGB

In [31]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=(len(y)-sum(y))/sum(y))

# Train the XGBoost model
xgb_model.fit(X_train_ngram, y_train)

# Predict on the test set with the XGBoost model
start_time_xgb = time.time()
y_pred_xgb = xgb_model.predict(X_test_ngram)
end_time_xgb = time.time()

# Calculate the time taken for prediction
prediction_time_xgb = end_time_xgb - start_time_xgb

# Calculate precision and recall for the XGBoost model
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)

precision_xgb, recall_xgb, prediction_time_xgb

(0.5, 0.6666666666666666, 0.0009341239929199219)