This is a slightly edited version of the kindly shared code from https://github.com/amunategui/Chatbot-Conversations/blob/master/Chatbot-Conversations.ipynb which is discussed in some detail in https://www.springml.com/blog/building-intents-customer-service-transcripts/.

The objective is to replicate some of these techniques in a later version in R, with parallelization, and compare results.

In [1]:
import numpy as np
import pandas as pd
import sys, os, re, itertools, collections, string, time
from io import BytesIO
from collections import Counter
from time import time
import datetime

In [2]:
# https://catalog.data.gov/dataset/consumer-complaint-database 
complaints_df_raw = pd.read_csv("complaints.csv", 
                usecols=('Product','Consumer complaint narrative', 'Sub-issue'),
                dtype={'consumer_complaint_narrative': object})
# Only interested in data with consumer complaints
complaints_df_raw=complaints_df_raw[complaints_df_raw['Consumer complaint narrative'].notnull()]
complaints_df_raw=complaints_df_raw[complaints_df_raw['Product'].notnull()]

# remove XXXX from narratives
complaints_df_raw['Consumer complaint narrative'] =  complaints_df_raw['Consumer complaint narrative'].replace({'X':''}, regex=True)

# always seed your random generators for reporducilibity 
complaints_df_raw = complaints_df_raw.sample(200000, replace=False, random_state=1)

# basic sentence prep
# set to lower
complaints_df_raw['Consumer complaint narrative'] = complaints_df_raw['Consumer complaint narrative'].str.lower()
# remove special characters
complaints_df_raw['Consumer complaint narrative'] = complaints_df_raw['Consumer complaint narrative'].str.replace('\W', ' ')

# remove elements with no text
complaints_df_raw= complaints_df_raw[complaints_df_raw['Consumer complaint narrative'] != '']

# any dups
complaints_df_raw = complaints_df_raw.drop_duplicates(subset=['Consumer complaint narrative'])

  complaints_df_raw['Consumer complaint narrative'] = complaints_df_raw['Consumer complaint narrative'].str.replace('\W', ' ')


In [3]:
complaints_df_raw.head()

Unnamed: 0,Product,Sub-issue,Consumer complaint narrative
430854,"Credit reporting, credit repair services, or o...",Their investigation did not fix an error on yo...,experian is reporting incorrectly that i am 15...
200159,Payday loan,,on i received an alert that i have a new c...
96476,"Credit reporting, credit repair services, or o...",Account status incorrect,transunion llc pa re letter to remov...
2150206,Debt collection,Didn't receive notice of right to dispute,thank you for your recent communication wherei...
759119,Credit reporting,Information is not mine,i am disputing the hard inquiries that are on...


# Clean Up Data 

In [4]:
complaints_df = complaints_df_raw.copy()

In [5]:
word_similarity=complaints_df['Consumer complaint narrative'].str.split(' ').map(Counter)
word_similarity_ratio = []
complaints_df.shape 

(188091, 3)

In [6]:
for wu in word_similarity:
    word_similarity_ratio.append(np.sum([x[1] for x in wu.items()])/np.float(len(wu)))
    
complaints_df['narrative_similarity_ratio'] = word_similarity_ratio
complaints_df['narrative_similarity_ratio'].describe()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  word_similarity_ratio.append(np.sum([x[1] for x in wu.items()])/np.float(len(wu)))


count    188091.000000
mean          2.125075
std           0.945147
min           1.000000
25%           1.621951
50%           1.972477
75%           2.420455
max          52.850000
Name: narrative_similarity_ratio, dtype: float64

In [7]:
# thin out some entries that contain too much duplicated lines within
complaints_df = complaints_df[complaints_df['narrative_similarity_ratio'] <= 1.7]
complaints_df.reset_index(drop=True,inplace=True)
complaints_df.shape

(57501, 4)

In [8]:
list(complaints_df['Consumer complaint narrative'])[0:4]

['was put on a nine month rehab program  completed it and was charged   13000 00  additional interest along with a   6000 00  fee after completion and they kept taking off the once one driven repayment plan before the renewal one year time was up and was not allowed to regain it along with being told call the deparrment of education to ask them to take off the additional interest  asked numerous times to send me the forms in the mail to reinstate for income driven repayment plan and never received anything  then started getting 10 robo calls a day from unknown numbers along with navient     calling and harrassing me with nonstop daily calls leaving message or voicemail with someone talking so fast noone could understand  this is ridiculous and illegal and harassment',
 'all attempts to settle the account and make payment are attached below  the company chrysler capital failed to respond to all document presentments as well as has taken the personal property and has kept the financial i

# Get Key Verbs And Nouns

In [9]:
# find most common verbs and measure coverage 
import spacy
# Load English tokenizer, tagger, parser, NER and word vectors
#nlp = spacy.load('en')

# just load what we need to avoid taxing memory
nlp = spacy.load('en_core_web_sm') # you can install 'en_core_web_sm' with this command in Anaconda Prompt (for Windows) - python -m spacy download en_core_web_sm - see https://stackoverflow.com/a/57989297/4856426

In [10]:
# create one big blob of text to process things a bit faster
blob_complaints = ''.join(list(complaints_df['Consumer complaint narrative']))

# Max text of length of 1000000
n = 900000
blog_chunks = [blob_complaints[i:i+n] for i in range(0, len(blob_complaints), n)]
len(blog_chunks)

21

In [11]:
nlp

<spacy.lang.en.English at 0x1d10d147940>

In [12]:
just_verbs = []
just_nouns = []
counter_=len(blog_chunks)
for sentence in blog_chunks:
    counter_ -= 1
    if (counter_ % 10 == 0): print(counter_)
    print(counter_)
    # doc = nlp(sentence.decode('utf-8'))
    doc = nlp(sentence)
    temp_verb = []
    temp_noun = []
    for token in doc: 
        if (token.pos_ == u'VERB'): 
            temp_verb.append(token.text)
        if (token.pos_ == u'NOUN'):
            temp_noun.append(token.text)
            

    # just_verbs.append(' '.join(temp_verb).encode('utf-8'))
    just_verbs.append(' '.join(temp_verb))
    # just_nouns.append(' '.join(temp_noun).encode('utf-8'))
    just_nouns.append(' '.join(temp_noun))

20
20
19
18
17
16
15
14
13
12
11
10
10
9
8
7
6
5
4
3
2
1
0
0


In [13]:
just_verbs[0].split()[0:10]

['put',
 'completed',
 'charged',
 'kept',
 'taking',
 'driven',
 'allowed',
 'regain',
 'told',
 'call']

In [14]:
just_nouns[0].split()[0:10]

['month',
 'interest',
 'fee',
 'completion',
 'repayment',
 'plan',
 'renewal',
 'year',
 'time',
 'deparrment']

In [15]:
print('count just_verbs: %i' % len(just_verbs))
print('count just_nouns: %i' % len(just_nouns))

count just_verbs: 21
count just_nouns: 21


In [16]:
# pickle both objects so you don't have to re-run spacy 
import pickle
pickle_file = "verbs_nouns.p"

overwrite_old_pickle = True
if overwrite_old_pickle:
    with open(pickle_file, "wb") as f:
        pickle.dump([just_verbs, just_nouns], f)
    
# read in saved pickle
with open(pickle_file, "rb") as f:
    backup_pos = pickle.load(f)

In [17]:
all_verbs = backup_pos[0]
len(all_verbs)

# append all verbs together so we can run frequency counts
verbs = []
for verb_set in all_verbs:
    verbs.append(verb_set.split())
    #verbs = [verb for verb in verb_set[0].split()]

len(verbs)
verbs_master = [val for sublist in verbs for val in sublist]
len(verbs_master)

508466

In [18]:
# what is your upper and lower cut offs?
from collections import Counter
verbs_df = pd.DataFrame(Counter([verb for verb in verbs_master]).most_common(), columns = ['verb', 'count'])
verbs_df.head(20)

Unnamed: 0,verb,count
0,have,15375
1,sent,8713
2,received,8108
3,reporting,6916
4,removed,6398
5,paid,6332
6,had,6254
7,get,6140
8,remove,6062
9,called,5429


In [19]:
len(verbs_df[verbs_df['count'] > 1000])
verbs_df = verbs_df[verbs_df['count'] > 1000]
len(verbs_df)

117

# Sorting Out Nouns

In [20]:
all_nouns = backup_pos[1]

# append all verbs together so we can run frequency counts
nouns = []
for noun_set in all_nouns:
    nouns.append(noun_set.split())

nouns_master = [val for sublist in nouns for val in sublist]
len(nouns_master)

715210

In [21]:
# what is your upper and lower cut offs?
from collections import Counter
nouns_df = pd.DataFrame(Counter([noun for noun in nouns_master]).most_common(), columns = ['noun', 'count'])
nouns_df.head()

Unnamed: 0,noun,count
0,credit,48843
1,account,28241
2,report,23484
3,information,15400
4,debt,14560


In [22]:
len(nouns_df[nouns_df['count'] > 1000])
nouns_df = nouns_df[nouns_df['count'] > 1000]
len(nouns_df)

130

### Binarize DataFrame With Official Verb & Noun List

In [23]:
# create new data frame with key verbs and nouns as features
key_words = list(nouns_df['noun']) + list(verbs_df['verb'])
row_bools = []
counter_ = len(complaints_df['Consumer complaint narrative'])
for sentence in complaints_df['Consumer complaint narrative']:
    counter_ -= 1
    if (counter_ % 10000 == 0): print(counter_)
    row_bool = []
    words = sentence.split()
    for kw in key_words:
        row_bool.append(kw in words)
    row_bools.append(row_bool)
    
print('length:', len(row_bools))
row_bools = pd.DataFrame(row_bools, columns=key_words)    
row_bools = row_bools.astype(int)
row_bools.shape


50000
40000
30000
20000
10000
0
length: 57501


(57501, 247)

In [24]:
row_bools.head()

Unnamed: 0,credit,account,report,information,debt,company,accounts,loan,payment,card,...,shows,having,sold,was,sending,certified,according,informed,respond,needs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,1,1,0,1,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,0
3,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0


In [25]:
row_bools.tail()

Unnamed: 0,credit,account,report,information,debt,company,accounts,loan,payment,card,...,shows,having,sold,was,sending,certified,according,informed,respond,needs
57496,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57497,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57498,0,1,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
57499,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57500,1,1,1,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


### Cluster of popular sentences

In [26]:
from sklearn.cluster import KMeans

TOTAL_CLUSTERS = 50

# Number of clusters
kmeans = KMeans(n_clusters=TOTAL_CLUSTERS)
# Fitting the input data
kmeans = kmeans.fit(row_bools)
# Getting the cluster labels
labels = kmeans.predict(row_bools)

# add cluster back to data frame 
row_bools['cluster'] = labels

row_bools['cluster'].value_counts().head()

0     3501
5     2703
27    2418
41    2240
49    2138
Name: cluster, dtype: int64

In [27]:
row_bools['cluster'].value_counts().head()

0     3501
5     2703
27    2418
41    2240
49    2138
Name: cluster, dtype: int64

In [28]:
# add cluster number back to orginal corpus
complaints_df['Cluster'] = labels
# import sys
# reload(sys)
# sys.setdefaultencoding('utf8')
import itertools
from collections import Counter
import nltk
# nltk.download('punkt') # if you get NLTK Resource punkt not found error - https://www.tutorialexample.com/fix-nltk-resource-punkt-not-found-nltk-tutorial/ - only need to download once
from nltk.util import ngrams

unique_complaints_2grams = []
unique_complaints_3grams = []
unique_complaints_4grams = []
unique_complaints_5grams = []
unique_complaints_6grams = []
# loop through each cluster
for cluster_to_search in range(min(row_bools['cluster']), max(row_bools['cluster'])+1):
    # cluster-level research
    print('Cluster: %i' % cluster_to_search)
    df_tmp = complaints_df[complaints_df['Cluster']==cluster_to_search].copy()
    print('data cluster shape: %s' % len(df_tmp))
    
    bigrams = []
    trigrams = []
    fourgrams = []
    fivegrams = []
    sixgrams = []
    
    for index, row in df_tmp.iterrows(): 
        # token = nltk.word_tokenize(row['Consumer complaint narrative'].decode('utf-8'))
        token = nltk.word_tokenize(row['Consumer complaint narrative'])
        bigrams.append([' '.join(pair) for pair in list(ngrams(token,2)) if len(set(pair))==2])
        trigrams.append([' '.join(pair) for pair in list(ngrams(token,3)) if len(set(pair))==3])
        fourgrams.append([' '.join(pair) for pair in list(ngrams(token,4)) if len(set(pair))==4])
        fivegrams.append([' '.join(pair) for pair in list(ngrams(token,5)) if len(set(pair))==5])
        sixgrams.append([' '.join(pair) for pair in list(ngrams(token,6)) if len(set(pair))==6])
        
    bigrams = [val for sublist in bigrams for val in sublist]
    trigrams = [val for sublist in trigrams for val in sublist]
    fourgrams = [val for sublist in fourgrams for val in sublist]
    fivegrams = [val for sublist in fivegrams for val in sublist]
    sixgrams = [val for sublist in sixgrams for val in sublist]
    
    # find top x most popular grams per size
    # 2 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in bigrams]).most_common(50), columns=['bigrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_2grams.append(freqx)
    # 3 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in trigrams]).most_common(50), columns=['trigrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_3grams.append(freqx)
    # 4 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in fourgrams]).most_common(50), columns=['fourgrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_4grams.append(freqx)
    # 5 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in fivegrams]).most_common(50), columns=['fivegrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_5grams.append(freqx)
    # 6 bigrams
    freqx = pd.DataFrame(Counter([noun for noun in sixgrams]).most_common(50), columns=['sixgrams','frequency'])
    freqx['Cluster'] = cluster_to_search
    unique_complaints_6grams.append(freqx)

Cluster: 0
data cluster shape: 3501
Cluster: 1
data cluster shape: 1494
Cluster: 2
data cluster shape: 1494
Cluster: 3
data cluster shape: 805
Cluster: 4
data cluster shape: 1026
Cluster: 5
data cluster shape: 2703
Cluster: 6
data cluster shape: 807
Cluster: 7
data cluster shape: 1106
Cluster: 8
data cluster shape: 654
Cluster: 9
data cluster shape: 77
Cluster: 10
data cluster shape: 1299
Cluster: 11
data cluster shape: 1478
Cluster: 12
data cluster shape: 1149
Cluster: 13
data cluster shape: 1017
Cluster: 14
data cluster shape: 920
Cluster: 15
data cluster shape: 1364
Cluster: 16
data cluster shape: 114
Cluster: 17
data cluster shape: 819
Cluster: 18
data cluster shape: 809
Cluster: 19
data cluster shape: 942
Cluster: 20
data cluster shape: 1198
Cluster: 21
data cluster shape: 401
Cluster: 22
data cluster shape: 1751
Cluster: 23
data cluster shape: 1881
Cluster: 24
data cluster shape: 1375
Cluster: 25
data cluster shape: 131
Cluster: 26
data cluster shape: 37
Cluster: 27
data cluster 

In [29]:
df = pd.concat(unique_complaints_4grams)
# freqx = pd.DataFrame(Counter([noun for noun in fourgrams]).most_common(50), columns=['fourgrams','frequency'])
df = df.drop_duplicates(subset=['fourgrams'], keep=False)
df.head()

Unnamed: 0,fourgrams,frequency,Cluster
10,my credit profile is,14,0
11,credit profile is inaccurate,14,0
12,profile is inaccurate which,14,0
13,inaccurate which is not,14,0
14,which is not fair,14,0


In [30]:
# find top x most popular grams per size
see_grams = 6


if see_grams==2:
    df = pd.concat(unique_complaints_2grams)
    df = df.drop_duplicates(subset=['bigrams'], keep=False)
elif see_grams==3:
    df = pd.concat(unique_complaints_3grams)
    df = df.drop_duplicates(subset=['trigrams'], keep=False)
elif see_grams==4:
    df = pd.concat(unique_complaints_4grams)
    df = df.drop_duplicates(subset=['fourgrams'], keep=False)
elif see_grams==5:
    df = pd.concat(unique_complaints_5grams)
    df = df.drop_duplicates(subset=['fivegrams'], keep=False)
elif see_grams==6:
    df = pd.concat(unique_complaints_6grams)
    df = df.drop_duplicates(subset=['sixgrams'], keep=False)
 
df = df.sort_values('Cluster')
df[df['frequency'] > 10]  

Unnamed: 0,sixgrams,frequency,Cluster
0,my credit profile is inaccurate which,14,0
1,inaccurate which is not fair to,14,0
2,which is not fair to me,14,0
3,is not fair to me please,14,0
4,not fair to me please investigate,14,0
...,...,...,...
19,that i have no knowledge of,12,49
17,from my credit report i have,13,49
14,i have not applied for any,14,49
12,my credit report and i have,15,49


### Tie It Back To Complaint

In [31]:
# tie it back to look into a couple of actual complaints
keywords = "attempting to collect a debt from"
 
for index, row in complaints_df.iterrows():
    txt = row['Consumer complaint narrative'] 
    if (keywords in txt):
        print(txt)
        print('------')

   is attempting to collect a debt from  that was discharged thru bankrutpcy  plus on top of that i never received any information about this debt before it hit my report  i would of told them it was no longer owed  thirdlly the original debt was in my mother  s name   
------
    is attempting to collect a debt from me that is not owed  they have not submitted any validation or proof of me having a writing contract with      and myself 
------
multiple calls between  and current from different numbers from portfolio recovery  told every single time not to call but to contact by mail  portfolio recovery has also called family members and told them they were attempting to collect a debt from me 
------
there is a company attempting to collect a debt from me  but they keep changing the dates as to when the debt was opened in an attempt to never have the debt removed from my credit report  they will pull the debt from my credit and then replace it with a new date that it was opened 
-----

# Jasen's scratchpad

In [32]:
df.Cluster.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
      dtype=int64)