In [None]:
import numpy as np
import pandas as pd
import pickle
import re, nltk, spacy, gensim
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from gensim.utils import simple_preprocess
from pprint import pprint
from nltk.corpus import stopwords
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

In [2]:
# Load the complaints dataset (cfpb-selected.csv, path relative to the notebook) into a DataFrame.
df = pd.read_csv('../data/cfpb-selected.csv')
# Show the first rows as the cell output for a quick sanity check.
df.head()

  and should_run_async(code)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/18/21,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,I previously asked the credit bureaus for a in...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,191XX,,Consent provided,Web,03/18/21,Closed with explanation,Yes,,4225218
1,03/23/21,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Investigation took more than 30 days,"XXXX, Experian and XXXX, FHA Resource Center, ...",Company has responded to the consumer and the ...,Experian Information Solutions Inc.,UT,840XX,,Consent provided,Web,03/23/21,Closed with explanation,Yes,,4239496
2,04/07/21,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Was not notified of investigation status or re...,On XX/XX/2021 sent a letter regarding inaccura...,,"EQUIFAX, INC.",FL,322XX,,Consent provided,Web,04/07/21,Closed with explanation,Yes,,4278158
3,04/09/21,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,I received an email about a decrease in my con...,,"EQUIFAX, INC.",GA,,,Consent provided,Web,04/09/21,Closed with explanation,Yes,,4283212
4,03/25/21,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,There is a fraudulent bank account that is rep...,Company has responded to the consumer and the ...,"Fidelity National Information Services, Inc. (...",NY,114XX,,Consent provided,Web,03/25/21,Closed with explanation,Yes,,4245653


In [3]:
# Count how many rows carry each distinct value of the 'Issue' column.
df['Issue'].value_counts()

  and should_run_async(code)


Incorrect information on your report                                                18175
Problem with a credit reporting company's investigation into an existing problem    14768
Improper use of your report                                                          2871
Attempts to collect debt not owed                                                    2564
Trouble during payment process                                                       2543
Managing an account                                                                  1967
Problem with a purchase shown on your statement                                      1644
Struggling to pay mortgage                                                            882
Closing on a mortgage                                                                 689
Unable to get your credit report or credit score                                      652
Fees or interest                                                                      620
Closing an

In [5]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49491 entries, 0 to 49490
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Date received                 49491 non-null  object 
 1   Product                       49491 non-null  object 
 2   Sub-product                   49491 non-null  object 
 3   Issue                         49491 non-null  object 
 4   Sub-issue                     49491 non-null  object 
 5   Consumer complaint narrative  49491 non-null  object 
 6   Company public response       49491 non-null  object 
 7   Company                       49491 non-null  object 
 8   State                         49491 non-null  object 
 9   ZIP code                      49491 non-null  object 
 10  Tags                          49491 non-null  object 
 11  Consumer consent provided?    49491 non-null  object 
 12  Submitted via                 49491 non-null  object 
 13  D

  and should_run_async(code)


In [6]:
# Column labels to discard; the drop below removes them column-wise and
# rebinds `df` to the narrower frame.
columns_to_drop = [
    'Date received',
    'Sub-issue',
    'Company public response',
    'State',
    'ZIP code',
    'Tags',
    'Consumer consent provided?',
    'Submitted via',
    'Company response to consumer',
    'Timely response?',
    'Consumer disputed?',
    'Complaint ID',
]
df = df.drop(columns=columns_to_drop)

  and should_run_async(code)


In [7]:
# Pull the complaint narratives out of the frame as a plain Python list for text preprocessing.
data = df['Consumer complaint narrative'].tolist()

  and should_run_async(code)


In [8]:
# Normalise every narrative, applying the steps in this order:
#   1. delete e-mail-like tokens (any non-space run containing '@', plus one
#      optional trailing whitespace character),
#   2. lower-case the text,
#   3. collapse each run of whitespace into a single space.
email_pattern = re.compile(r'\S*@\S*\s?')
whitespace_pattern = re.compile(r'\s+')
data = [
    whitespace_pattern.sub(' ', email_pattern.sub('', sent).lower())
    for sent in data
]

  and should_run_async(code)


In [9]:
# Tokenize
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (accent marks are stripped from
    the tokens). One list of tokens is yielded per input item, in input order.

    Note that ``sentences`` is iterated directly, so it should be a collection
    of texts rather than a single string.
    """
    token_lists = (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
    yield from token_lists


# Materialise the token lists for the whole corpus.
data_words = list(sent_to_words(data))

  and should_run_async(code)


In [10]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Corpus-specific additions: e-mail header words and the redaction
# placeholders ('xx', 'xxxx', ...) that appear in the narratives.
extra_stop_words = ['from', 'subject', 're', 'edu', 'use', 'xxxx', 'xx', 'xxxxxx', 'xxxxxxxx', 'xxxxx']

# Every distinct company name, lower-cased, in value_counts() order.
# NOTE(review): each name is added as ONE string. remove_stopwords compares
# individual tokens, so a multi-word company name only matches a token that is
# equal to the entire name -- confirm whether per-word filtering was intended.
company_stop_words = [name.lower() for name in df['Company'].value_counts().keys()]

# Final list: NLTK English stop words, then the extras, then the company names.
stop_words = stopwords.words('english') + extra_stop_words + company_stop_words

  and should_run_async(code)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Parameters
    ----------
    texts : iterable
        Documents to filter. Each one is converted with ``str()`` and
        re-tokenized with gensim's ``simple_preprocess`` before filtering, so
        both raw strings and token lists are accepted.

    Returns
    -------
    list of list of str
        One token list per input document, without any token that appears in
        the module-level ``stop_words`` collection.
    """
    # stop_words is a list holding the NLTK words plus every company name;
    # membership tests against a list are linear, so build a set once per call.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]


data_words = remove_stopwords(data_words)

  and should_run_async(code)


In [12]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents and keep only selected parts of speech.

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenized documents; the tokens of each document are joined with a
        single space before being handed to the module-level spaCy pipeline
        ``nlp`` (which must be defined before this function is called).
    allowed_postags : iterable of str
        Coarse part-of-speech tags (``token.pos_``) to keep. Defaults to
        nouns, adjectives, verbs and adverbs.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document. A lemma equal to
        the placeholder ``'-PRON-'`` is replaced by an empty string.
    """
    keep_pos = set(allowed_postags)
    texts_out = []
    # nlp.pipe processes the documents in batches, which is considerably
    # faster than calling nlp() once per document and yields them in order.
    for doc in nlp.pipe(" ".join(sent) for sent in texts):
        texts_out.append(" ".join(
            '' if token.lemma_ == '-PRON-' else token.lemma_
            for token in doc
            if token.pos_ in keep_pos
        ))
    return texts_out

  and should_run_async(code)


In [13]:
# Load spaCy's small English pipeline with the 'parser' and 'ner' components disabled;
# `lemmatization` reads this module-level `nlp` object.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Lemmatize the stop-word-filtered token lists, keeping only tokens tagged NOUN or VERB.
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN','VERB']) # one lemma string per document

  and should_run_async(code)


In [14]:
# Configure the bag-of-words vectorizer for the lemmatized documents.
vectorizer = CountVectorizer(analyzer='word',       # word-level features
                             min_df=10,             # ignore terms found in fewer than 10 documents
                             token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3+ ASCII letters/digits
                            )
# Learn the vocabulary and build the document-term count matrix in one step.
data_vectorized = vectorizer.fit_transform(data_lemmatized)

  and should_run_async(code)


In [15]:
# Parameter grid for the search. Each list holds a single value, so the grid
# contains exactly one candidate (30 topics, learning_decay 0.7).
search_params = {'n_components': [30], 'learning_decay': [0.7]}
# Base LDA estimator: online learning, 5 iterations, fixed random_state.
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Wrap the estimator in a grid search over search_params; no cv/scoring arguments
# are passed, so GridSearchCV's defaults apply.
model = GridSearchCV(lda, param_grid=search_params)
# Run the search on the document-term matrix; the fitted search object is the cell output.
model.fit(data_vectorized)

  and should_run_async(code)


GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.7], 'n_components': [30]})

In [16]:
# Estimator refitted by the grid search with the best parameter combination.
best_lda_model = model.best_estimator_
# Parameter combination selected by the search.
print("Best Model's Params: ", model.best_params_)
# Best score reported by the grid search (printed here as the log-likelihood score).
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity of the best model, evaluated on the same matrix the search was fitted on.
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

  and should_run_async(code)


Best Model's Params:  {'learning_decay': 0.7, 'n_components': 30}
Best Log Likelihood Score:  -3270141.410222464
Model Perplexity:  331.40886003844554


In [17]:
# Topic scores of every training document (one row per document, one column per topic).
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(data))]
# Document-topic table, rounded to two decimals for display.
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Pick the dominant topic from the UNROUNDED scores: after rounding, near-ties
# collapse to equal values and argmax would return the lowest-numbered topic
# instead of the true maximum.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic


def color_green(val):
    """Return a CSS colour rule: green when ``val`` exceeds 0.1, black otherwise."""
    color = 'green' if val > .1 else 'black'
    return "color: {col}".format(col=color)


def make_bold(val):
    """Return a CSS font-weight rule: 700 when ``val`` exceeds 0.1, 400 otherwise."""
    weight = 700 if val > .1 else 400
    return "font-weight: {weight}".format(weight=weight)


# Style only the topic-score columns; the 0.1 threshold is meaningless for
# the integer topic id stored in 'dominant_topic'.
df_document_topics = (
    df_document_topic.head(15)
    .style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics

  and should_run_async(code)


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,dominant_topic
Doc0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.23,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.38,0.0,0.0,0.0,26
Doc1,0.06,0.0,0.06,0.23,0.0,0.07,0.0,0.0,0.0,0.01,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.18,0.2,0.0,0.0,0.05,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,3
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
Doc3,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.03,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.04,16
Doc4,0.0,0.0,0.03,0.0,0.0,0.0,0.04,0.0,0.03,0.0,0.0,0.0,0.08,0.0,0.0,0.1,0.4,0.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.14,0.05,16
Doc5,0.03,0.14,0.0,0.0,0.07,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0.21,0.0,8
Doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.0,0.11,19
Doc7,0.05,0.0,0.0,0.0,0.1,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11,5
Doc8,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0.62,0.0,28
Doc9,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.03,0.0,0.0,0.37,0.0,0.0,0.0,0.0,0.0,0.05,0.11,0.23,0.0,0.01,0.0,0.0,0.09,0.0,0.01,0.0,0.08,0.01,11


In [18]:
# Number of documents per dominant topic, most frequent topic first.
topic_counts = df_document_topic['dominant_topic'].value_counts()
df_topic_distribution = pd.DataFrame({
    'Topic Num': topic_counts.index.values,
    'Num Documents': topic_counts.values,
})
df_topic_distribution

  and should_run_async(code)


Unnamed: 0,Topic Num,Num Documents
0,28,11312
1,18,6578
2,8,3636
3,11,3308
4,7,3151
5,10,2710
6,21,2175
7,16,2084
8,15,2069
9,5,1725


In [19]:
def show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary, in the column order of ``components_``.
    lda_model : fitted LDA model
        Its ``components_`` attribute provides one weight row per topic.
    n_words : int
        Number of top terms to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered by decreasing weight.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # use its replacement when available and fall back for older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so argsort orders the weights from largest to smallest.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords


topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

  and should_run_async(code)


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,complaint,file,bankruptcy,cfpb,court,attorney,include,submit,union,case,continue,state,attach,matter,company
Topic 1,information,letter,verify,request,credit,advise,send,process,investigation,receive,dispute,contact,bureaus,remove,submit
Topic 2,company,respond,require,dept,day,comply,law,report,failure,prevent,demand,complaint,record,agreement,regulation
Topic 3,fraud,live,scam,pnc,seller,apartment,rent,child,commit,wife,person,cell,mastercard,pin,protect
Topic 4,provide,receive,dispute,email,service,refund,request,bank,claim,contact,return,issue,date,customer,purchase
Topic 5,mortgage,loan,pay,insurance,escrow,home,company,property,refinance,tax,taxis,amount,rate,lender,closing
Topic 6,well,fargo,account,acct,open,charge,request,balance,receive,make,amount,dispute,credit,act,write
Topic 7,day,item,credit,report,dispute,file,receive,delete,regard,datum,information,response,demand,investigation,investigate
Topic 8,debt,collection,account,provide,report,credit,contract,company,signature,request,proof,allege,creditor,owe,validation
Topic 9,rule,people,citizen,release,recall,protect,group,divorce,evidence,research,burden,select,record,reinstate,reveal


In [20]:
# Topic-by-term weight matrix taken from the fitted model.
# NOTE(review): this rebinds `df_topic_keywords`, replacing the top-keyword
# table built in the previous cell; code that runs after this cell sees weights.
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back for older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    df_topic_keywords.columns = vectorizer.get_feature_names_out()
else:
    df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

  and should_run_async(code)


Unnamed: 0,aargon,abandon,abide,ability,able,absence,absolve,abundance,abuse,abuser,...,yard,year,yell,yesterday,yield,york,yrs,zero,zip,zone
Topic0,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,77.09344,6.429104,...,0.033333,318.977602,0.033333,0.033333,0.033333,92.114078,0.033333,0.033333,0.033333,0.033333
Topic1,0.033333,0.033333,0.03334,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033343,0.033333,0.033333,0.033333,0.033333
Topic2,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333
Topic3,0.033333,26.294625,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033335,0.033333,0.033333,0.033333
Topic4,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,1.902979,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,32.059986,0.033334


In [None]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def predict_topic(text, nlp=nlp):
    """Infer the dominant topic of new text with the fitted LDA model.

    The text goes through the same steps as the training corpus: tokenization,
    stop-word removal, NOUN/VERB lemmatization and vectorization.

    Parameters
    ----------
    text : str or iterable of str
        A single document, or a collection of documents. A bare string is
        treated as one document.
    nlp : spaCy pipeline
        Kept for backward compatibility; ``lemmatization`` reads the
        module-level ``nlp`` object, so this argument is not used here.

    Returns
    -------
    topic : list of str
        Top keywords (from the module-level ``topic_keywords``) of the dominant
        topic of the first document.
    topic_probability_scores : numpy.ndarray
        Topic scores returned by the model, one row per document.

    Raises
    ------
    ValueError
        If ``text`` contains no documents.
    """
    # sent_to_words iterates its argument, so a bare string would be split
    # into single characters and produce no tokens at all.
    sentences = [text] if isinstance(text, str) else list(text)
    if not sentences:
        raise ValueError("predict_topic needs at least one document")
    # Step 1: Clean with simple_preprocess
    tokens = list(sent_to_words(sentences))
    # Step 2: Remove stop words, as was done before fitting the vectorizer
    tokens = remove_stopwords(tokens)
    # Step 3: Lemmatize with the same POS filter used for the training corpus
    lemmatized = lemmatization(tokens, allowed_postags=['NOUN', 'VERB'])
    # Step 4: Vectorize transform
    doc_term = vectorizer.transform(lemmatized)
    # Step 5: LDA Transform
    topic_probability_scores = best_lda_model.transform(doc_term)
    # argmax over the first row only: on the flattened 2-D array the index
    # would not be a topic number when several documents are passed.
    topic_idx = int(np.argmax(topic_probability_scores[0]))
    # Read the keywords from topic_keywords (built by show_topics); by this
    # point df_topic_keywords holds the topic-term weight matrix, not words.
    topic = np.asarray(topic_keywords[topic_idx]).tolist()
    return topic, topic_probability_scores
# Predict the topic
mytext = 'My complaint is with Carrington Mortgage Services. The company is charging property inspection payments of XXXX along with mortgage of XXXX the dates are from XX/XX/XXXX until present, also attorney fees. This makes my payments double. Carrington is also Charges a fee of XXXX unknown.'
topic, prob_scores = predict_topic(text = mytext)
print(topic)

In [21]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the fitted model, the document-term matrix and
# the vectorizer; 'tsne' is passed as the multidimensional-scaling method.
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
# Display the prepared panel as the cell output.
panel

  and should_run_async(code)


In [None]:
from sklearn.metrics.pairwise import euclidean_distances
def similar_documents(text, doc_topic_probs, documents = data, nlp=nlp, top_n=5, verbose=False):
    """Find the documents whose topic scores are closest to those of ``text``.

    Parameters
    ----------
    text : passed unchanged to ``predict_topic``.
    doc_topic_probs : array of topic scores, one row per candidate document.
    documents : collection the matching documents are taken from; indexed by
        the same positions as the rows of ``doc_topic_probs``.
    nlp : accepted but not used by this function.
    top_n : number of nearest documents to return.
    verbose : when true, print the topic keywords and the score vectors.

    Returns
    -------
    (doc_ids, docs) : positions of the ``top_n`` nearest documents (closest
        first, by Euclidean distance) and the corresponding entries of
        ``documents``.
    """
    topic, x = predict_topic(text)
    # Flatten the query scores into a single row before measuring distances.
    query_row = x.reshape(1, -1)
    distances = euclidean_distances(query_row, doc_topic_probs)[0]
    nearest_ids = np.argsort(distances)[:top_n]
    if verbose:
        print("Topic KeyWords: ", topic)
        print("Topic Prob Scores of text: ", np.round(x, 1))
        print("Most Similar Doc's Probs:  ", np.round(doc_topic_probs[nearest_ids], 1))
    matched_documents = np.take(documents, nearest_ids)
    return nearest_ids, matched_documents

In [None]:
# Query text, wrapped in a list so that it is iterated as one document.
mytext = ['I have an issue with closing my bank account']
# Look up the five training documents whose topic scores are closest to the query's.
doc_ids, docs = similar_documents(text=mytext, doc_topic_probs=lda_output, documents = data, top_n=5, verbose=True)
# Show the first 500 characters of the closest match.
print('\n', docs[0][:500])