In [1]:
import numpy as np
import pandas as pd
import pickle
import re, nltk, spacy, gensim
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from gensim.utils import simple_preprocess
from pprint import pprint
from sklearn.metrics.pairwise import euclidean_distances
from nltk.corpus import stopwords

In [3]:
dataset = pd.read_csv('../data/cfpb-selected.csv')
dataset.head()

  and should_run_async(code)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/18/21,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,I previously asked the credit bureaus for a in...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,191XX,,Consent provided,Web,03/18/21,Closed with explanation,Yes,,4225218
1,03/23/21,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Investigation took more than 30 days,"XXXX, Experian and XXXX, FHA Resource Center, ...",Company has responded to the consumer and the ...,Experian Information Solutions Inc.,UT,840XX,,Consent provided,Web,03/23/21,Closed with explanation,Yes,,4239496
2,04/07/21,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Was not notified of investigation status or re...,On XX/XX/2021 sent a letter regarding inaccura...,,"EQUIFAX, INC.",FL,322XX,,Consent provided,Web,04/07/21,Closed with explanation,Yes,,4278158
3,04/09/21,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,I received an email about a decrease in my con...,,"EQUIFAX, INC.",GA,,,Consent provided,Web,04/09/21,Closed with explanation,Yes,,4283212
4,03/25/21,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,There is a fraudulent bank account that is rep...,Company has responded to the consumer and the ...,"Fidelity National Information Services, Inc. (...",NY,114XX,,Consent provided,Web,03/25/21,Closed with explanation,Yes,,4245653


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49491 entries, 0 to 49490
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Date received                 49491 non-null  object 
 1   Product                       49491 non-null  object 
 2   Sub-product                   49491 non-null  object 
 3   Issue                         49491 non-null  object 
 4   Sub-issue                     49491 non-null  object 
 5   Consumer complaint narrative  49491 non-null  object 
 6   Company public response       49491 non-null  object 
 7   Company                       49491 non-null  object 
 8   State                         49491 non-null  object 
 9   ZIP code                      49491 non-null  object 
 10  Tags                          49491 non-null  object 
 11  Consumer consent provided?    49491 non-null  object 
 12  Submitted via                 49491 non-null  object 
 13  D

  and should_run_async(code)


In [5]:
dataset['Issue'].value_counts()

  and should_run_async(code)


Incorrect information on your report                                                18175
Problem with a credit reporting company's investigation into an existing problem    14768
Improper use of your report                                                          2871
Attempts to collect debt not owed                                                    2564
Trouble during payment process                                                       2543
Managing an account                                                                  1967
Problem with a purchase shown on your statement                                      1644
Struggling to pay mortgage                                                            882
Closing on a mortgage                                                                 689
Unable to get your credit report or credit score                                      652
Fees or interest                                                                      620
Closing an

In [6]:
dataset = dataset.drop(['Date received', 'Sub-issue', 'Company public response', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID'], axis=1)

  and should_run_async(code)


## Pre-processing

In [7]:
data = dataset['Consumer complaint narrative'].tolist()

  and should_run_async(code)


In [8]:
# Remove emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# lowecase
data = [sent.lower() for sent in data] #lowecase
# Remove newline 
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

  and should_run_async(code)


#### Tokenization

In [9]:
nltk.download("punkt")

tokenized_data = []
token_data = [nltk.word_tokenize(words) for words in data]
for words in token_data:
    new_words = [word for word in words if word.isalnum()]
    tokenized_data.append(new_words)
#ef sent_to_words(sentences):
#    for sentence in sentences:
#        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
#tokenized_data = list(sent_to_words(data))

  and should_run_async(code)
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### Stop words

In [10]:
#STOP WORDS
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'xxxx', 'xx', 'well', 'fargo', 'citibank', 'xxxxxx', 'xxxxxxxx'])
print(len(stop_words))
for key, value in dataset['Company'].value_counts().items():
    if value > 1000:
        word = key.lower().split(' ')
        for x in word:
            if len(x) > 1:
                stop_words.extend(x)
print(len(stop_words))

191
346


  and should_run_async(code)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
tokenized_data = remove_stopwords(tokenized_data)

  and should_run_async(code)


##### Pos tags

In [12]:
nltk.download('averaged_perceptron_tagger')
tagged_data = []
for token in tokenized_data:
    tagged_data.append(nltk.pos_tag(token))
#tagged = nltk.pos_tag(tokenized_data)
#singular_token = [word for word, tag in tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS', 'PRP')]

  and should_run_async(code)
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
post_tag_words = []
for tagged in tagged_data:
    data_words = [word for word, tag in tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS', 'PRP')]
    post_tag_words.append(data_words)

  and should_run_async(code)


In [14]:
def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]
post_tag_new = []
for word in post_tag_words:
    word = remove_values_from_list(word, 'xxxx')
    post_tag_new.append(word)

  and should_run_async(code)


#### Lemmatization

In [15]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(post_tag_new, allowed_postags=['NOUN','VERB']) #select noun and verb

  and should_run_async(code)


### Vectorization

In [16]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10, 
                             token_pattern='[a-zA-Z0-9]{3,}',  
                             max_features=50000
                            )
data_vectorized = vectorizer.fit_transform(data_lemmatized)

  and should_run_async(code)


## MODEL

In [22]:
search_params = {'n_components': [20, 25, 30], 'learning_decay': [0.7]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)

  and should_run_async(code)


GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.7], 'n_components': [20, 25, 30]})

In [23]:
# Best LDA model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

  and should_run_async(code)


Best Model's Params:  {'learning_decay': 0.7, 'n_components': 20}
Best Log Likelihood Score:  -1863778.484470766
Model Perplexity:  222.54274291732986


In [24]:
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(data))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return "color: {col}".format(col=color)
def make_bold(val):
    weight = 700 if val > .1 else 400
    return "font-weight: {weight}".format(weight=weight)

df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

  and should_run_async(code)


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic
Doc0,0.0,0.17,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.19,0.0,0.0,0.0,0.34,0.0,0.19,0.0,0.0,0.0,0.0,13
Doc1,0.11,0.0,0.08,0.0,0.16,0.03,0.0,0.0,0.0,0.0,0.54,0.0,0.0,0.04,0.02,0.0,0.0,0.0,0.0,0.0,10
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
Doc3,0.0,0.11,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.74,0.0,0.0,0.0,0.0,0.0,0.0,13
Doc4,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.16,0.0,0.0,0.0,0.54,0.0,0.0,0.0,0.0,0.0,0.0,13
Doc5,0.18,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.46,0.0,0.29,0.0,0.0,0.0,0.0,13
Doc6,0.01,0.01,0.01,0.01,0.53,0.01,0.01,0.34,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,4
Doc7,0.0,0.0,0.0,0.6,0.04,0.0,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,3
Doc8,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.0,1
Doc9,0.06,0.17,0.0,0.0,0.0,0.0,0.0,0.06,0.11,0.04,0.04,0.15,0.09,0.0,0.0,0.0,0.0,0.24,0.04,0.0,17


In [25]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

  and should_run_async(code)


Unnamed: 0,Topic Num,Num Documents
0,1,11799
1,13,6107
2,6,4726
3,0,3967
4,7,3716
5,2,3003
6,4,2902
7,14,2845
8,16,1871
9,8,1824


In [26]:
def show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

  and should_run_async(code)


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,account,date,number,name,balance,company,report,creditor,credit,collection,status,violation,mine,reason,record
Topic 1,credit,report,account,information,bureaus,score,letter,reporting,company,time,dispute,bureau,file,bankruptcy,equifax
Topic 2,identity,theft,report,victim,account,credit,police,collection,information,item,agency,fraud,block,trade,affidavit
Topic 3,company,mortgage,insurance,year,property,escrow,service,tax,time,customer,taxis,amount,policy,settlement,pay
Topic 4,fee,time,month,call,phone,pay,day,bill,payment,service,car,point,care,week,customer
Topic 5,transunion,complaint,request,reporting,investigation,information,act,respond,dispute,law,attorney,violation,response,agency,protection
Topic 6,day,item,credit,dispute,report,letter,response,investigation,information,result,account,datum,demand,section,regard
Topic 7,bank,account,check,money,fund,transaction,day,customer,chase,capital,time,deposit,refund,access,amount
Topic 8,card,credit,fraud,charge,purchase,number,company,service,merchant,year,security,claim,name,address,application
Topic 9,people,help,thank,family,company,complaint,time,situation,way,support,month,store,matter,reason,submit


In [88]:
# Save best model
pickle.dump(model.best_estimator_, open('../new_preprocessing_models/best-n10-p315', 'wb'))
# Save grid
pickle.dump(model, open('../new_preprocessing_models/grid-n10-p315', 'wb'))

  and should_run_async(code)
