In [58]:
import numpy as np
import pandas as pd
import pickle
import re, nltk, spacy, gensim
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from gensim.utils import simple_preprocess
from pprint import pprint
from sklearn.metrics.pairwise import euclidean_distances
from nltk.corpus import stopwords

  and should_run_async(code)


In [2]:
dataset = pd.read_csv('../data/mortgage.csv')
dataset.head()

  and should_run_async(code)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/28/19,Mortgage,Conventional home mortgage,Struggling to pay mortgage,,"In dispute of the loan # XXXX, for XXXX XXXX t...",Company believes it acted appropriately as aut...,"SELECT PORTFOLIO SERVICING, INC.",GA,300XX,,Consent provided,Web,03/28/19,Closed with explanation,Yes,,3193709
1,02/21/19,Mortgage,FHA mortgage,Struggling to pay mortgage,,My complaint is with Carrington Mortgage Servi...,,"CARRINGTON MORTGAGE SERVICES, LLC",GA,312XX,,Consent provided,Web,02/21/19,Closed with explanation,Yes,,3158652
2,06/11/19,Mortgage,Home equity loan or line of credit (HELOC),Trouble during payment process,,As of XX/XX/2019 PHH Mortgage Services purchas...,,PHH Mortgage Services Corporation,FL,347XX,,Consent provided,Web,06/11/19,Closed with explanation,Yes,,3270600
3,11/29/19,Mortgage,Conventional home mortgage,Trouble during payment process,,We started our mortgage with Loan Depot in XX/...,Company believes complaint is the result of an...,"LD Holdings Group, LLC",NJ,,,Consent provided,Web,11/29/19,Closed with monetary relief,Yes,,3453669
4,10/23/19,Mortgage,Conventional home mortgage,Struggling to pay mortgage,,"XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX ...",,"Community Loan Servicing, LLC (formerly known ...",CA,92021,,Consent provided,Web,11/05/19,Closed with explanation,Yes,,3415392


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32805 entries, 0 to 32804
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Date received                 32805 non-null  object 
 1   Product                       32805 non-null  object 
 2   Sub-product                   32805 non-null  object 
 3   Issue                         32805 non-null  object 
 4   Sub-issue                     32805 non-null  object 
 5   Consumer complaint narrative  32805 non-null  object 
 6   Company public response       32805 non-null  object 
 7   Company                       32805 non-null  object 
 8   State                         32805 non-null  object 
 9   ZIP code                      32805 non-null  object 
 10  Tags                          32805 non-null  object 
 11  Consumer consent provided?    32805 non-null  object 
 12  Submitted via                 32805 non-null  object 
 13  D

  and should_run_async(code)


In [4]:
dataset['Issue'].value_counts()

  and should_run_async(code)


Trouble during payment process                                                      15884
Struggling to pay mortgage                                                           7092
Applying for a mortgage or refinancing an existing mortgage                          5085
Closing on a mortgage                                                                3615
Incorrect information on your report                                                  677
Problem with a credit reporting company's investigation into an existing problem      335
Improper use of your report                                                            75
Unable to get your credit report or credit score                                       19
Credit monitoring or identity theft protection services                                16
Problem with fraud alerts or security freezes                                           7
Name: Issue, dtype: int64

In [22]:
dataset = dataset.drop(['Date received', 'Sub-issue', 'Company public response', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID'], axis=1)

  and should_run_async(code)


## Data Pre-processing

In [5]:
data = dataset['Consumer complaint narrative'].tolist()

  and should_run_async(code)


In [6]:
# Remove emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# lowecase
data = [sent.lower() for sent in data] #lowecase
# Remove newline 
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

  and should_run_async(code)


#### Tokenization

In [8]:
nltk.download("punkt")

tokenized_data = []
token_data = [nltk.word_tokenize(words) for words in data]
for words in token_data:
    new_words = [word for word in words if word.isalnum()]
    tokenized_data.append(new_words)

  and should_run_async(code)
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
#tokenized_data = list(sent_to_words(data))

  and should_run_async(code)


##### Stop words

In [9]:
#STOP WORDS
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'xxxx', 'xx', 'well', 'fargo', 'citibank', 'xxxxxx', 'xxxxxxxx', 'cooper', 'wells'])
print(len(stop_words))
for key, value in dataset['Company'].value_counts().items():
    if value > 1000:
        word = key.lower().split(' ')
        for x in word:
            if len(x) > 1:
                stop_words.extend(x)
print(len(stop_words))

193
406


  and should_run_async(code)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
tokenized_data = remove_stopwords(tokenized_data)

  and should_run_async(code)


##### Pos tags

In [11]:
nltk.download('averaged_perceptron_tagger')
tagged_data = []
for token in tokenized_data:
    tagged_data.append(nltk.pos_tag(token))
#tagged = nltk.pos_tag(tokenized_data)
#singular_token = [word for word, tag in tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS', 'PRP')]

  and should_run_async(code)
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
post_tag_words = []
for tagged in tagged_data:
    #Retaining only Nouns
    data_words = [word for word, tag in tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS', 'PRP')]
    post_tag_words.append(data_words)

  and should_run_async(code)


In [13]:
def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]
post_tag_new = []
for word in post_tag_words:
    word = remove_values_from_list(word, 'xxxx')
    post_tag_new.append(word)

  and should_run_async(code)


#### Lemmatization

In [17]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(post_tag_new, allowed_postags=['NOUN']) #select noun

  and should_run_async(code)


NameError: name 'post_tag_new' is not defined

#### Vectorization

In [15]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10, 
                             token_pattern='[a-zA-Z0-9]{3,}',  
                             max_features=50000
                            )
data_vectorized = vectorizer.fit_transform(data_lemmatized)

  and should_run_async(code)


## MODEL

In [16]:
search_params = {'n_components': [10], 'learning_decay': [0.7]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)

  and should_run_async(code)


GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.7], 'n_components': [10]})

In [17]:
# Best LDA model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

  and should_run_async(code)


Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -2344243.0761714242
Model Perplexity:  330.7983257088824


## Saving files

In [None]:
with open('../runs/run3-best/data', 'wb') as f:
    pickle.dump(data, f)

In [18]:
with open('../runs/run3-best/tokenized_data', 'wb') as f:
    pickle.dump(tokenized_data, f)

  and should_run_async(code)


In [19]:
with open('../runs/run3-best/pos_tags', 'wb') as f:
    pickle.dump(post_tag_new, f)

  and should_run_async(code)


In [20]:
with open('../runs/run3-best/data_lemmatized', 'wb') as f:
    pickle.dump(data_lemmatized, f)

  and should_run_async(code)


In [21]:
with open('../runs/run3-best/data_vectorized', 'wb') as f:
    pickle.dump(data_vectorized, f)

  and should_run_async(code)


In [22]:
with open('../runs/run3-best/vectorizer', 'wb') as f:
    pickle.dump(vectorizer, f)

  and should_run_async(code)


In [23]:
with open('../runs/run3-best/model', 'wb') as f:
    pickle.dump(best_lda_model, f)

  and should_run_async(code)


In [24]:
with open('../runs/run3-best/grid-model', 'wb') as f:
    pickle.dump(model, f)

  and should_run_async(code)


### Load files

In [9]:
with open('../runs/run3-best/pos_tags', 'rb') as f:
    pos_tags = pickle.load(f)

  and should_run_async(code)


In [10]:
with open('../runs/run3-best/data_lemmatized', 'rb') as f:
    data_lemmatized = pickle.load(f)

  and should_run_async(code)


In [11]:
with open('../runs/run3-best/data_vectorized', 'rb') as f:
    data_vectorized = pickle.load(f)

  and should_run_async(code)


In [12]:
with open('../runs/run3-best/vectorizer', 'rb') as f:
    vectorizer = pickle.load(f)

  and should_run_async(code)


In [13]:
with open('../runs/run3-best/model', 'rb') as f:
    best_lda_model = pickle.load(f)

  and should_run_async(code)


In [14]:
with open('../runs/run3-best/grid-model', 'rb') as f:
    grid_model = pickle.load(f)

  and should_run_async(code)


## Saving over

In [15]:
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(data))]
#doc id
docid = dataset['Complaint ID']
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docid)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return "color: {col}".format(col=color)
def make_bold(val):
    weight = 700 if val > .1 else 400
    return "font-weight: {weight}".format(weight=weight)

df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

  and should_run_async(code)


Unnamed: 0_level_0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Complaint ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3193709,0.0,0.0,0.04,0.03,0.6,0.03,0.0,0.01,0.03,0.27,4
3158652,0.01,0.24,0.01,0.19,0.35,0.01,0.01,0.01,0.01,0.19,4
3270600,0.25,0.0,0.09,0.21,0.18,0.0,0.17,0.05,0.05,0.0,0
3453669,0.0,0.08,0.0,0.43,0.0,0.19,0.0,0.28,0.0,0.0,3
3415392,0.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.39,0
3237767,0.04,0.0,0.0,0.0,0.0,0.28,0.0,0.65,0.0,0.0,7
3605629,0.05,0.24,0.17,0.0,0.0,0.08,0.0,0.0,0.0,0.46,9
3940465,0.0,0.19,0.0,0.42,0.12,0.0,0.0,0.26,0.0,0.0,3
3478699,0.01,0.01,0.01,0.01,0.12,0.01,0.16,0.66,0.01,0.01,7
3316113,0.23,0.65,0.1,0.0,0.0,0.0,0.03,0.0,0.0,0.0,1


In [160]:
with open('../runs/run3-best/lda_output', 'wb') as f:
    pickle.dump(lda_output, f)

  and should_run_async(code)


In [26]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

  and should_run_async(code)


Unnamed: 0,Topic Num,Num Documents
0,3,7119
1,0,5986
2,5,4430
3,1,4078
4,7,3902
5,6,2196
6,9,1975
7,4,1326
8,8,1001
9,2,792


In [27]:
def show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

  and should_run_async(code)


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,time,call,day,company,phone,mortgage,email,information,number,letter,check,week,customer,hour,manager
Topic 1,loan,modification,home,payment,foreclosure,mortgage,sale,time,letter,document,year,application,date,servicer,process
Topic 2,bank,law,court,chase,state,bankruptcy,property,case,attorney,consumer,violation,action,note,debt,act
Topic 3,payment,mortgage,month,account,bank,amount,time,company,loan,money,pay,check,day,fund,balance
Topic 4,fee,account,statement,service,customer,charge,loan,amount,letter,date,balance,number,information,complaint,error
Topic 5,loan,home,closing,process,document,application,appraisal,lender,mortgage,time,day,officer,value,cost,date
Topic 6,credit,loan,rate,mortgage,interest,report,refinance,payoff,year,score,time,bank,company,freedom,day
Topic 7,insurance,escrow,taxis,mortgage,tax,property,company,account,policy,amount,year,pay,payment,letter,bill
Topic 8,forbearance,mortgage,plan,month,program,income,option,assistance,sps,care,portfolio,period,borrower,end,hardship
Topic 9,mortgage,loan,document,property,home,name,company,complaint,attorney,information,title,note,fraud,copy,trust


In [112]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel

  and should_run_async(code)


In [24]:
def predict_topic(text, nlp=nlp, model=model):
    text_1 = [nltk.word_tokenize(txt) for txt in text] 
    text_2 = lemmatization(text_1, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    text_3 = vectorizer.transform(text_2)
    topic_probability_scores = model.transform(text_3)
    topic_probability_scores = np.round(topic_probability_scores, 1)
    topic_probability_scores = topic_probability_scores.tolist()
    return topic_probability_scores

  and should_run_async(code)


(1, 10)


In [None]:
topic_mapping = {
        0: 'Customer Support Complaint : time,call,day,company,phone,mortgage',
        1: 'Foreclosure Complaint : loan,modification,home,payment,foreclosure,mortgage',
        2: 'Legal Complaint : banklaw,court,chase,state,bankruptcy',
        3: 'Payment Processing Complaint : payment,mortgage,month,account,bank,amount	',
        4: 'Account Statement Complaint : fee,account,statement,service,customer,charge',
        5: 'Mortgage Closing Complaint : loan,home,closing,process,document,application',
        6: 'Refinance Complaint : credit,loan,rate,interest,report,refinance', 
        7: 'Escrow Complaint : insurance,escrow,mortgage,tax,property,company',
        8: 'Forbearance Complaint : forbearance,mortgage,plan,month,program,income',
        9: 'Fraud/Cheating Complaint : loan,document,property,home,name,fraud',
    }