In [1]:
import numpy as np
import pandas as pd
import pickle
import re, nltk, spacy, gensim
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from gensim.utils import simple_preprocess
from pprint import pprint
from sklearn.metrics.pairwise import euclidean_distances
from nltk.corpus import stopwords

In [2]:
dataset = pd.read_csv('../data/mortgage.csv')
dataset.head()

  and should_run_async(code)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/28/19,Mortgage,Conventional home mortgage,Struggling to pay mortgage,,"In dispute of the loan # XXXX, for XXXX XXXX t...",Company believes it acted appropriately as aut...,"SELECT PORTFOLIO SERVICING, INC.",GA,300XX,,Consent provided,Web,03/28/19,Closed with explanation,Yes,,3193709
1,02/21/19,Mortgage,FHA mortgage,Struggling to pay mortgage,,My complaint is with Carrington Mortgage Servi...,,"CARRINGTON MORTGAGE SERVICES, LLC",GA,312XX,,Consent provided,Web,02/21/19,Closed with explanation,Yes,,3158652
2,06/11/19,Mortgage,Home equity loan or line of credit (HELOC),Trouble during payment process,,As of XX/XX/2019 PHH Mortgage Services purchas...,,PHH Mortgage Services Corporation,FL,347XX,,Consent provided,Web,06/11/19,Closed with explanation,Yes,,3270600
3,11/29/19,Mortgage,Conventional home mortgage,Trouble during payment process,,We started our mortgage with Loan Depot in XX/...,Company believes complaint is the result of an...,"LD Holdings Group, LLC",NJ,,,Consent provided,Web,11/29/19,Closed with monetary relief,Yes,,3453669
4,10/23/19,Mortgage,Conventional home mortgage,Struggling to pay mortgage,,"XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, XXXX ...",,"Community Loan Servicing, LLC (formerly known ...",CA,92021,,Consent provided,Web,11/05/19,Closed with explanation,Yes,,3415392


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32805 entries, 0 to 32804
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Date received                 32805 non-null  object 
 1   Product                       32805 non-null  object 
 2   Sub-product                   32805 non-null  object 
 3   Issue                         32805 non-null  object 
 4   Sub-issue                     32805 non-null  object 
 5   Consumer complaint narrative  32805 non-null  object 
 6   Company public response       32805 non-null  object 
 7   Company                       32805 non-null  object 
 8   State                         32805 non-null  object 
 9   ZIP code                      32805 non-null  object 
 10  Tags                          32805 non-null  object 
 11  Consumer consent provided?    32805 non-null  object 
 12  Submitted via                 32805 non-null  object 
 13  D

  and should_run_async(code)


In [4]:
dataset['Issue'].value_counts()

  and should_run_async(code)


Trouble during payment process                                                      15884
Struggling to pay mortgage                                                           7092
Applying for a mortgage or refinancing an existing mortgage                          5085
Closing on a mortgage                                                                3615
Incorrect information on your report                                                  677
Problem with a credit reporting company's investigation into an existing problem      335
Improper use of your report                                                            75
Unable to get your credit report or credit score                                       19
Credit monitoring or identity theft protection services                                16
Problem with fraud alerts or security freezes                                           7
Name: Issue, dtype: int64

In [5]:
dataset = dataset.drop(['Date received', 'Sub-issue', 'Company public response', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID'], axis=1)

  and should_run_async(code)


## Pre-processing

In [6]:
data = dataset['Consumer complaint narrative'].tolist()

  and should_run_async(code)


In [7]:
# Remove emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# lowecase
data = [sent.lower() for sent in data] #lowecase
# Remove newline 
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove single quotes
data = [re.sub("\'", "", sent) for sent in data]

  and should_run_async(code)


#### Tokenization

In [8]:
nltk.download("punkt")
tokenized_data = []
token_data = [nltk.word_tokenize(words) for words in data]
for words in token_data:
    new_words = [word for word in words if word.isalnum()]
    tokenized_data.append(new_words)
#def sent_to_words(sentences):
#    for sentence in sentences:
#        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
#tokenized_data = list(sent_to_words(data))

  and should_run_async(code)
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
#tokenized_data = list(sent_to_words(data))

  and should_run_async(code)


##### Stop words

In [9]:
#STOP WORDS
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'xxxx', 'xx', 'well', 'fargo', 'citibank', 'xxxxxx', 'xxxxxxxx', 'cooper', 'mortgage'])
for key, value in dataset['Company'].value_counts().items():
    if value > 1000:
        word = key.lower().split(' ')
        for x in word:
            if len(x) > 1:
                stop_words.extend(x)

193


  and should_run_async(code)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
tokenized_data = remove_stopwords(tokenized_data)

  and should_run_async(code)


In [11]:
with open('../run1/tokenized_data', 'wb') as f:
    pickle.dump(tokenized_data, f)

  and should_run_async(code)


##### Pos tags

In [12]:
nltk.download('averaged_perceptron_tagger')
tagged_data = []
for token in tokenized_data:
    tagged_data.append(nltk.pos_tag(token))
#tagged = nltk.pos_tag(tokenized_data)
#singular_token = [word for word, tag in tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS', 'PRP')]

  and should_run_async(code)
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
post_tag_words = []
for tagged in tagged_data:
    data_words = [word for word, tag in tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS', 'PRP')]
    post_tag_words.append(data_words)

  and should_run_async(code)


In [14]:
def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]
post_tag_new = []
for word in post_tag_words:
    word = remove_values_from_list(word, 'xxxx')
    post_tag_new.append(word)

  and should_run_async(code)


In [15]:
with open('../run1/pos_tags', 'wb') as f:
    pickle.dump(post_tag_new, f)

  and should_run_async(code)


#### Lemmatization

In [16]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(post_tag_new, allowed_postags=['NOUN','VERB']) #select noun and verb

  and should_run_async(code)


In [17]:
with open('../run1/data_lemmatized', 'wb') as f:
    pickle.dump(data_lemmatized, f)

  and should_run_async(code)


#### Vectorization

In [18]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10, 
                             token_pattern='[a-zA-Z0-9]{3,}',  
                             max_features=50000
                            )
data_vectorized = vectorizer.fit_transform(data_lemmatized)

  and should_run_async(code)


In [19]:
with open('../run1/data_vectorized', 'wb') as f:
    pickle.dump(data_vectorized, f)

  and should_run_async(code)


In [20]:
with open('../run1/vectorizer', 'wb') as f:
    pickle.dump(vectorizer, f)

  and should_run_async(code)


## MODEL

In [21]:
search_params = {'n_components': [10], 'learning_decay': [0.7]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search ClassA
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)

  and should_run_async(code)


GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.7], 'n_components': [10]})

In [22]:
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized)) 

  and should_run_async(code)


Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -2300100.998591577
Model Perplexity:  352.2074019052543


In [25]:
with open('../run1/model', 'wb') as f:
    pickle.dump(best_lda_model, f)

  and should_run_async(code)


In [26]:
with open('../run1/grid-model', 'wb') as f:
    pickle.dump(model, f)

  and should_run_async(code)


### END OF TRAINING

In [27]:
# Load model
with open('../run1/model', 'rb') as f:
    new_model = pickle.load(f)

  and should_run_async(code)


In [28]:
# Load vectorizer
with open('../run1/vectorizer', 'rb') as f:
    new_vec = pickle.load(f)

  and should_run_async(code)


In [29]:
# Load grid model
with open('../run1/grid-model', 'rb') as f:
    new_grid = pickle.load(f)

  and should_run_async(code)


In [30]:
# Load data vectorized
with open('../run1/data_vectorized', 'rb') as f:
    new_data_vectorized = pickle.load(f)

  and should_run_async(code)


In [32]:
# Load data lemmatized
with open('../run1/data_lemmatized', 'rb') as f:
    new_data_lemmatized = pickle.load(f)

  and should_run_async(code)


In [33]:
lda_output = new_model.transform(new_data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(new_model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(data))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return "color: {col}".format(col=color)
def make_bold(val):
    weight = 700 if val > .1 else 400
    return "font-weight: {weight}".format(weight=weight)

  and should_run_async(code)


In [34]:
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

  and should_run_async(code)


Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.34,0.0,0.17,0.02,0.0,0.37,0.01,0.0,0.06,0.03,5
Doc1,0.68,0.01,0.26,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0
Doc2,0.31,0.0,0.0,0.08,0.0,0.14,0.05,0.11,0.0,0.31,0
Doc3,0.35,0.0,0.1,0.0,0.23,0.0,0.0,0.0,0.2,0.1,0
Doc4,0.0,0.0,0.11,0.0,0.0,0.42,0.0,0.0,0.0,0.45,9
Doc5,0.07,0.0,0.0,0.0,0.84,0.0,0.0,0.0,0.0,0.06,4
Doc6,0.0,0.01,0.34,0.0,0.0,0.34,0.0,0.28,0.0,0.02,2
Doc7,0.23,0.0,0.0,0.0,0.11,0.0,0.1,0.0,0.27,0.27,8
Doc8,0.38,0.01,0.01,0.01,0.53,0.01,0.01,0.01,0.01,0.01,4
Doc9,0.03,0.04,0.26,0.0,0.0,0.0,0.03,0.41,0.01,0.21,7


In [35]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

  and should_run_async(code)


Unnamed: 0,Topic Num,Num Documents
0,9,6396
1,0,6359
2,8,5006
3,7,4917
4,4,2326
5,5,2009
6,3,1781
7,2,1671
8,6,1372
9,1,968


In [36]:
def show_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=new_vec, lda_model=new_model, n_words=15)

  and should_run_async(code)


In [37]:
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

  and should_run_async(code)


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,payment,account,amount,loan,fee,month,statement,balance,interest,time,pay,bank,company,charge,money
Topic 1,bank,house,home,buyer,year,property,time,america,name,money,equity,branch,loan,boa,line
Topic 2,property,attorney,sale,foreclosure,court,bankruptcy,law,loan,debt,note,state,mitigation,loss,case,trust
Topic 3,credit,forbearance,payment,report,month,score,loan,time,day,plan,care,company,letter,option,program
Topic 4,escrow,taxis,tax,property,account,amount,year,pmi,pay,bill,value,analysis,shortage,county,month
Topic 5,loan,document,complaint,letter,information,request,statement,response,consumer,copy,date,servicer,borrower,cfpb,number
Topic 6,insurance,policy,company,payoff,chase,homeowner,fund,home,escrow,letter,flood,sls,coverage,amount,pay
Topic 7,loan,modification,home,well,year,time,income,month,payment,help,house,foreclosure,work,letter,assistance
Topic 8,loan,rate,closing,refinance,process,document,lender,application,day,time,interest,appraisal,officer,fee,home
Topic 9,call,time,day,phone,company,check,customer,number,service,information,letter,email,account,week,loan


In [40]:
df_topic_keywords = pd.DataFrame(new_model.components_)
# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

  and should_run_async(code)


Unnamed: 0,abatement,abide,ability,absence,abundance,abuse,academy,accelerate,acceleration,accelerator,...,xxxxt,xxxxx,yard,year,yesterday,yield,york,yrs,zip,zone
Topic0,0.100017,0.251627,117.456609,0.100021,0.100133,0.100024,0.100013,52.344655,11.659143,25.191526,...,0.100053,0.10025,0.100011,2928.607259,21.384319,0.100022,0.10002,0.10005,0.100035,0.100013
Topic1,0.176631,0.101433,53.457297,0.100021,0.100092,26.862555,0.100073,0.100024,0.100012,0.100042,...,0.100039,0.100067,0.100067,1591.71431,0.100043,0.100091,0.100038,9.446351,0.100058,0.100053
Topic2,0.100003,0.198894,50.248891,28.071598,15.663374,153.183882,0.100038,12.808329,42.17878,0.10001,...,0.100017,0.100237,0.100015,378.584428,0.10009,0.100095,276.44442,0.100017,0.100058,0.100029
Topic3,0.100002,0.100016,284.524328,0.100023,0.100071,0.100014,0.100045,0.100025,0.100024,0.100066,...,0.100007,0.100151,0.100023,376.161661,40.264001,0.100037,0.100018,0.100021,18.635756,0.10002
Topic4,27.19254,0.100011,0.100027,0.100011,0.100015,0.100027,0.100048,0.10001,0.100067,0.1,...,2.985261,8.064018,0.100044,3178.574745,0.39768,6.039315,0.100035,0.10003,0.100016,177.305037


In [None]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel

In [44]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def predict_topic(text, nlp=nlp, model=new_model):
    global sent_to_words
    global lemmatization
# Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))
# Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Step 3: Vectorize transform
    mytext_4 = new_vec.transform(mytext_3)
# Step 4: LDA Transform
    topic_probability_scores = model.transform(mytext_4)
    #topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), 1:14].values.tolist()
    #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
    return topic_probability_scores
# Predict the topic
mytext = 'My complaint is with Carrington Mortgage Services. The company is charging property inspection payments of XXXX along with mortgage of XXXX the dates are from XX/XX/XXXX until present, also attorney fees. This makes my payments double. Carrington is also Charges a fee of XXXX unknown.'
prob_scores = predict_topic(text = mytext)

  and should_run_async(code)


In [45]:
def similar_documents(text, doc_topic_probs, documents = data, nlp=nlp, top_n=5, verbose=False):
    x  = predict_topic(text)
    dists = euclidean_distances(x.reshape(1, -1), doc_topic_probs)[0]
    doc_ids = np.argsort(dists)[:top_n]
    if verbose:        
        # print("Topic KeyWords: ", topic)
        print("Topic Prob Scores of text: ", np.round(x, 1))
        print("Most Similar Doc's Probs:  ", np.round(doc_topic_probs[doc_ids], 1))
    return doc_ids, np.take(documents, doc_ids)

  and should_run_async(code)


In [46]:
mytext = ['I did not feel that the savings was enough to justify the fees. The several person worked numbers and got to a number that seemed reasonable. I explained that I may not be staying in the home for more than 8 months. After the refi was done I received a check that I did not ask for. I called and they stated this was mine to keep. After I started paying my mortgage I am now told that my payment is increasing over {$100.00} due to an escrow shortage. I feel that I was intentionally mislead about the payment to book the loan and that the insurance was left out to increase the loan after the loan closed. The escrow should have increased {$19.00} per month not over {$100.00} due to a change in insurance premium from {$820.00} to {$1000.00}. I was not willing to close on the loan and pay the fees unless I had substantial savings and I feel that they were deceptive just to get the extra fees. I would have never refinanced the loan for such a small savings monthly. ']
doc_ids, docs = similar_documents(text=mytext, doc_topic_probs=lda_output, documents = data, top_n=5, verbose=True)
print('\n', docs[0][:500])

  and should_run_async(code)


[array([0.11153304, 0.0016398 , 0.00163972, 0.00163963, 0.15408753,
       0.00163973, 0.05836742, 0.00163987, 0.42236204, 0.24545122])]
Topic Prob Scores of text:  [[0.1 0.  0.  0.  0.2 0.  0.1 0.  0.4 0.2]]
Most Similar Doc's Probs:   [[0.2 0.  0.  0.  0.2 0.  0.  0.  0.4 0.3]
 [0.1 0.  0.  0.  0.1 0.  0.1 0.  0.4 0.3]
 [0.2 0.  0.  0.  0.1 0.  0.1 0.  0.3 0.3]
 [0.1 0.  0.  0.  0.2 0.  0.  0.  0.5 0.2]
 [0.1 0.  0.  0.  0.2 0.1 0.1 0.  0.3 0.2]]

 please investigate freedom mortgage. i have a mortgage that originated at xxxx and this company refuses to remove the mip as i am now elligable at over 20 % equity and i meet all other requirements. the original loan number is xxxx xxxx / file number xxxx xxxx / mortgage ins case number xxxx xxxx xxxx. freedom mortgage is the refinance company and they have refused any assistance to make sure this mip is removed in a timely manner. freedom mortgage refinanced the amount of xxxx and they refuse t
