In [97]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import pandas as pd
import numpy as np
import time
# Show full cell contents when displaying frames. None means "no limit";
# the older -1 sentinel is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [85]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    One line is written per row of ``model.components_`` in the form
    ``Topic #<k>: term1,term2,...`` with terms ordered from the largest
    weight downwards. A single blank line is printed after the last topic.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row is ranked with ``argsort``, so rows are expected to be
        NumPy arrays (or anything else providing ``argsort``).
    feature_names : sequence
        Maps a column position of ``components_`` to the term it represents.
    n_top_words : int
        How many terms to list per topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[col] for col in ranked]
        print("Topic #%d: " % topic_no + ",".join(top_terms))
    print()

In [91]:
# Load the dataset, using the CSV column headed 'Unnamed: 0' as the row index.
# NOTE(review): the relative path assumes the notebook runs from its own directory — confirm.
df = pd.read_csv('../data/with_summarized_tokenized.csv', index_col='Unnamed: 0')

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams (ngram_range=(1, 2)), with the vocabulary
# capped at 10,000 entries (max_features).
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

In [20]:
# Learn the vocabulary from df['summ'] and build the TF-IDF matrix in one pass.
# Rows are samples and columns are features (X.shape[0] / X.shape[1] are
# reported as n_samples / n_features further down).
X = vectorizer.fit_transform(df['summ'])

### Latent Dirichlet Allocation

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=8, n_jobs=-1, random_state=42)
# Fit on X itself (samples x features). Fitting on X.T would treat every term
# as a document: components_ would then have one column per document, so
# perplexity(X), transform(X) and the top-word lookup against the vectorizer's
# vocabulary would all be mismatched.
lda.fit(X)
print("Model perplexity: {}".format(lda.perplexity(X)))

In [122]:
# Number of highest-weighted terms to list per topic
# (presumably meant as the n_top_words argument of print_top_words — confirm).
n_top_words = 20

### Non-negative matrix factorization

In [161]:
from sklearn.decomposition import NMF
# Fit the NMF model
# NOTE(review): the saved outputs below show 10 topic columns, so they predate
# n_components=12; re-run the notebook to refresh them.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` — confirm against the installed version.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (X.shape[0], X.shape[1]))
t0 = time.time()
nmf = NMF(n_components=12, random_state=1,
          alpha=.1, l1_ratio=.5).fit(X)
# Report the elapsed wall-clock time; t0 was captured but never used before.
print("done in %0.3fs." % (time.time() - t0))

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=433958 and n_features=10000...


In [157]:
# Per-sample topic weights from the fitted NMF model, wrapped in a DataFrame
# whose columns are the integer topic ids.
nmf_topics = pd.DataFrame(nmf.transform(X))

In [158]:

# Work only on the topic-weight columns so this cell is idempotent: on a re-run
# the derived 'max_prob' / 'most_likely' columns already exist and must not feed
# back into the max/argmax (an integer topic id would otherwise win the max).
nmf_weights = nmf_topics.drop(columns=['max_prob', 'most_likely'], errors='ignore').values
nmf_topics['max_prob'] = np.max(nmf_weights, axis=1)        # weight of the dominant topic
nmf_topics['most_likely'] = np.argmax(nmf_weights, axis=1)  # id of the dominant topic

In [153]:
# Sanity check: (rows, columns) of the NMF topic-weight frame.
nmf_topics.shape

(433958, 10)

In [159]:
# Attach the NMF topic weights to each document's text and product label by
# row position. The two reset_index() calls expose the original index and the
# positional counter as columns; the positional one ('level_0') is the join
# key, and all helper columns are dropped again after the merge.
predicted_nmf_topics = (
    df[['summ', 'Product']]
    .reset_index()
    .reset_index()
    .merge(nmf_topics.reset_index(), left_on='level_0', right_on='index')
    .drop(columns=['level_0', 'index_x', 'index_y'])
)

In [160]:
# Display the merged frame (the notebook repr truncates it to the first and last rows).
predicted_nmf_topics

Unnamed: 0,summ,Product,0,1,2,3,4,5,6,7,8,9,max_prob,most_likely
0,like inaccurate information remove apply approve reason believe social long road reach credit bureau debt credit file extremely frustrate,Debt collection,0.005347,0.000000,0.003373,0.000000,0.000298,0.001509,0.000550,0.001014,0.000000,0.000885,0.005347,0
1,produce document authorize release information original creditor power attorney sign give information rd party knowledge consent perpetrate identity theft hand knowledge transpire allege account include account allegedly belong original creditor party prior acquisition write consent collect information share information credit report agency write sign law creditor share information party account holder authorization company credit collection serv contract report credit report fraudulently,Debt collection,0.007123,0.000000,0.004276,0.002571,0.000000,0.000000,0.000003,0.000751,0.000000,0.008355,0.008355,9
2,submit write remove pmi late payment meet original loan value house value go send letter state pay appraisal sure house not decline value lakeview loan send letter state meet ltv need broke price opinion ensure value house decrease send tell deny property value house pay able pmi remove say loan new assessment value house,Mortgage,0.000440,0.001936,0.001111,0.000000,0.003088,0.000000,0.007055,0.000151,0.000618,0.000000,0.007055,6
3,frustrate multiple time day feel like s violate right company continue send robot call text message say synchrony bank debt ashley furniture provide text message call previous phone need couple mo progress x day credit report have,Debt collection,0.002446,0.000000,0.001960,0.000000,0.007307,0.000000,0.000000,0.000000,0.000000,0.000000,0.007307,4
4,produce document authorize release information original creditor power attorney sign give information rd party knowledge consent perpetrate identity theft hand knowledge transpire allege account include account allegedly belong original creditor party prior acquisition write consent collect information share information credit report agency write sign law creditor share information party account holder authorization company choice recovery contract report credit report fraudulently,Debt collection,0.006625,0.000000,0.003695,0.002639,0.000161,0.000000,0.000036,0.000607,0.000000,0.008576,0.008576,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433953,say want apply credit card say not say file definitely close account day late letter new debit card say security breach information maybe compromise new card ask want overdraft protection say pay bill different way charge dollar call ask happen say pitcular bill send mail not control get call ask look credit report siad not look bac k week close account guy hear office come say come bring person charge woman,Bank account or service,0.000757,0.000785,0.000000,0.002916,0.009702,0.010234,0.000000,0.000000,0.000000,0.000000,0.010234,5
433954,contact chase business account phone ov er hour transfer person person open claim email package claim need sign send account credit say deny claim information chase correct identify time call refuse direct phone numb day late receive email chase email state claim deny account credit immediately get phone speak lady explain happen answer correct ask speak s ome unite state t hey phone hold minute tell available unite state ask speak supervisor get deny,Bank account or service,0.000830,0.000000,0.000000,0.003564,0.011508,0.000768,0.000000,0.000000,0.000000,0.000000,0.011508,4
433955,feel issue resolve n hsbc ha s continue bill interest fine account extort money year time time need federal protection hsbc have request investigation fraud deception obtain profit self interest prayin g thier o wn client intentionally decieve defraud mo ney order protect self hsbc s totally dishonest practice close be y hsbc account pay close hsbc accou nt pay balance confirm hsbc w ith hsbc executive phone confirm pay owe hsbc thing account close balance owe confirmation code provide verify fact time time money hsbc woul have account balance negative charge expensive fine conformation numb give assure account close money own hsbc,Bank account or service,0.000000,0.000740,0.000739,0.006159,0.002368,0.000931,0.001861,0.000000,0.000000,0.000000,0.006159,3
433956,go local well fargo branch office tell old way track go paperwork find automatically renewable personal cd open bank nj n purchase wachovia wachovia purchase have well fargo bank note original time deposit automatically renewable personal cd account numb hand instruct state nj department treasury unclaimed property nj record money,Bank account or service,0.000000,0.000000,0.000000,0.004035,0.003160,0.000165,0.000000,0.000000,0.022635,0.000103,0.022635,8


In [86]:
# scikit-learn renamed get_feature_names() to get_feature_names_out() and later
# removed the old spelling; prefer the new accessor when the installed version
# provides it and fall back to the old one otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_feature_names = vectorizer.get_feature_names_out()
else:
    tf_feature_names = vectorizer.get_feature_names()
# Use the notebook-level n_top_words constant instead of repeating the literal 20.
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0: call,phone,numb,tell,ask,debt,say,time,not,company,contact,work,speak,receive,day,stop,message,leave,phone numb,information
Topic #1: loan,payment,pay,student,interest,student loan,rate,navient,year,month,time,interest rate,income,tell,not,school,plan,repayment,help,work
Topic #2: debt,collection,report,credit,company,owe,agency,account,credit report,letter,bill,send,collection agency,collect,receive,pay,information,validation,dispute,state
Topic #3: account,card,bank,check,charge,money,credit card,close,tell,credit,fee,receive,chase,day,fund,call,customer,transaction,open,say
Topic #4: payment,pay,late,month,mortgage,fee,loan,balance,account,charge,interest,car,time,insurance,tell,receive,statement,day,escrow,late fee
Topic #5: report,account,credit,information,credit report,dispute,remove,theft,file,verify,identity,bureau,item,request,identity theft,delete,experian,equifax,send,letter
Topic #6: mortgage,loan,document,modification,home,sale,property,file,bankruptcy,foreclosu

In [266]:
# Number of rows per Product category, most frequent first.
df['Product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    148512
Debt collection                                                                 95944 
Mortgage                                                                        57037 
Credit card or prepaid card                                                     46671 
Bank account or service                                                         30736 
Student loan                                                                    23414 
Consumer Loan                                                                   16310 
Money transfer, virtual currency, or money service                              8014  
Payday loan, title loan, or personal loan                                       7028  
Other financial service                                                         292   
Name: Product, dtype: int64

In [213]:
# Rows whose Product is 'Credit card or prepaid card', minus those with a missing 'summ'.
subset = (
    df.loc[df['Product'] == 'Credit card or prepaid card']
      .dropna(subset=['summ'])
)

## Let's do some TF-IDF and tokenization, and try LDA

In [228]:
# Number of rows per Issue within the subset, most frequent first.
subset['Issue'].value_counts()

Problem with a purchase shown on your statement                                     5767
Other features, terms, or problems                                                  3792
Fees or interest                                                                    3601
Billing disputes                                                                    3102
Problem when making payments                                                        2741
Getting a credit card                                                               2157
Closing your account                                                                2104
Advertising and marketing, including promotional offers                             2033
Other                                                                               1940
Identity theft / Fraud / Embezzlement                                               1723
Closing/Cancelling account                                                          1440
Trouble using your ca

In [87]:
# Per-sample topic distribution from the fitted LDA model, wrapped in a
# DataFrame whose columns are the integer topic ids.
topics = pd.DataFrame(lda.transform(X))

In [88]:
# Work only on the topic-weight columns so this cell is idempotent: on a re-run
# the derived 'max_prob' / 'most_likely' columns already exist and must not feed
# back into the max/argmax (an integer topic id would otherwise win the max).
lda_weights = topics.drop(columns=['max_prob', 'most_likely'], errors='ignore').values
topics['max_prob'] = np.max(lda_weights, axis=1)        # weight of the dominant topic
topics['most_likely'] = np.argmax(lda_weights, axis=1)  # id of the dominant topic

In [92]:
# Attach the LDA topic weights to each document's text and product label by
# row position. The two reset_index() calls expose the original index and the
# positional counter as columns; the positional one ('level_0') is the join
# key, and all helper columns are dropped again after the merge.
predicted_topics = (
    df[['summ', 'Product']]
    .reset_index()
    .reset_index()
    .merge(topics.reset_index(), left_on='level_0', right_on='index')
    .drop(columns=['level_0', 'index_x', 'index_y'])
)

In [93]:
# How many rows have each value of 'most_likely' (the dominant-topic id), most frequent first.
predicted_topics['most_likely'].value_counts()

3    81025
5    64237
7    62437
4    57997
2    57653
0    39263
1    36946
6    34400
Name: most_likely, dtype: int64

In [118]:
# Get topic weights and dominant topics ------------
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.colors as mcolors

# Get topic weights
# Topic-weight columns: everything between the two leading columns
# ('summ', 'Product') and the two trailing derived columns
# ('max_prob', 'most_likely').
idx = list(predicted_topics.columns.values)[2:-2]

# Work on a 2.5% random sample of the documents. The fixed seed makes the
# sample, and therefore the embedding and plot, reproducible across runs.
arr = predicted_topics[idx].sample(frac=0.025, random_state=0).values
# Keep the well separated points (optional): dominant topic weight above 0.35
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10798 samples in 0.005s...
[t-SNE] Computed neighbors for 10798 samples in 0.551s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10798
[t-SNE] Computed conditional probabilities for sample 2000 / 10798
[t-SNE] Computed conditional probabilities for sample 3000 / 10798
[t-SNE] Computed conditional probabilities for sample 4000 / 10798
[t-SNE] Computed conditional probabilities for sample 5000 / 10798
[t-SNE] Computed conditional probabilities for sample 6000 / 10798
[t-SNE] Computed conditional probabilities for sample 7000 / 10798
[t-SNE] Computed conditional probabilities for sample 8000 / 10798
[t-SNE] Computed conditional probabilities for sample 9000 / 10798
[t-SNE] Computed conditional probabilities for sample 10000 / 10798
[t-SNE] Computed conditional probabilities for sample 10798 / 10798
[t-SNE] Mean sigma: 0.001266
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.188637
[t-SNE] KL 

In [119]:
# Palette lookup: one Tableau colour value per topic id.
mycolors = np.array(list(mcolors.TABLEAU_COLORS.values()))
color = [mycolors[topic_id] for topic_id in topic_num]

# 2-D embedding as columns x / y, plus each point's colour and dominant topic.
tsne_df = (
    pd.DataFrame(tsne_lda)
    .rename(columns={0: 'x', 1: 'y'})
    .assign(color=color, topic=topic_num)
)

In [120]:
output_notebook()
# Take the topic count from the fitted model instead of hard-coding it: the LDA
# model is built with n_components=8, so the previous literal 10 mislabelled
# the plot title.
n_topics = lda.n_components
# NOTE(review): plot_width / plot_height and legend= are legacy spellings in
# recent Bokeh releases — confirm against the installed version.
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
              plot_width=900, plot_height=700)
# Column names are resolved against tsne_df, which is passed as the data source.
plot.scatter("x", "y", source=tsne_df, legend="topic", color='color')
plot.legend.location = "top_right"
plot.legend.click_policy="hide"

show(plot)