# Topic Tables
This notebook cross-tabulates the topic predicted for each complaint against the complaint's "Product" classification. It also shows the most significant words for each topic in the model.

In [1]:
import numpy as np
import pandas as pd
from gensim import models



In [2]:
# Load the complaints table; the cells below read its 'Product' and
# 'topic11' columns.
comp = pd.read_csv(filepath_or_buffer='../output/complaint-topics.csv')

#### Many product labels are very similar, so we group them where we believe it is appropriate.

In [3]:
# Consolidated product groups, each listing the raw 'Product' labels that
# are folded into it.
_labels_by_group = {
    'Bank Account': ('Bank account or service',
                     'Checking or savings account'),
    'Consumer Loan': ('Consumer Loan',),
    'Credit card': ('Credit card',
                    'Credit card or prepaid card',
                    'Prepaid card'),
    'Credit reporting': ('Credit reporting',
                         'Credit reporting, credit repair services, or other personal consumer reports'),
    'Debt collection': ('Debt collection',),
    'Money transfers': ('Money transfer, virtual currency, or money service',
                        'Money transfers',
                        'Virtual currency'),
    'Mortgage': ('Mortgage',),
    'Other': ('Other financial service',),
    'Payday loan': ('Payday loan',
                    'Payday loan, title loan, or personal loan'),
    'Student loan': ('Student loan',),
    'Vehicle loan or lease': ('Vehicle loan or lease',),
}

# Invert into the raw-label -> group lookup used downstream. Sorting the
# pairs keeps the keys in alphabetical order by raw label.
productLookup = dict(sorted(
    (raw_label, group_name)
    for group_name, raw_labels in _labels_by_group.items()
    for raw_label in raw_labels
))

In [4]:
# Translate every raw 'Product' label into its consolidated group with a
# vectorized Series.map instead of a per-row Python loop.
product_group = comp['Product'].map(productLookup)

# Labels missing from productLookup come back from map() as NaN. Keep failing
# loudly with a KeyError, but report every offending label at once and leave
# `comp` untouched when the lookup is incomplete.
unmapped = comp.loc[product_group.isna(), 'Product'].unique()
if len(unmapped) > 0:
    raise KeyError('No product group defined for label(s): {}'.format(list(unmapped)))

comp['product_group'] = product_group

#### Tables showing topic-to-product association: raw counts first, then row proportions.

In [5]:
# Complaint counts for every (predicted topic, product group) pair;
# margins=True appends the "All" row/column totals.
pd.crosstab(
    index=comp['topic11'],
    columns=comp['product_group'],
    rownames=["Predicted Topic"],
    colnames=["Product Group"],
    margins=True,
)

Product Group,Bank Account,Consumer Loan,Credit card,Credit reporting,Debt collection,Money transfers,Mortgage,Other,Payday loan,Student loan,Vehicle loan or lease,All
Predicted Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,8576,204,1235,230,393,728,517,65,124,64,5,12141
1,346,2031,3089,1251,1182,17,7273,10,440,2374,147,18160
2,2048,151,8766,717,419,125,118,15,21,32,1,12413
3,2934,1441,4347,2664,6052,703,5227,91,394,1635,87,25575
4,50,194,315,1531,15077,4,142,10,67,162,2,17554
5,558,351,494,1366,2609,74,2501,15,60,159,28,8215
6,248,743,1367,11052,2360,12,754,7,59,344,16,16962
7,304,348,1026,18084,4200,12,243,3,52,174,13,24459
8,150,632,482,1118,9923,54,307,14,406,464,17,13567
9,108,262,59,455,446,5,11834,21,76,4301,4,17571


In [6]:
# Same cross-tabulation, normalized over each row ('index') so the values
# are the share of a topic's complaints per product group, to 3 decimals.
pd.crosstab(
    index=comp['topic11'],
    columns=comp['product_group'],
    rownames=["Predicted Topic"],
    colnames=["Product Group"],
    normalize='index',
).round(3)

Product Group,Bank Account,Consumer Loan,Credit card,Credit reporting,Debt collection,Money transfers,Mortgage,Other,Payday loan,Student loan,Vehicle loan or lease
Predicted Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.706,0.017,0.102,0.019,0.032,0.06,0.043,0.005,0.01,0.005,0.0
1,0.019,0.112,0.17,0.069,0.065,0.001,0.4,0.001,0.024,0.131,0.008
2,0.165,0.012,0.706,0.058,0.034,0.01,0.01,0.001,0.002,0.003,0.0
3,0.115,0.056,0.17,0.104,0.237,0.027,0.204,0.004,0.015,0.064,0.003
4,0.003,0.011,0.018,0.087,0.859,0.0,0.008,0.001,0.004,0.009,0.0
5,0.068,0.043,0.06,0.166,0.318,0.009,0.304,0.002,0.007,0.019,0.003
6,0.015,0.044,0.081,0.652,0.139,0.001,0.044,0.0,0.003,0.02,0.001
7,0.012,0.014,0.042,0.739,0.172,0.0,0.01,0.0,0.002,0.007,0.001
8,0.011,0.047,0.036,0.082,0.731,0.004,0.023,0.001,0.03,0.034,0.001
9,0.006,0.015,0.003,0.026,0.025,0.0,0.673,0.001,0.004,0.245,0.0


#### Topic Words
Here are the most significant words associated with each topic. From these, you can begin to get a sense of the "theme" that each topic represents.

In [7]:
# Load the saved gensim LDA model; the cells below query it for each topic's words.
# NOTE(review): gensim model files are loaded via pickle, so only load files
# from a trusted source - confirm the provenance of this model file.
lda = models.LdaModel.load('../models/lda11')

In [8]:
def getWords(i, topn=10):
    """Return the most significant words of topic ``i`` in the loaded model.

    Parameters
    ----------
    i : int
        Topic id, passed straight to ``lda.show_topic``.
    topn : int, optional
        How many words to return. Defaults to 10, which is gensim's own
        default for ``show_topic``, so existing ``getWords(i)`` calls are
        unaffected.

    Returns
    -------
    numpy.ndarray
        The topic's words in the order ``show_topic`` reports them; the
        accompanying probabilities are dropped.
    """
    # Reads the notebook-level `lda` model, which must be loaded beforehand.
    word_probs = lda.show_topic(i, topn=topn)
    return np.array([word for word, _prob in word_probs])

In [9]:
# One column per topic, one row per word rank (1 = most significant word).
# The topic count comes from the loaded model and the row count from the
# word lists themselves, so the table follows the model instead of relying
# on hard-coded sizes.
words_by_topic = {'Topic %2d' % i: getWords(i) for i in range(lda.num_topics)}

topic_words = pd.DataFrame(words_by_topic)
# Rank rows from 1 rather than pandas' default 0-based index.
topic_words.index = np.arange(1, len(topic_words) + 1)

topic_words

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10
1,bank,payment,card,call,debt,state,credit,report,call,loan,pay
2,account,late,charg,told,collect,law,report,account,number,mortgag,nt
3,check,pay,creditcard,would,owe,consum,remov,credit,phone,home,get
4,money,amount,account,receiv,account,file,score,inform,name,modif,year
5,chase,month,use,said,compani,complaint,bureau,disput,compani,servic,loan
6,fund,account,one,nt,agenc,provid,show,request,person,document,would
7,america,due,receiv,ask,receiv,court,account,remov,contact,foreclosur,help
8,deposit,paid,purchas,back,letter,document,year,file,time,request,time
9,fee,interest,capit,get,bill,violat,bankruptci,verifi,address,time,car
10,close,balanc,fee,could,valid,case,neg,letter,stop,payment,home
