# Topic Tables
This notebook cross-tabulates the predicted topics against each complaint's "Product" classification. We also show the most significant words for each topic of the LDA model.

In [1]:
import numpy as np
import pandas as pd
from gensim import models



In [2]:
# Load the complaint/topic table; later cells read its 'Product' and
# 'topic11' columns.
comp = pd.read_csv('../output/complaint-topics.csv')

#### Many product labels are very similar - we group where we believe appropriate.

In [3]:
# Maps each raw 'Product' label to the consolidated product group used in
# the tables below. Several raw labels collapse onto the same group.
productLookup = {
    # Deposit accounts
    'Bank account or service': 'Bank Account',
    'Checking or savings account': 'Bank Account',
    # Consumer loans
    'Consumer Loan': 'Consumer Loan',
    # Credit cards
    'Credit card': 'Credit card',
    'Credit card or prepaid card': 'Credit card',
    # Credit reporting
    'Credit reporting': 'Credit reporting',
    'Credit reporting, credit repair services, or other personal consumer reports': 'Credit reporting',
    # Debt collection
    'Debt collection': 'Debt collection',
    # Money transfers
    'Money transfer, virtual currency, or money service': 'Money transfers',
    'Money transfers': 'Money transfers',
    # Mortgages
    'Mortgage': 'Mortgage',
    # Everything else
    'Other financial service': 'Other',
    # Payday / title / personal loans
    'Payday loan': 'Payday loan',
    'Payday loan, title loan, or personal loan': 'Payday loan',
    # Prepaid cards are grouped with credit cards
    'Prepaid card': 'Credit card',
    # Student loans
    'Student loan': 'Student loan',
    # Vehicle loans and leases
    'Vehicle loan or lease': 'Vehicle loan or lease',
    # Virtual currency is grouped with money transfers
    'Virtual currency': 'Money transfers',
}

In [4]:
# Translate every raw 'Product' label into its consolidated group with a
# vectorized lookup instead of a Python-level loop.
product_group = comp['Product'].map(productLookup)

# Labels missing from productLookup (including missing values) come back as
# NaN. Fail loudly, as the plain dict lookup did, but name every offending
# label so the lookup table can be extended in one pass.
unmapped = comp.loc[product_group.isna(), 'Product'].unique()
if len(unmapped):
    raise KeyError('No product group defined for: %r' % list(unmapped))

# Only attach the column once the mapping is known to be complete.
comp['product_group'] = product_group

#### Tables showing text-to-product association.

In [5]:
# Complaint counts: one row per predicted topic, one column per product
# group; margins=True adds the "All" totals.
pd.crosstab(
    index=comp['topic11'],
    columns=comp['product_group'],
    rownames=["Predicted Topic"],
    colnames=["Product Group"],
    margins=True,
)

Product Group,Bank Account,Consumer Loan,Credit card,Credit reporting,Debt collection,Money transfers,Mortgage,Other,Payday loan,Student loan,Vehicle loan or lease,All
Predicted Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,444,284,395,1033,2433,47,1643,11,55,161,25,6531
1,1516,3237,2091,2909,4518,265,4650,52,484,1865,229,21816
2,193,892,1437,12349,4066,4,950,7,48,356,34,20336
3,1113,218,2792,505,719,429,359,31,32,42,13,6253
4,215,344,63,410,372,17,19103,8,37,152,13,20734
5,230,1970,2291,863,1075,13,4252,40,640,8396,126,19896
6,407,413,1334,18862,4263,21,255,4,72,145,14,25790
7,51,219,285,1192,15483,6,130,11,159,201,4,17741
8,1676,1514,3515,1496,11797,443,3328,56,498,1116,65,25504
9,8923,251,1501,213,390,609,642,65,132,52,11,12789


In [9]:
# Same cross-tabulation expressed as row proportions (normalize='index'
# divides each topic's row by its total), rounded to 3 decimals for display.
pd.crosstab(
    index=comp['topic11'],
    columns=comp['product_group'],
    rownames=["Predicted Topic"],
    colnames=["Product Group"],
    normalize='index',
).round(3)

Product Group,Bank Account,Consumer Loan,Credit card,Credit reporting,Debt collection,Money transfers,Mortgage,Other,Payday loan,Student loan,Vehicle loan or lease
Predicted Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.068,0.043,0.06,0.158,0.373,0.007,0.252,0.002,0.008,0.025,0.004
1,0.069,0.148,0.096,0.133,0.207,0.012,0.213,0.002,0.022,0.085,0.01
2,0.009,0.044,0.071,0.607,0.2,0.0,0.047,0.0,0.002,0.018,0.002
3,0.178,0.035,0.447,0.081,0.115,0.069,0.057,0.005,0.005,0.007,0.002
4,0.01,0.017,0.003,0.02,0.018,0.001,0.921,0.0,0.002,0.007,0.001
5,0.012,0.099,0.115,0.043,0.054,0.001,0.214,0.002,0.032,0.422,0.006
6,0.016,0.016,0.052,0.731,0.165,0.001,0.01,0.0,0.003,0.006,0.001
7,0.003,0.012,0.016,0.067,0.873,0.0,0.007,0.001,0.009,0.011,0.0
8,0.066,0.059,0.138,0.059,0.463,0.017,0.13,0.002,0.02,0.044,0.003
9,0.698,0.02,0.117,0.017,0.03,0.048,0.05,0.005,0.01,0.004,0.001


#### Topic Words
Here are the most significant words associated with each topic. From these, you can begin to get a sense of the "theme" each topic represents.

In [10]:
# Load the saved gensim LDA model whose per-topic words are listed below.
# NOTE(review): the word table below covers topics 0-10, while the crosstabs
# above only show topic11 values 0-9, and the themes do not obviously line up
# by id (e.g. crosstab topic 4 is ~92% Mortgage, but the mortgage-related
# words appear under Topic 0). Confirm this is the model that produced
# `topic11`.
lda = models.LdaModel.load('../models/lda11')

In [11]:
def getWords(i, topn=10):
    """Return the most significant words of LDA topic ``i``.

    Parameters
    ----------
    i : int
        Topic id passed to ``lda.show_topic``.
    topn : int, optional
        Number of words to return. Defaults to 10, matching
        ``show_topic``'s own default, so ``getWords(i)`` behaves as before.

    Returns
    -------
    numpy.ndarray
        The topic's words, in the order ``show_topic`` returns them, with
        the probabilities dropped.
    """
    # show_topic yields (word, probability) pairs; keep only the words.
    return np.array([word for word, _ in lda.show_topic(i, topn=topn)])

In [12]:
# Build one column of top words per topic. The number of topics comes from
# the loaded model rather than a hard-coded 11, so the table follows whatever
# model `lda` holds.
topic_columns = {'Topic %2d' % i: getWords(i) for i in range(lda.num_topics)}
topic_words = pd.DataFrame(topic_columns)

# Label rows by word rank starting at 1 (1 = first word returned for the
# topic); the row count follows the word lists instead of a fixed 10.
topic_words.index = np.arange(1, len(topic_words) + 1)

topic_words

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10
1,mortgag,get,debt,loan,payment,account,call,card,report,law,home
2,loan,nt,collect,payment,late,bank,told,chase,credit,court,bank
3,modif,pay,letter,interest,balanc,check,phone,creditcard,account,file,loan
4,servic,year,compani,rate,account,money,would,charg,inform,consum,us
5,request,tri,account,navient,paid,charg,receiv,servic,remov,state,mortgag
6,letter,time,valid,pay,amount,fee,ask,use,disput,violat,close
7,receiv,money,receiv,studentloan,month,fund,number,receiv,bureau,complaint,sale
8,well,compani,owe,year,due,close,inform,credit,file,case,properti
9,document,go,agenc,would,pay,deposit,said,custom,inquiri,legal,hous
10,escrow,help,sent,incom,fee,transact,contact,purchas,request,attorney,would
