In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import time
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt

import os, sys
# Resolve the parent of the current working directory to an absolute path and
# add it to the import search path (once), so modules located one level up
# from the notebook can be imported.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [7]:
import nltk
from nltk.stem.snowball import SnowballStemmer

import spacy
# Shared spaCy pipeline; stemming() uses it for tokenization and stop-word flags.
nlp = spacy.load("en_core_web_sm")

In [8]:
# English Snowball stemmer, applied to each non-stop-word token in stemming().
stemmer = SnowballStemmer(language='english')

In [5]:
# Input table for the topic model; its 'cleaned' column holds the text that is stemmed below.
df = pd.read_csv('../data/for_issue_lda.csv')

In [30]:
# Force the text column to str so every row can be passed to the tokenizer.
# Missing values are filled with '' first: astype(str) on its own would turn
# NaN into the literal string 'nan', which would then be stemmed and counted
# as if it were a real word.
df['cleaned'] = df['cleaned'].fillna('').astype(str)

In [31]:
def stemming(doc):
    """Return ``doc`` with stop words removed and every remaining token stemmed.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    str
        The Snowball stems of the non-stop-word tokens, separated by single
        spaces (empty string if nothing is left).
    """
    # Only the token text and the stop-word flag are read, and both are
    # available straight from the tokenizer. make_doc() tokenizes without
    # running the remaining pipeline components, which were pure overhead here.
    # NOTE(review): assumes no pipeline component re-tokenizes the text.
    tokens = nlp.make_doc(doc)
    stems = (stemmer.stem(tok.text) for tok in tokens if not tok.is_stop)
    # join -> split -> join collapses whitespace-only tokens so the result
    # never contains runs of spaces or leading/trailing blanks.
    return ' '.join(' '.join(stems).split())

In [32]:
# Stem every document in the 'cleaned' column, one stemming() call per row,
# and report the wall-clock time. The recorded run below took ~4629 s.
t0 = time.time()
stemmed = df['cleaned'].apply(stemming)
print('Total time = {}s'.format(time.time()-t0))

Total time = 4628.729872703552s


In [34]:
# Attach the stemmed text to the frame as a new 'stemmed' column.
df['stemmed'] = stemmed

In [35]:
# Persist the frame, including the 'stemmed' column, without the row index.
df.to_csv('../data/for_issue_lda_with_stemmed.csv', index=False)

In [36]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer
# Count unigrams and bigrams, keeping at most 5000 vocabulary terms.
vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))

# Learn the vocabulary from the stemmed text and build the document-term
# count matrix that the LDA model is fitted on.
X = vectorizer.fit_transform(df['stemmed'])


In [37]:
from sklearn.decomposition import LatentDirichletAllocation

def run_lda_expt(X, n_topics):
    """Fit an LDA model on ``X``, print its perplexity, and pickle it to disk.

    Parameters
    ----------
    X : array-like or sparse matrix
        Document-term matrix passed to ``LatentDirichletAllocation.fit``.
    n_topics : int
        Number of topics (``n_components``); also embedded in the output
        file name.

    Returns
    -------
    LatentDirichletAllocation
        The fitted model. It is also written to
        ``../output/lda_stemmed_<n_topics>_topics.pickle``.
    """
    out_path = '../output/lda_stemmed_{}_topics.pickle'.format(n_topics)
    # Create the destination before the (long) fit: otherwise a missing
    # directory only surfaces when open() fails, after the model has been
    # trained, and the fitted model is lost.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    lda_model = LatentDirichletAllocation(n_components=n_topics, doc_topic_prior=0.1,
                                          topic_word_prior=0.1, random_state=42, n_jobs=-1)
    lda_model.fit(X)
    # Perplexity is computed on the same matrix the model was fitted on.
    print("Model perplexity: {}".format(lda_model.perplexity(X)))
    with open(out_path, 'wb') as f:
        pickle.dump(lda_model, f, pickle.HIGHEST_PROTOCOL)
    # Hand the model back so callers can use it without re-reading the pickle.
    return lda_model

In [38]:
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` holds one topic's per-term weights.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        How many terms to list per topic, heaviest first.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the leading n entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[idx] for idx in ranked]
        print("Topic #%d: " % topic_no + ",".join(top_terms))
    # Blank line after the last topic.
    print()

In [39]:
# Fit and pickle a 45-topic model, reporting the wall-clock time of the call.
# The recorded run below took ~1511 s.
t0 = time.time()
run_lda_expt(X, 45)
print('Time taken: {}s'.format(time.time()-t0))

Model perplexity: 797.5221657453138
Time taken: 1511.2747757434845s


In [41]:
# Reload the 45-topic model written by run_lda_expt().
# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that this notebook (or another trusted source) produced.
with open('../output/lda_stemmed_45_topics.pickle', 'rb') as f:
    lda_model = pickle.load(f)

In [42]:
# Show the 10 highest-weighted terms of every topic.
n_top_words = 10
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    count_feature_names = vectorizer.get_feature_names_out()
else:
    count_feature_names = vectorizer.get_feature_names()
print_top_words(lda_model, count_feature_names, n_top_words)

Topic #0: report,credit,credit report,remov,transunion,remov credit,account,bureaus,disput,item
Topic #1: court,attorney,garnish,file,law,portfolio,recoveri,firm,case,judgement
Topic #2: proof,contract,sign,document,provid,copi,signatur,ask,acct,sent
Topic #3: money,account,transfer,fund,paypal,transact,day,told,access,bank
Topic #4: loan,student,student loan,school,educ,consolid,servic,privat,feder,defer
Topic #5: fee,late,day,time,late fee,month,payment,charg,pay,past
Topic #6: offer,card,purchas,point,cancel,servic,month,fee,term,receiv
Topic #7: loan,mortgag,modif,home,loan modif,foreclosur,servic,year,time,trial
Topic #8: loan,pay,nt,help,know,want,tri,got,need,money
Topic #9: inform,credit,report,experian,verifi,record,credit report,file,freez,public
Topic #10: paid,account,payment,credit,report,settlement,agre,settl,account paid,balanc
Topic #11: card,credit,credit card,appli,limit,line,card account,applic,credit limit,use
Topic #12: state,law,attorney,note,legal,violat,feder,fi