# Topic modeling (Latent Dirichlet Allocation)

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import unidecode
%matplotlib inline
import copy
from time import time

In [2]:
from gensim import corpora,models,similarities
from gensim.utils import lemmatize
from string import punctuation
from spacy.parts_of_speech import ADV, NOUN, ADJ, PUNCT, VERB
from spacy.en import English,STOPWORDS
from spacy.orth import *
import logging
from collections import defaultdict

In [3]:
from HTMLParser import HTMLParser

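##### The API key has been removed from this notebook; keep credentials out of version control (e.g. load them from an environment variable at run time).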

In [4]:
# Emit INFO-level log records in a compact "LEVEL : message" layout.
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')
# IPython sometimes resets the root logger's level; force it back to INFO.
logging.root.setLevel(logging.INFO)

This notebook deals exclusively with LDA topic modeling. The earlier steps, such as tokenization and tf-idf weighting, are skipped; their results are loaded directly from the other notebook (Ted_9 LSI).

In [5]:
# Read the captions table from a JSON file in the working directory.
# Only its 'id' column is used by the cells visible in this notebook.
captions = pd.read_json('captions_f3.json')

In [6]:
# Talk ids, one per row of `captions` (the Series index carries the row labels).
ids = captions['id']

In [7]:
# Two-way lookup between talk ids and row numbers of `captions`.
# zip(ids.index, ids) yields the same (row label, talk id) pairs as
# Series.iteritems(), but also works on pandas versions where iteritems()
# has been removed.
# Row labels are passed through int() - presumably read_json may deliver
# them as strings; TODO confirm.
iddict = {talk_id: int(row_label) for row_label, talk_id in zip(ids.index, ids)}   # talk id -> row number
rowdict = {int(row_label): talk_id for row_label, talk_id in zip(ids.index, ids)}  # row number -> talk id

## 1. Load in dictionary and corpus

In [8]:
# Load a previously saved gensim Dictionary; it is passed as `id2word` to every LdaModel below.
dictionary = corpora.Dictionary.load('./data/tedtrain.dict')

In [9]:
# Open the training corpus stored on disk in Matrix Market format.
corpustrain = corpora.MmCorpus('./data/corpustrain.mm')

In [10]:
# Number of documents in the training corpus.
len(corpustrain)

1531

## 2. Latent Dirichlet Allocation 

#### Hyperparameter tuning 

Hyperparameters of the LDA models were determined based on both human evaluation and the log-perplexity of a hold-out set. First, a hyperparameter grid of {number of topics, alpha, eta} was set up, and similar talks were retrieved for two talks (ids 1666 and 129) for which I had already obtained and validated results using LSI. The similar talks given by LDA were compared with those given by LSI, and the LDA models that produced the best-matching similar talks were selected. Second, a hold-out set (the test set used in the rating prediction task) was used to measure the perplexity, and the best model was chosen.

In [11]:
def give_similar_talks(talkid, num_of_talks,corpus,index):
    """
    Find the talks most similar to `talkid`.

    talkid       -- id of the query talk; translated to a row number via the
                    module-level `iddict`
    num_of_talks -- how many (talk id, score) pairs to return
    corpus       -- indexable collection of document vectors, one per row
    index        -- similarity index; index[vector] gives one score per row

    Returns a list of (talk id, similarity score) tuples, highest score first.
    Row numbers are mapped back to talk ids through the module-level `rowdict`.
    """
    query_vector = corpus[iddict[talkid]]
    scores = index[query_vector]
    # Pair every score with its row number, then rank by score, best first.
    ranked = sorted(enumerate(scores), key = lambda pair: pair[1], reverse = True)
    ranked_by_talk = [(rowdict[row], score) for row, score in ranked]
    return ranked_by_talk[:num_of_talks]

In [12]:
def print_topic_doc(projection,ids,num_top_topics,talkid = True):
    """
    Print the strongest topics of the chosen documents.

    projection     -- indexable collection; projection[row] is a list of
                      (topic id, weight) pairs for that document
    ids            -- talk ids (talkid=True) or row numbers (talkid=False)
    num_top_topics -- how many topics to print per document
    talkid         -- when True, `ids` are translated to row numbers through
                      the module-level `iddict` and the talk id is printed too

    Topics are ranked by the absolute value of their weight, largest first.
    The rows of `projection` are left untouched: ranking works on a sorted
    copy rather than sorting the caller's lists in place.
    """
    ids = list(ids)
    rowids = [iddict[x] for x in ids] if talkid else ids
    for given_id, rowid in zip(ids, rowids):
        # sorted() returns a new list, so projection[rowid] keeps its order.
        ranked = sorted(projection[rowid], key = lambda pair: abs(pair[1]), reverse = True)
        # Single-argument print(...) behaves the same as a print statement on
        # Python 2 and as the print function on Python 3.
        print('The top {} topics of the {}th document'.format(num_top_topics,rowid))
        if talkid:
            print('The talk id is {}'.format(given_id))
        print(ranked[:num_top_topics])
        print('')

### Grid search hyperparameters

Test the topic distribution with talks 1666 and 129: the former is an education talk, the latter a high-tech talk.

In [16]:
# Candidate values for LdaModel's `alpha` argument: numeric values plus string presets.
alpha_list = [1,3,10,'symmetric','auto','asymmetric']

In [17]:
# Candidate values for LdaModel's `eta` argument: numeric values plus the 'auto' preset.
eta_list = [0.3,1,3,10,'auto']

In [18]:
# Candidate numbers of topics.
num_topic_list = [18,20,25,30,40]

In [19]:
# Order matters: the grid-search loop reads each combination as
# (num_topics, alpha, eta) via comb[0], comb[1], comb[2].
nested_list = [num_topic_list,alpha_list,eta_list]

In [20]:
from itertools import product

In [21]:
# Cartesian product of the three lists: one (num_topics, alpha, eta) tuple per combination.
grid_list = list(product(*nested_list))

grid_list has 150 hyperparameter combinations in total

In [22]:
# Number of hyperparameter combinations in the grid.
len(grid_list)

150

Set up two dictionaries to store (hyperparameter combination, [similar talks]) pairs for the two target talks

In [23]:
# One result store per target talk, keyed by hyperparameter combination.
# Two separate defaultdicts are created so the talks do not share state.
sim_talks_1666, sim_talks_129 = defaultdict(list), defaultdict(list)

In [25]:
# Fit one LDA model per hyperparameter combination and record the five
# nearest talks for each of the two reference talks (1666 and 129).
for ind, comb in enumerate(grid_list):
    num_topics, alpha, eta = comb
    time0 = time()
    lda_trial = models.LdaModel(corpustrain, num_topics = num_topics, id2word = dictionary,
                                alpha = alpha, eta = eta,
                                chunksize = 200, iterations = 150, passes = 20)

    # Project the training corpus into topic space and index it for similarity queries.
    corpustrain_lda_tmp = lda_trial[corpustrain]
    index_tmp = similarities.MatrixSimilarity(corpustrain_lda_tmp)
    for target_talk, store in ((1666, sim_talks_1666), (129, sim_talks_129)):
        store[comb] = give_similar_talks(target_talk, 5, corpustrain_lda_tmp, index_tmp)
    print('finishing the {}th combination in {} sec'.format(ind,time()-time0))

finishing the 0th combination in 91.8138279915 sec
finishing the 1th combination in 101.844889879 sec
finishing the 2th combination in 101.399289131 sec
finishing the 3th combination in 110.304780006 sec
finishing the 4th combination in 98.7791631222 sec
finishing the 5th combination in 78.1259589195 sec
finishing the 6th combination in 84.3467669487 sec
finishing the 7th combination in 92.0698041916 sec
finishing the 8th combination in 80.6833298206 sec
finishing the 9th combination in 86.8609139919 sec
finishing the 10th combination in 70.6086568832 sec
finishing the 11th combination in 74.3404428959 sec
finishing the 12th combination in 75.9294681549 sec
finishing the 13th combination in 72.9085729122 sec
finishing the 14th combination in 86.2294661999 sec
finishing the 15th combination in 114.409653902 sec
finishing the 16th combination in 115.273396969 sec
finishing the 17th combination in 123.436055183 sec
finishing the 18th combination in 115.410548925 sec
finishing the 19th com

In [21]:
# Nearest talks to talk 1666 for the combination num_topics=30, alpha='asymmetric', eta='auto'.
sim_talks_1666[(30,'asymmetric','auto')]

[(1666, 1.0),
 (1403, 0.96541113),
 (1136, 0.93835831),
 (1672, 0.9308055),
 (1954, 0.93054229)]

In [35]:
# Nearest talks to talk 129 for the combination num_topics=18, alpha=1, eta=0.3.
sim_talks_129[(18,1,0.3)]

[(129, 1.0),
 (766, 0.95296592),
 (481, 0.94456315),
 (1841, 0.93380558),
 (1244, 0.93007463)]

In [69]:
# Combinations stored for talk 129 that have no entry for talk 1666.
set(sim_talks_129.keys()).difference(set(sim_talks_1666.keys()))

{(10, 1, 0.3)}

#### Save the two defaultdicts, sim_talks_1666 and sim_talks_129, as pickle files
sim_talks_129 has one extra key, (10, 1, 0.3), which is not useful.

In [13]:
import pickle

In [65]:
# Persist the grid-search results for talk 1666.
# The with-block closes the file as soon as the dump finishes, instead of
# leaving an open handle behind.
with open('./data/sim_talks_1666.pkl','wb') as pkl_file:
    pickle.dump(sim_talks_1666,pkl_file)

In [69]:
# Persist the grid-search results for talk 129.
# The with-block closes the file as soon as the dump finishes, instead of
# leaving an open handle behind.
with open('./data/sim_talks_129.pkl','wb') as pkl_file:
    pickle.dump(sim_talks_129,pkl_file)

In [14]:
# Reload the saved grid-search results for talk 1666; the with-block closes the file after reading.
# NOTE: pickle.load can execute arbitrary code - only load files that were
# produced by this notebook.
with open('./data/sim_talks_1666.pkl','rb') as pkl_file:
    sim_talks_1666 = pickle.load(pkl_file)

In [15]:
# Reload the saved grid-search results for talk 129; the with-block closes the file after reading.
# NOTE: pickle.load can execute arbitrary code - only load files that were
# produced by this notebook.
with open('./data/sim_talks_129.pkl','rb') as pkl_file:
    sim_talks_129 = pickle.load(pkl_file)

#### For both talks, select the combination of hyperparameters that generate best matched similar talks compared with the results given by LSI
The LSI results were generated in Ted_9_topic_modeling_LSI

For 1666, the top-5 returned similar talks from LSI (num_topics = 20, 50) are [1672,1954,1403,930,1032]

For 129, the top-6 returned similar talks from LSI (num_topics = 20,50 ) are [766,139,210,481,1902,826]

In [22]:
# Reference neighbours taken from the LSI results; each set also contains the query talk itself.
sim_set_1666 = {1672,1954,1403,930,1032,1666}
sim_set_129 = {766,139,210,481,1902,826,129}
# First-pass scoring: count how many talks each LDA result shares with the
# LSI reference set. The ranking inside the result is ignored - only set
# overlap matters (related to Jaccard distance).
res_1666 = [(key, len(sim_set_1666.intersection(x[0] for x in value)))
            for key, value in sim_talks_1666.items()]
# Best-overlapping combinations first.
res_1666 = sorted(res_1666, key = lambda x: x[1], reverse = True)

In [23]:
# Same overlap scoring for talk 129: (combination, number of talks shared
# with sim_set_129), best-overlapping combinations first.
res_129 = [(key, len(sim_set_129.intersection(x[0] for x in value)))
           for key, value in sim_talks_129.items()]
res_129 = sorted(res_129, key = lambda x: x[1], reverse = True)

Select the LDA models that give consistently good similarity predictions for the two talks when compared with the results generated by LSI. For talk 1666, the prediction should have at least 4 talks (including the talk itself) in common with the LSI prediction; for talk 129, the prediction should have at least 3 talks (including the talk itself) in common with the LSI prediction.

In [119]:
# Keep the combinations whose overlap with the LSI reference reaches the
# threshold: at least 4 shared talks for talk 1666, at least 3 for talk 129.
# Each result list is filtered over its own full length, so the two lists
# may have different sizes and no fixed entry count is assumed.
sel_res_1666 = [entry for entry in res_1666 if entry[1] >= 4]
sel_res_129 = [entry for entry in res_129 if entry[1] >= 3]
# Build the lookups once rather than once per key.
matches_1666 = dict(sel_res_1666)
matches_129 = dict(sel_res_129)
set_1666 = set(matches_1666)
set_129 = set(matches_129)
# Combinations that pass the threshold for both talks.
comm_set = set_1666.intersection(set_129)
# Pair each common combination with (matches for 1666, matches for 129).
comm_res = [(key,(matches_1666[key],matches_129[key])) for key in comm_set]

In [120]:
# Combinations that met the overlap threshold for both talks.
comm_set

{(25, 3, 'auto'),
 (30, 1, 0.3),
 (30, 'asymmetric', 'auto'),
 (40, 3, 0.3),
 (40, 'asymmetric', 'auto')}

### rerun those selected models with higher number of iterations and record its log-perplexity change

In [104]:
# Refit every shortlisted combination with more iterations (200) and keep the
# model together with the transformed corpus as (key, [model, corpus]).
corpuslist = []
ctr = 0   # kept as a plain counter: the next refit cell reads it after this loop
for key in comm_set:
    num_topics, alpha, eta = key
    time0 = time()
    lda_tmp = models.LdaModel(corpustrain, num_topics = num_topics, id2word = dictionary,
                              alpha = alpha, eta = eta,
                              chunksize = 200, iterations = 200, passes = 20)
    corpus_lda_tmp = lda_tmp[corpustrain]
    corpuslist.append((key,[lda_tmp,corpus_lda_tmp]))
    print('finishing up the {}th key calculation in {} seconds'.format(ctr,time()-time0))
    # NOTE(review): corpustrain[1400:] is a slice of the same corpus the model
    # was just fitted on, so this is not an out-of-sample perplexity - confirm
    # whether a separate hold-out corpus was intended.
    print('the last evaluated log-perplexity is {:.3f}'.format(lda_tmp.log_perplexity(corpustrain[1400:])))
    ctr += 1

finishing up the 0th key calculation in 95.7564558983 seconds
the last evaluated log-perplexity is -16.898
finishing up the 1th key calculation in 119.44173193 seconds
the last evaluated log-perplexity is -10.270
finishing up the 2th key calculation in 128.505738974 seconds
the last evaluated log-perplexity is -10.100
finishing up the 3th key calculation in 83.674382925 seconds
the last evaluated log-perplexity is -20.446
finishing up the 4th key calculation in 86.5635120869 seconds
the last evaluated log-perplexity is -10.374


In [122]:
# Remove the earlier (25, 3, 'auto') fit so the refit in the next cell
# replaces it instead of adding a second entry under the same key.
# Filtering by key, rather than slicing off the last element, does not depend
# on the order in which comm_set was iterated.
corpuslist = [item for item in corpuslist if item[0] != (25, 3, 'auto')]

In [124]:
# Refit the (25, 3, 'auto') combination with 600 iterations and add it to corpuslist.
time0 = time()
lda_tmp = models.LdaModel(corpustrain, num_topics = 25, id2word = dictionary,
                          alpha = 3, eta = 'auto',
                          chunksize = 200, iterations = 600, passes = 20)
corpus_lda_tmp = lda_tmp[corpustrain]
corpuslist.append(((25,3,'auto'),[lda_tmp,corpus_lda_tmp]))
# `ctr` still holds the value left behind by the previous refit loop.
print('finishing up the {}th key calculation in {} seconds'.format(ctr,time()-time0))
# NOTE(review): evaluated on a slice of the training corpus - see whether a
# separate hold-out corpus was intended.
print('the last evaluated log-perplexity is {:.3f}'.format(lda_tmp.log_perplexity(corpustrain[1400:])))

finishing up the 5th key calculation in 90.1669118404 seconds
the last evaluated log-perplexity is -10.323


### Summary:
    The better models are (25, 3,'auto'),(30,'asymmetric','auto') and (40,'asymmetric','auto') according to the internal log-perplexity calculation conducted during model training using an internal holdout set
    Due to the smaller size, I finally picked the model with 25 topics for visualization purpose.

### Save the models, the transformed corpus and the data frames
This is for the purpose of visualization and future follow-up tasks such as prediction; therefore the models are trained only on the training set. The results are saved in the r2py folder, which was originally intended for Dirichlet-regression rating prediction using R.

In [126]:
# For every kept model write three artefacts into ./r2py, named after the
# hyperparameter key: the transformed corpus (.mm), the model itself (.model)
# and the document-topic table (.csv).
for key, (lda_tmp, corpus_tmp) in corpuslist:
    keystr = '_'.join(map(str, key))
    # One row per document, one column per topic id present in the projection.
    topic_df = pd.DataFrame([dict(x) for x in corpus_tmp])
    corpuspath = './r2py/corpustrain_lda_'+ keystr + '.mm'
    modelpath = './r2py/lda_'+keystr+'.model'
    dfpath = './r2py/ldafeatures_'+keystr+'.csv'
    corpora.MmCorpus.serialize(corpuspath,corpus_tmp)
    lda_tmp.save(modelpath)
    topic_df.to_csv(dfpath,index = False)

### Final look at the similar talks given by each model for talks 1666 and 129 

In [127]:
# Reload the saved model for the combination num_topics=25, alpha=3, eta='auto'.
lda_tmp = models.LdaModel.load('./r2py/lda_25_3_auto.model')

In [128]:
# Open the matching transformed corpus that was serialized alongside the model.
corpus_tmp = corpora.MmCorpus('./r2py/corpustrain_lda_25_3_auto.mm')

In [129]:
# Build a similarity index over the topic-space vectors; it is queried by give_similar_talks below.
index_tmp = similarities.MatrixSimilarity(corpus_tmp)

In [135]:
# Ten most similar talks to talk 1666 (the query talk itself is part of the result).
sim1666 = give_similar_talks(1666,10,corpus_tmp,index_tmp)

In [131]:
# Ten most similar talks to talk 129 (the query talk itself is part of the result).
sim129 = give_similar_talks(129,10,corpus_tmp,index_tmp)

In [136]:
# (talk id, similarity score) pairs for talk 1666, highest score first.
sim1666

[(1666, 1.0),
 (1403, 0.99174792),
 (1672, 0.9789983),
 (1954, 0.95567638),
 (1033, 0.95540977),
 (297, 0.95210242),
 (1248, 0.95129913),
 (1040, 0.95107841),
 (1596, 0.95061839),
 (2074, 0.93506551)]

In [133]:
# (talk id, similarity score) pairs for talk 129, highest score first.
sim129

[(129, 1.0),
 (139, 0.93286169),
 (766, 0.9226687),
 (1366, 0.89578968),
 (481, 0.87153292),
 (1266, 0.86575675),
 (1515, 0.8590467),
 (1630, 0.85715783),
 (937, 0.85688257),
 (1958, 0.8541826)]

### Filter identical topics  

Some of these models may contain extremely weak topics (e.g., with an all-zero word-topic distribution). Because the topic columns will be used as feature columns in future prediction models (e.g., prediction of ratings), such columns, which are likely to be collinear, need to be filtered so that only one of them is kept. A correlation matrix is used to detect these columns, and the filtered document-topic dataframe is saved.

In [45]:
def filtertopic(topic_df):
    """
    Drop topic columns that are perfectly correlated with an earlier column.

    topic_df -- document-topic DataFrame (rows: documents, columns: topics);
                missing entries are treated as 0

    Returns a new DataFrame with missing values filled with 0 and, for every
    group of columns whose pairwise correlation rounds to 1 at four decimals,
    only the first column kept. The input frame is not modified.
    """
    # Work on a filled copy so the caller's frame keeps its NaNs.
    filled = topic_df.fillna(0)
    corrmatrix = np.round(np.array(filled.corr()),decimals = 4)
    row,col = np.where(corrmatrix == 1)
    # Only look above the diagonal: for each correlated pair the later column is dropped.
    dup_positions = sorted(set(c for r, c in zip(row,col) if r < c))
    # np.where returns column positions; translate them to column labels
    # before dropping, since labels need not be 0..n-1.
    dup_labels = [filled.columns[pos] for pos in dup_positions]
    return filled.drop(dup_labels,axis = 1)

In [46]:
# Write the de-duplicated document-topic table of every kept model to
# ./r2py/fldafeatures_<key>.csv.
for key, model_and_corpus in corpuslist:
    corpus_tmp = model_and_corpus[1]
    keystr = '_'.join(map(str, key))
    # One row per document, one column per topic id present in the projection.
    tp_df = pd.DataFrame([dict(x) for x in corpus_tmp])
    ftp_df = filtertopic(tp_df)
    dfpath = './r2py/fldafeatures_'+keystr+'.csv'
    ftp_df.to_csv(dfpath,index = False)