# Topic modeling

This notebook performs LSI topic modeling on the training data set. The number of topics is the key parameter; it was varied, and the resulting models and corpora were saved for later prediction of ratings.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import re
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import unidecode
%matplotlib inline
import copy
from time import time

In [2]:
import scipy.stats as ss
import scipy.sparse as sp
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from gensim import corpora,models,similarities
from gensim.utils import lemmatize
from string import punctuation
from spacy.parts_of_speech import ADV, NOUN, ADJ, PUNCT, VERB
from spacy.en import English,STOPWORDS
from spacy.orth import *
import logging
from collections import defaultdict

In [4]:
from HTMLParser import HTMLParser

## 1. Load in caption and ratings 

### 1.1 Load in captions 

In [13]:
captions = pd.read_json('captions_f.json')

In [14]:
captions.sort_values(by = 'id',inplace = True)

In [15]:
captions.reset_index(inplace = True, drop = True)

In [16]:
captions.columns

Index([u'And_so_total', u'app_as_para', u'app_total_log', u'applause_per_word',
       u'applause_total', u'div_per_word', u'diversion', u'funny_binary',
       u'id', u'ithinks', u'label', u'laugh_as_para', u'laughter_per_word',
       u'laughter_total', u'length', u'neg_binary', u'noun_token', u'num_para',
       u'para_length', u'stopwords_ratio', u'storywords', u'text',
       u'verb_token', u'word_per_sec', u'wordcounts'],
      dtype='object')

In [17]:
captions.shape

(1535, 25)

### 1.2 Drop the talks that have essentially no captions
These talks are musical performances, but their transcript field is still filled in the online database: it contains only placeholders such as (applause). The affected talk ids are to_drop_ind = [99,1156,1464,2147].

##### get the row indices of these four talks

In [18]:
to_drop_ind = [99,1156,1464,2147]

In [19]:
to_drop_rows = [captions[captions.id == x].index.values[0] for x in to_drop_ind]

In [20]:
to_drop_rows

[75, 736, 948, 1460]

In [21]:
captions.drop(captions.index[to_drop_rows],inplace = True)

In [22]:
captions.reset_index(inplace = True, drop = True)

##### save to json file captions_f3.json

In [16]:
captions.to_json('captions_f3.json')

##### make two dictionaries: iddict {id: row_number} and rowdict {row_number: id} to convert between row numbers and talk ids

In [6]:
ids = captions.id

In [11]:
# talk id -> row number; ids.iteritems() yields (row index, talk id) pairs
iddict = dict([(item[1],int(item[0])) for item in ids.iteritems()])

In [12]:
# row number -> talk id; the inverse lookup of iddict
rowdict = dict([(item[0],item[1]) for item in ids.iteritems()])

## 2. Tokenize the text with Spacy

### 2.1 remove stopwords, POS tagging and lemmatize, keep only nouns and verbs

The advantages of spaCy over NLTK are obvious, speed in tagging, and also spaCy accomplishes lemma, tagging altogether in one operation

In [74]:
nlp = English()

In [75]:
def tokenize(text,islist = True):
    """
    Turn a transcript into spaCy tokens, keeping only nouns and verbs.

    in:
        text   -- list of transcript lines (default) or a single string
        islist -- True when text is a list of lines; the lines are then
                  joined with single spaces before processing
    out:
        list of spaCy tokens whose coarse part-of-speech is NOUN or VERB
    """
    raw = ' '.join(text) if islist else text
    # strip parenthesised markers such as '(Laughter)' before tagging
    cleaned = re.sub('\([\w\s]+\)','',raw)
    wanted_pos = (NOUN, VERB)
    return [tok for tok in nlp(cleaned, parse = False) if tok.pos in wanted_pos]

In [32]:
tokens = captions.text.apply(tokenize)

In [33]:
tokens.shape

(1531,)

In [76]:
def get_lemma(token_list):
    """
    Reduce tokens to their lemmas, dropping stopwords and one-character lemmas.

    in:
        token_list -- iterable of tokens exposing a lemma_ attribute
    out:
        list of lemma strings, in the original token order
    """
    lemmas = (tok.lemma_ for tok in token_list)
    return [lemma for lemma in lemmas
            if lemma not in STOPWORDS and len(lemma) > 1]

In [35]:
tokens_final = tokens.apply(get_lemma)

In [36]:
bigram = models.Phrases(tokens_final)

##### save the bigram object for the transformation on the test set 

In [37]:
bigram.save('./data/bigram')

In [38]:
bigram = models.Phrases.load('./data/bigram')

In [39]:
token_phrase_final = list(tokens_final.apply(lambda x:bigram[x]))

## 3 Gensim

### 3.1 generate dictionary 

In [40]:
dictionary = corpora.Dictionary(token_phrase_final)

##### filter out low and high frequency tokens using the gensim dictionary; the thresholds have not been tuned yet

In [41]:
dictionary.filter_extremes(no_below = 10, no_above = 0.5)

In [42]:
dictionary.save('./data/tedtrain.dict')

In [43]:
print(dictionary)

Dictionary(6129 unique tokens: [u'yellow', u'circuitry', u'centimeter', u'aggression', u'payoff']...)


In [44]:
# to see mapping between words and their ids
# print(dictionary.token2id)

##### load dictionary 

In [15]:
dictionary = corpora.Dictionary.load('./data/tedtrain.dict')

### 3.2 generate corpus using Bag of Words

The corpus is generated using dictionary.doc2bow(text); each output of this function is a list of tuples giving the (word id, number of occurrences) in the input text.

In [46]:
corpustrain = [dictionary.doc2bow(text) for text in token_phrase_final]

##### save the corpus to disk 

In [47]:
corpora.MmCorpus.serialize('./data/corpustrain.mm',corpustrain)

### 3.3 Tf-idf conversion 

training the model

In [126]:
tfidf = models.TfidfModel(corpustrain)

transform the corpus using tfidf

In [127]:
corpustrain_idf = tfidf[corpustrain]

#### save both the transformed corpus and the model 

In [128]:
tfidf.save('./data/tfidf.model')

In [129]:
corpora.MmCorpus.serialize('./data/corpustrain_idf.mm',corpustrain_idf)

#####  load the tfidf corpora

In [26]:
corpustrain_idf = corpora.MmCorpus('./data/corpustrain_idf.mm')

### 3.4 LSI model 

The saved model and corpus are trained using 50 topics and power_iters = 50, different num of topics (20, 100, 150) are tested afterwards and evaluated by manually checking the similarity of extracted similar documents

In [7]:
lsi = models.LsiModel(corpustrain_idf,id2word=dictionary,num_topics=50,onepass=False,power_iters=50)

In [132]:
# transform corpus
corpustrain_lsi = lsi[corpustrain_idf]

In [90]:
def print_topic_doc(projection,ids,num_top_topics,talkid = True):
    """
    Print the most relevant topics for the chosen documents.

    in:
        projection     -- indexable corpus; projection[row] yields
                          (topic_id, weight) pairs for that document
        ids            -- talk ids (talkid = True) or row numbers (talkid = False)
        num_top_topics -- number of topics to print per document
        talkid         -- when True, ids are converted to row numbers via iddict
    out:
        None, the results are printed
    """
    rowids = [iddict[x] for x in ids] if talkid else ids
    for given_id, rowid in zip(ids, rowids):
        # rank by absolute weight; sorted() builds a new list, so the
        # caller's projection is never reordered as a side effect
        proj = sorted(projection[rowid], reverse = True, key = lambda x:abs(x[1]))
        # single-argument print(...) behaves the same as the print statement
        print('The top {} topics of the {}th document'.format(num_top_topics,rowid))
        if talkid:
            print('The talk id is {}'.format(given_id))
        print(proj[:num_top_topics])
        print('')

In [134]:
print_topic_doc(corpustrain_lsi,[704,1666,1672],5,talkid = True)

The top 5 topics of the 451th document
The talk id is 704
[(0, 0.20301381285420877), (1, 0.17764133715921016), (12, -0.11135815262498935), (33, 0.10613426473345948), (4, 0.08559800199831194)]

The top 5 topics of the 1087th document
The talk id is 1666
[(1, 0.24595700249439217), (0, 0.18916414006982193), (12, -0.11931160688132941), (33, 0.11351690565191623), (7, -0.10664128738445322)]

The top 5 topics of the 1091th document
The talk id is 1672
[(1, 0.28528924821672114), (0, 0.26398507354989315), (12, -0.22064837724483552), (33, 0.2114417701333349), (7, -0.15362329281039333)]



#### test similarity measurements 

In [10]:
index = similarities.MatrixSimilarity(corpustrain_lsi)

##### save the index

In [None]:
index.save('./data/trainsim.index')

In [19]:
index = similarities.MatrixSimilarity.load('./data/trainsim.index')

In [98]:
def give_similar_talks(talkid, num_of_talks,corpus,index):
    """
    Find the talks most similar to a given talk.

    in:
        talkid       -- id of the query talk (converted to a row via iddict)
        num_of_talks -- maximum number of neighbours to return
        corpus       -- indexable corpus; corpus[row] is the query document
        index        -- similarity index; index[doc] gives one score per row
    out:
        list of (talkid, similarity score) tuples in descending score order,
        with the query talk itself left out
    """
    query_row = iddict[talkid]
    scores = index[corpus[query_row]]
    ranked = sorted(enumerate(scores), key = lambda pair: pair[1], reverse = True)
    neighbours = []
    for row, score in ranked:
        other_id = rowdict[row]
        if other_id != talkid:
            neighbours.append((other_id, score))
    return neighbours[:num_of_talks]

In [99]:
simsids = give_similar_talks(1666,5,corpustrain_lsi,index)

In [100]:
simsids

[(1672, 0.9052946),
 (1954, 0.90312833),
 (1403, 0.86130905),
 (930, 0.78333473),
 (751, 0.77572221)]

#### save model and transformed corpus 

In [142]:
lsi.save('./data/lsi.model')

In [143]:
corpora.MmCorpus.serialize('./data/corpustrain_lsi.mm',corpustrain_lsi)

##### load the model and corpus of LSI 

In [17]:
corpustrain_lsi = corpora.MmCorpus('./data/corpustrain_lsi.mm')

In [18]:
lsi = models.LsiModel.load('./data/lsi.model')

### LSI with varied number of topics for later rating prediction 

The trend is clear, with increased number of topics, cosine similarity between talks drops due to higher dimension, also, a coherent topic such as "Afghanistan, women, education" can be separated into "Afghanistan" and "women, education", therefore, similar in one topic dimension versus another happens more often. For now, set number to 50 (topics), seems to work very well by manually comparing the talks.

In [38]:
topicsnos = [20,25,30,40]

In [39]:
simtalks1666 = defaultdict(list)
simtalks129 = defaultdict(list)

In [40]:
# Train one LSI model per topic count in topicsnos; each model and its
# transformed corpus are kept in lists so a later cell can save them.
corpuslist = []
lsimodellist = []
for i in xrange(len(topicsnos)):
    num = topicsnos[i]
    # %time prints how long training this model takes
    %time lsitmp = models.LsiModel(corpustrain_idf,id2word=dictionary,num_topics=num,onepass=False,power_iters=50)
    corpus_tmp = lsitmp[corpustrain_idf]
    corpuslist.append(corpus_tmp)
    lsimodellist.append(lsitmp)
    # spot-check: the 5 most similar talks to talks 129 and 1666 under this model
    index_tmp = similarities.MatrixSimilarity(corpus_tmp)
    simsid129 = give_similar_talks(129,5,corpus_tmp,index_tmp)
    simsid1666 = give_similar_talks(1666,5,corpus_tmp,index_tmp)
    # results are keyed by position in topicsnos, not by the topic count itself
    simtalks1666[i] = simsid1666
    simtalks129[i] = simsid129

CPU times: user 2min 17s, sys: 1.46 s, total: 2min 19s
Wall time: 1min 30s
CPU times: user 2min 24s, sys: 3.36 s, total: 2min 27s
Wall time: 1min 41s
CPU times: user 2min 25s, sys: 3.07 s, total: 2min 28s
Wall time: 1min 41s
CPU times: user 2min 25s, sys: 3.04 s, total: 2min 28s
Wall time: 1min 41s


##### save all models for  rating prediction

In [48]:
# Save every model / transformed-corpus pair trained in the sweep; the topic
# count is embedded in the file name so a model can be picked by size later.
# The loop length follows topicsnos instead of a hard-coded 4, so the cell
# stays correct when the list of topic counts is edited.
for i in xrange(len(topicsnos)):
    lsi_tmp = lsimodellist[i]
    corpustrain_tmp = corpuslist[i]
    modelpath = './data/lsi_'+str(topicsnos[i])+'.model'
    corpuspath = './data/corpustrain_lsi_'+str(topicsnos[i])+'.mm'
    lsi_tmp.save(modelpath)
    corpora.MmCorpus.serialize(corpuspath,corpustrain_tmp)

## 4 Topic modeling on the speakers background info 

### 4.1 Construct speaker_background df 

Speaker background info has not been used so far, except for extraction of the gender and one_speaker features. The description texts (whylisten + whotheyare + description) will be used here to profile speakers and test their similarities.
Speaker info is queried by joining train3.json and speakers.json: speaker ids come from train3.json (later also from test2.json), and the texts are then looked up by speaker id in the speakers.json dataframe.

In [57]:
speakers = pd.read_json('speakers.json')

In [58]:
speakers.sort_values(by = 'id',inplace = True)
speakers.set_index('id',inplace = True)

In [59]:
# initialize parser as an object
parser = HTMLParser()

In [60]:
# define a function to get speakers text info given speakers ids
def speakerinfo(ids):
    """
    Build one background text for a talk from its speakers' profiles.

    in:
        ids -- list of speaker ids, could be multiple speakers
    out:
        string made of the 'whylisten', 'whotheyare' and 'description'
        fields of every speaker, HTML entities unescaped; the profiles of
        different speakers are separated by a single space
    """
    fields = ['whylisten', 'whotheyare', 'description']
    profiles = []
    for speakerid in ids:
        # label-based lookup: speakers is indexed by speaker id
        text = ' '.join([speakers.loc[speakerid, field] for field in fields])
        profiles.append(parser.unescape(text))
    # join with a space so the last word of one profile does not fuse with
    # the first word of the next one and create a bogus token
    return ' '.join(profiles)

In [62]:
train = pd.read_json('train3.json')

In [67]:
train.sort_values(by = 'id',inplace=True)

In [68]:
train.reset_index(inplace=True,drop = True)

In [69]:
background = train['speaker_ids'].apply(speakerinfo)

In [70]:
speaker_background = pd.DataFrame({'talk_id':train['id'],'background':background})

### 4.2  Tokenize the background text with Spacy

In [77]:
sp_tokens = background.apply(tokenize,islist = False)

In [78]:
sp_tokens_final = sp_tokens.apply(get_lemma)

In [79]:
sp_bigram = models.Phrases(sp_tokens_final)

In [164]:
# sp_bigram.save('./data/spbigram')

In [80]:
sp_phrase_final = list(sp_tokens_final.apply(lambda x:sp_bigram[x]))

### 4.3 Gensim: bag-of-words, Tf-idf, LSI 

##### generate dictionary

In [81]:
sp_dictionary = corpora.Dictionary(sp_phrase_final)

filter out extreme tokens

In [82]:
sp_dictionary.filter_extremes(no_below = 5, no_above= 0.5)

In [83]:
print(sp_dictionary)

Dictionary(3517 unique tokens: [u'similarity', u'consumer_product', u'dynamic', u'protest', u'circuitry']...)


##### bag-of-word corpus 

In [84]:
sp_corpustrain = [sp_dictionary.doc2bow(text) for text in sp_phrase_final]

##### Tf-idf 

In [85]:
sp_tfidf = models.TfidfModel(sp_corpustrain)

In [86]:
sp_corpustrain_idf = sp_tfidf[sp_corpustrain]

### LSI with 50 topics

In [87]:
sp_lsi = models.LsiModel(sp_corpustrain_idf,num_topics=50,id2word=sp_dictionary,onepass=False,power_iters=50)

In [88]:
sp_corpustrain_lsi = sp_lsi[sp_corpustrain_idf]

In [174]:
# sp_lsi = models.LsiModel.load('./data/sp_lsi.model')

In [91]:
print_topic_doc(sp_corpustrain_lsi,[1666,1672],10,talkid = True)

The top 10 topics of the 1087th document
The talk id is 1666
[(0, 0.19789349354688271), (11, 0.18850143764664773), (3, 0.1763222265014539), (7, -0.17021699336657289), (2, 0.16229029980375462), (1, -0.11404248218845903), (5, 0.10910371094620626), (10, -0.091794627838949816), (6, -0.088572105022443767), (8, -0.087870993730281718)]

The top 10 topics of the 1091th document
The talk id is 1672
[(7, -0.15442791297992023), (0, 0.14389649481781347), (11, 0.14036567467480679), (2, 0.12849015123931395), (16, -0.097773306861731774), (10, -0.094912212800507254), (20, 0.088718770404556474), (25, -0.074969210588709936), (1, -0.07407779413334907), (37, 0.07393693318155832)]



###### example of similar speaker profile extraction 

In [92]:
sp_index = similarities.MatrixSimilarity(sp_corpustrain_lsi)

In [97]:
sp_index.save('./data/sp_trainsim.index')

In [49]:
sp_index = similarities.MatrixSimilarity.load('./data/sp_trainsim.index')

In [105]:
sp_simlist = give_similar_talks(152,5,sp_corpustrain_lsi,sp_index)

In [106]:
sp_simlist

[(127, 1.0),
 (1321, 0.85291243),
 (1851, 0.81708348),
 (1842, 0.80890816),
 (2105, 0.77260721)]

#### save all models and corpus of speaker background 

In [107]:
sp_lsi.save('./data/sp_lsi.model')

In [108]:
corpora.MmCorpus.serialize('./data/sp_corpustrain_lsi.mm',sp_corpustrain_lsi)

In [109]:
sp_tfidf.save('./data/sp_tfidf.model')

In [110]:
corpora.MmCorpus.serialize('./data/sp_corpustrain_idf.mm',sp_corpustrain_idf)

In [111]:
sp_dictionary.save('./data/sp_train.dict')

In [112]:
corpora.MmCorpus.serialize('./data/sp_tedtrain.mm',sp_corpustrain)

#### Speaker topic modeling LSI with varied number of topics 

In [113]:
sp_simtalks1666 = defaultdict(list)
sp_simtalks129 = defaultdict(list)

In [114]:
# Train one speaker-background LSI model per topic count in topicsnos; each
# model and its transformed corpus are kept in lists so a later cell can save them.
sp_corpuslist = []
sp_lsimodellist = []
for i in xrange(len(topicsnos)):
    num = topicsnos[i]
    # %time prints how long training this model takes
    %time sp_lsitmp = models.LsiModel(sp_corpustrain_idf,id2word=sp_dictionary,num_topics=num,onepass=False,power_iters=50)
    sp_corpus_tmp = sp_lsitmp[sp_corpustrain_idf]
    sp_corpuslist.append(sp_corpus_tmp)
    sp_lsimodellist.append(sp_lsitmp)
    # spot-check: the 5 most similar entries to talks 129 and 1666 under this model
    sp_index_tmp = similarities.MatrixSimilarity(sp_corpus_tmp)
    sp_simsid129 = give_similar_talks(129,5,sp_corpus_tmp,sp_index_tmp)
    sp_simsid1666 = give_similar_talks(1666,5,sp_corpus_tmp,sp_index_tmp)
    # results are keyed by position in topicsnos, not by the topic count itself
    sp_simtalks1666[i] = sp_simsid1666
    sp_simtalks129[i] = sp_simsid129

CPU times: user 32.5 s, sys: 579 ms, total: 33.1 s
Wall time: 8.49 s
CPU times: user 32.6 s, sys: 505 ms, total: 33.2 s
Wall time: 8.37 s
CPU times: user 32.4 s, sys: 1.26 s, total: 33.6 s
Wall time: 8.75 s
CPU times: user 35.2 s, sys: 1.2 s, total: 36.4 s
Wall time: 9.5 s


##### save all models for rating predictions 

In [115]:
# Save every speaker-background model / transformed-corpus pair trained in
# the sweep; the topic count is embedded in the file name.
# The loop length follows topicsnos instead of a hard-coded 4, so the cell
# stays correct when the list of topic counts is edited.
for i in xrange(len(topicsnos)):
    sp_lsi_tmp = sp_lsimodellist[i]
    sp_corpustrain_tmp = sp_corpuslist[i]
    modelpath = './data/sp_lsi_'+str(topicsnos[i])+'.model'
    corpuspath = './data/sp_corpustrain_lsi_'+str(topicsnos[i])+'.mm'
    sp_lsi_tmp.save(modelpath)
    corpora.MmCorpus.serialize(corpuspath,sp_corpustrain_tmp)

### Testing of transcripts and speaker background topic modeling (similarities) 

#### Test speaker similarities

In [124]:
sp_corpustrain_lsi = corpora.MmCorpus('./data/sp_corpustrain_lsi.mm')

In [125]:
sp_index = similarities.MatrixSimilarity(sp_corpustrain_lsi)

give a talk id, find the most similar speakers to the speaker of this talk

In [126]:
sp_simlist = give_similar_talks(1666,5,sp_corpustrain_lsi,sp_index)

In [127]:
sp_simlist

[(1672, 0.92408538),
 (1248, 0.81673884),
 (1954, 0.80270743),
 (1031, 0.79996693),
 (1646, 0.74974966)]

#### Test transcripts similarities 

In [128]:
corpustrain_lsi = corpora.MmCorpus('./data/corpustrain_lsi.mm')

In [129]:
index = similarities.MatrixSimilarity(corpustrain_lsi)

In [130]:
simlist = give_similar_talks(152,5,corpustrain_lsi,index)

In [131]:
simlist

[(127, 0.91849649),
 (91, 0.88651842),
 (157, 0.85837495),
 (330, 0.84537607),
 (2114, 0.83207947)]

## Summary:
1. Topic modeling is done with LSI; the results are so far satisfactory, especially with the transcripts. The drawback is that the topics are hard to interpret
2. The results are saved in the ./data folder, both the models and the transformed corpora, and can be loaded directly for rating prediction
3. Models and corpora with different numbers of topics (20,25,30,40,50) were saved; the number of topics can be tuned as a hyperparameter when doing k-nn rating prediction
4. Test set transcripts and speaker backgrounds should be transformed using the trained models