# Playing with the gensim coherence measures

## Introduction

Testing out difference 'coherence' pipeline combinations.
Topic models: LDA, NNMF, LSA, PCA + varimax

In [1]:
%load_ext autoreload
%autoreload 2
import logging
import warnings
import numpy as np
import pandas as pd
import io
from process_topics import show_topic_words, run_all, coherence_widget, NewCoherence, coherence_scores
from gensim.models import CoherenceModel, LdaModel, HdpModel, nmf, LdaMulticore
from gensim.corpora import Dictionary, csvcorpus
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import *
from time import time
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity
from ipywidgets import interact, interactive, IntSlider, Layout, interact_manual, fixed, interactive_output, FloatSlider
import ipywidgets as widgets
import qgrid
import logging

#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

  _df_json = Unicode('', sync=True)


### Set up corpus

Set up test corpus, should be tokenized, sotpwords removed

In [2]:
%%time

elections = io.open('../data/Election2008Paragraphes.txt',encoding = "ISO-8859-1")
electionlines  =elections.readlines()

CUSTOM_FILTERS = [lambda x: x.lower(),  strip_punctuation, strip_multiple_whitespaces, strip_numeric,remove_stopwords, strip_short]

texts = [preprocess_string(line, filters=CUSTOM_FILTERS) for line in electionlines]


dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tf_vectorizer = CountVectorizer()
tftexts = [' '.join(text) for text in texts]
tf = tf_vectorizer.fit_transform(tftexts)
tf[tf>1]=1
xc = (tf.T * tf)
cooccur =xc.todense()

data = [texts, dictionary, corpus, tf_vectorizer, tf, cooccur]


CPU times: user 4.26 s, sys: 882 ms, total: 5.14 s
Wall time: 5.3 s


In [10]:
#changes made to FARotate in the code will require kernel restart if reloading this cell!
m = coherence_widget(data)
display(m)


aW50ZXJhY3RpdmUoY2hpbGRyZW49KFJhZGlvQnV0dG9ucyhkZXNjcmlwdGlvbj11J0Nob29zZSBNb2RlbCcsIGxheW91dD1MYXlvdXQod2lkdGg9dScyNTBweCcpLCBvcHRpb25zPSgnTk1GJyzigKY=


In [5]:
topics, coherences, both= m.result

In [6]:
pd.DataFrame(topics)

Unnamed: 0,Topic
0,care health insurance plan costs quality americans families companies veterans
1,america world people today global energy nation future work like
2,need energy world oil security change national nuclear military time
3,people know american want going think good said got like
4,government american federal people security economy way time trust spending
5,iraq troops military war iran president qaeda afghanistan security plan
6,president country want states believe time united going change thank
7,war time world years like bush end going said americans
8,tax jobs families help taxes workers americans working middle economy
9,new energy jobs create nuclear york years like time use


In [30]:
from __future__ import division
from process_topics import coherence_scores
from similarity import *
style = {'width': '1000px', 'height': '30px'}

w=IntSlider(10,0,1000, description = 'w',layout=style)
w2=IntSlider(10,0,1000, description = 'w2',layout=style) 
co=IntSlider(10,0,1000, description = 'co',layout=style)
ndocs=IntSlider(1000,0,10000, description = 'ndocs',layout=style)
print co.value
exp = FloatSlider(value=0.0, min=0, max=20, description='exp', layout=style)
joint = FloatSlider(value=0.0, min=0, max=20, description='exp', layout=style)

exp.style.handle_color= 'yellow'
#print widgets.__dict__
ui = widgets.VBox([w, w2, co, ndocs,exp])

    
def similarities(w,w2,co,ndocs, held=[]):
    scores = calculate_sims(w,w2,co,ndocs,'all')
    #held.append(w)
    display(pd.DataFrame(scores))
    return scores

out =interactive_output(similarities,
         {'w':w,'w2':w2,'co':co,'ndocs':ndocs})

out.layout.height= '500px'
display(ui,out)

@out.capture()
def set_exp_per_rest(change):
    print (w.value+ w2.value)/ ndocs.value
    exp.value = ((w.value+ w2.value)/ ndocs.value) *100
    print exp.value 
    
w.observe(set_exp_per_rest, names="value")
w2.observe(set_exp_per_rest, names="value")
co.observe(set_exp_per_rest, names="value")
ndocs.observe(set_exp_per_rest, names="value")


    

10


VkJveChjaGlsZHJlbj0oSW50U2xpZGVyKHZhbHVlPTEwLCBkZXNjcmlwdGlvbj11J3cnLCBsYXlvdXQ9TGF5b3V0KGhlaWdodD11JzMwcHgnLCB3aWR0aD11JzEwMDBweCcpLCBtYXg9MTAwMCnigKY=


Output(layout=Layout(height=u'500px'))

In [None]:
full =os.listdir('/Users/timothypowell/Downloads/movie')


In [2]:
import os

plots = []
with open('../data/movieplotsawk') as f:
    plots =f.readlines()


plots = [i.split() for i in plots]
dictionarymovie = Dictionary(plots)
corpusmovie = [dictionarymovie.doc2bow(text) for text in plots]

dictionarymovie.token2id['strip']


In [11]:

tmfile =open('../data/topicsMovie.txt')
topicsmovie = [i.rstrip('\n').split() for i in tmfile.readlines()]

In [12]:

#moviecoherence = coherence_scores(coherence='all', corpus= corpusmovie, dictionary =dictionarymovie, topics= topicsmovie)
what= coherence_scores(coherence='all', corpus= corpusmovie, dictionary =dictionarymovie, topics= topicsmovie)

UWdyaWRXaWRnZXQoZ3JpZF9vcHRpb25zPXsnaGlnaGxpZ2h0U2VsZWN0ZWRSb3cnOiBUcnVlLCAnZnVsbFdpZHRoUm93cyc6IFRydWUsICdyb3dIZWlnaHQnOiAyOCwgJ2VuYWJsZUNvbHVtblLigKY=


In [33]:
tmgold = open('../data/goldMovie.txt')
ratingsmovie = [float(i.rstrip('\n')) for i in tmgold.readlines()]
ratingsdict = {"ratings": ratingsmovie}
ratingsdf = pd.DataFrame(ratingsdict)
ratingsandcoherence =pd.concat([ratingsdf,pd.DataFrame(what)], axis =1)

display(ratingsandcoherence.sort_values('ratings'))

Unnamed: 0,ratings,association,chisquare,dice,gmean,inclusion,jaccard,joint_prob,log_cond,npmi,pmi,pmi2,zscore
81,0.000,0.090305,0.142337,0.115733,0.271017,0.488675,0.139138,0.542996,-1.336708,0.176333,9.487617,9.490893,9.435057
88,0.125,0.034875,0.099395,0.083470,0.175523,0.233863,0.092503,0.158329,-1.592453,0.166754,9.596125,9.599773,7.672908
18,0.125,0.084798,0.118868,0.112031,0.255031,0.438781,0.133919,0.548810,-1.596898,0.134314,9.300867,9.304370,7.586189
8,0.125,0.038650,0.104009,0.082443,0.189006,0.319059,0.090520,0.204164,-1.276650,0.168968,9.602649,9.606264,7.746514
28,0.125,0.020949,0.080097,0.050569,0.134449,0.315396,0.053949,0.105192,-2.348463,0.174108,9.807114,9.815277,6.028312
6,0.250,0.029761,0.092828,0.077850,0.167205,0.238849,0.085005,0.149540,-1.558869,0.164732,9.617456,9.621248,7.139559
29,0.250,0.133683,0.180632,0.160285,0.350840,0.523166,0.197065,0.843835,-1.267662,0.192663,9.412032,9.413645,11.432369
43,0.250,0.047474,0.149946,0.092405,0.200341,0.292121,0.104835,0.143050,-1.289781,0.264426,10.059618,10.065207,11.787031
80,0.250,0.049019,0.109873,0.075004,0.194551,0.439574,0.084564,0.280422,-1.828969,0.167591,9.600189,9.605697,7.536683
34,0.375,0.118811,0.184501,0.157256,0.336802,0.473219,0.188745,0.692537,-1.152141,0.206697,9.478885,9.480493,12.287940


In [21]:
display(pd.DataFrame(what))

AttributeError: 'list' object has no attribute 'update'

In [108]:
np.set_printoptions(threshold=np.inf, linewidth=1000)

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1500)
pd.set_option('display.column_space', 2)

texts2 = [preprocess_string(line, filters=CUSTOM_FILTERS) for line in e[1:5]]
for tex in texts2:
    print ' '.join(tex)
tf_vectorizer = CountVectorizer()
tftexts = [' '.join(text) for text in texts2]
tf = tf_vectorizer.fit_transform(tftexts)
tf[tf>1]=1

xc = (tf.T * tf)
print 'occur'
print tf.toarray()
print 'cooccur'
tf_feature_names = tf_vectorizer.get_feature_names()
for i,name in enumerate(tf_feature_names):
    width = 12
    spacer =width - (len(str(i)) + len(name))
    if i-1%10==0 or i==0:
        print i,name,' '*spacer,
    else:
        print i,name,' '*spacer,
print ''
cooccur =xc.todense()
for i,x in enumerate(cooccur):
    print tf_feature_names[i],tf_feature_names[i+i]
    print float(cosine_similarity(x,cooccur[i+1])), float(cosine_distances(x,cooccur[i+1])), 'eucl ', float(euclidean_distances(x,cooccur[i+1]))
display(pd.DataFrame(cooccur))



 year democratic unionists face historic choice share political power nationalist community
year meet hope mother prayer taken government hands ireland road permanent peace
yeats said long sacrifice stone heart wait hearts turn stone challenge political leaders follow lincoln called better angels nature century look forward write new chapter irish history
want thank honor know mother loved adored awards given makes feel totally worthy mother eyes
occur
[[0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0]
 [0 1 0 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 1 0 1]
 [1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0]]
cooccur
0 adored       1 angels       2 awards    

IndexError: list index out of range

In [109]:
cooccur =xc.todense()

display(pd.DataFrame(cooccur))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,0,0
1,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1
2,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,0,0
3,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1
4,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1
5,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1
6,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1
7,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1
8,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
9,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0



