# Topic Modelling & NER 

Import all the needed libraries

In [1]:
import re
import os
import nltk
import pandas as pd
import numpy as np
import scipy
import codecs
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import pprint as pprint
from gensim.models import Phrases
from gensim import models, corpora
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In the code above all the NIPS papers are going to be inserted into a list, ignoring all the characters that provoke encoding errors. The number of docs should be 1740.

In [2]:
dir = 'nipstxt/'
years = ['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
subDirs = ['nips' + yr for yr in years]
docs = []

In [3]:
for element in subDirs:
    doc_files = os.listdir(dir + element)
    for item in doc_files:
        with codecs.open(dir + element + '/' + item, encoding='utf-8', errors='ignore') as doc_file:
            text = doc_file.read()
        docs.append(text)
        
print len(docs)

1740


In the section below we are going to perform the needed preprocessing. To be more specific we are going to remove stopwords, transform all words to lowercase and remove punctuation. Then we lemmatize the words and keep only those whose length exceed the size of 3 characters.

In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
countVectorizer = CountVectorizer()
tfidfVectorizer = TfidfVectorizer()
punctFree = re.compile('.*[A-Za-z].*')

In [5]:
def cleaning(text):
    token = nltk.word_tokenize(text)
    lowerCase = [word.lower() for word in token]
    raw_words = [tok for tok in lowerCase if punctFree.match(tok)]
    filtering = list(filter(lambda l: l not in stop_words, raw_words))
    lemmas = [lemmatizer.lemmatize(t) for t in filtering]
    lemmas = [lemma for lemma in lemmas if len(lemma) > 2]
    return lemmas

In [6]:
tokens = []
for text in docs:
    tokens.append(cleaning(text))

In [7]:
len(tokens[0])

2538

We are going to join the tokens of a doc and vectorize the words via term frequencies and via inverse document frequencies TF-IDF

In [8]:
# Join the tokens of a paper to be able to vectorize the words
joined_tokens = [" ".join(item) for item in tokens]
count_vectors = countVectorizer.fit_transform(joined_tokens)
tfidf_vectors = tfidfVectorizer.fit_transform(joined_tokens)

In order to use LDA we have to change the form of data. We are going to use the Dictionary method of gensim which will create the required form. In other words, we are goin to have a dictionary which contains (term, count of term in docs). Also we are going to remove the rare words because we assume that they are meaningless in comparison with the most frequent words. Finally, we check the form of the dictionary.

In [9]:
dict = corpora.Dictionary(tokens)
dict.filter_extremes(no_below=20, no_above=0.8)

In [10]:
corpus = [dict.doc2bow(doc) for doc in tokens]

In [11]:
for i in corpus[2]:
    print (dict[i[0]], i[1])

(u'1st', 2)
(u'able', 2)
(u'adjust', 1)
(u'adjusted', 3)
(u'algorithm', 3)
(u'along', 2)
(u'american', 1)
(u'analysis', 3)
(u'another', 1)
(u'appeared', 1)
(u'area', 6)
(u'assigned', 1)
(u'available', 3)
(u'back', 22)
(u'behavior', 1)
(u'brain', 2)
(u'change', 1)
(u'combined', 4)
(u'complex', 13)
(u'computer', 1)
(u'conclusion', 1)
(u'conference', 4)
(u'connected', 1)
(u'connection', 4)
(u'context', 1)
(u'could', 1)
(u'cybernetics', 1)
(u'demonstrated', 2)
(u'department', 1)
(u'depends', 1)
(u'descent', 1)
(u'described', 3)
(u'desired', 5)
(u'detail', 1)
(u'determine', 3)
(u'diagram', 1)
(u'difference', 2)
(u'difficult', 3)
(u'discrete', 1)
(u'discus', 1)
(u'distribution', 1)
(u'enough', 1)
(u'equation', 1)
(u'error', 15)
(u'estimate', 2)
(u'fed', 1)
(u'feedback', 1)
(u'finite', 1)
(u'fixed', 20)
(u'fixing', 1)
(u'form', 24)
(u'formed', 16)
(u'forming', 3)
(u'four', 1)
(u'framework', 2)
(u'gain', 1)
(u'general', 2)
(u'generated', 2)
(u'hidden', 15)
(u'high', 5)
(u'illustrated', 1)
(u'i

LDA do not give attention to the order of words so we are going to use the bag of words. Each document is a distribution over topics. Each topic is a distribution over words which belong to the vocabulary. 
So we are going to choose a distribution over topics, draw a topic and choose words from the topic.


We are going to use different number of topics and change the a. 

# Num_topics = 10
## alpha = auto

In [12]:
lda_10_auto = models.ldamodel.LdaModel(corpus=corpus, 
                                          id2word=dict, 
                                          num_topics=10,
                                          alpha='auto')
lda_10_auto.print_topics()

[(0,
  u'0.010*"learning" + 0.007*"data" + 0.007*"algorithm" + 0.006*"unit" + 0.005*"state" + 0.005*"error" + 0.004*"image" + 0.004*"training" + 0.004*"parameter" + 0.004*"point"'),
 (1,
  u'0.010*"learning" + 0.008*"data" + 0.005*"pattern" + 0.005*"neuron" + 0.005*"training" + 0.005*"unit" + 0.004*"algorithm" + 0.004*"state" + 0.004*"output" + 0.004*"vector"'),
 (2,
  u'0.008*"learning" + 0.006*"algorithm" + 0.005*"parameter" + 0.005*"training" + 0.005*"image" + 0.005*"weight" + 0.005*"method" + 0.005*"neuron" + 0.004*"unit" + 0.004*"error"'),
 (3,
  u'0.009*"learning" + 0.008*"weight" + 0.008*"output" + 0.008*"algorithm" + 0.008*"unit" + 0.006*"training" + 0.006*"data" + 0.005*"error" + 0.005*"vector" + 0.004*"neuron"'),
 (4,
  u'0.007*"learning" + 0.007*"unit" + 0.007*"data" + 0.006*"output" + 0.005*"weight" + 0.005*"training" + 0.005*"algorithm" + 0.004*"error" + 0.004*"pattern" + 0.004*"vector"'),
 (5,
  u'0.009*"data" + 0.007*"learning" + 0.007*"algorithm" + 0.006*"training" + 0.

By observing the topic 0 we can see that the 10 most important by contribution to the topic are:
weight  (0.007)
data (0.007)  
output  (0.007)
learning (0.007)
unit  (0.007)
neuron  (0.005) 
training   (0.005)
cell (0.005)  
feature  (0.005)
vector (0.004)

We are going to use perplexity and coherence as measures to evaluate our model

In [13]:
lda_10_auto.log_perplexity(corpus)

-7.7831372701548123

In [14]:
coherence_10_auto = CoherenceModel(model=lda_10_auto, texts=tokens, dictionary=dict, coherence='c_v')
coherence_10_auto.get_coherence()

0.29087387861489111

# Num_topics = 20
## alpha = auto

In [15]:
lda_20_auto = models.ldamodel.LdaModel(corpus=corpus, 
                                          id2word=dict, 
                                          num_topics=20,
                                          alpha='auto')
lda_20_auto.print_topics()

[(0,
  u'0.009*"unit" + 0.006*"training" + 0.005*"neuron" + 0.005*"output" + 0.005*"data" + 0.004*"representation" + 0.004*"pattern" + 0.004*"feature" + 0.004*"error" + 0.004*"algorithm"'),
 (1,
  u'0.009*"learning" + 0.008*"data" + 0.006*"output" + 0.005*"training" + 0.004*"weight" + 0.004*"feature" + 0.004*"algorithm" + 0.004*"state" + 0.004*"probability" + 0.004*"unit"'),
 (2,
  u'0.010*"learning" + 0.007*"unit" + 0.007*"training" + 0.006*"data" + 0.006*"output" + 0.005*"image" + 0.005*"hidden" + 0.004*"method" + 0.004*"error" + 0.004*"weight"'),
 (3,
  u'0.009*"learning" + 0.008*"data" + 0.006*"error" + 0.006*"state" + 0.005*"training" + 0.005*"unit" + 0.005*"method" + 0.005*"algorithm" + 0.004*"weight" + 0.004*"output"'),
 (4,
  u'0.010*"neuron" + 0.008*"unit" + 0.008*"learning" + 0.007*"output" + 0.006*"layer" + 0.005*"weight" + 0.005*"data" + 0.005*"error" + 0.004*"cell" + 0.004*"training"'),
 (5,
  u'0.008*"training" + 0.008*"learning" + 0.007*"weight" + 0.006*"error" + 0.005*"

In [16]:
lda_20_auto.log_perplexity(corpus)

-7.8539139338855959

In [17]:
coherence_20_auto = CoherenceModel(model=lda_20_auto, texts=tokens, dictionary=dict, coherence='c_v')
coherence_20_auto.get_coherence()

0.2987799304345764

# Num_topics = 20
## alpha = assymetric

We are going to keep the second case because we have observed that the number of perplexity is lower. In the following chapters we are going to change the parameter a.

In [18]:
lda_20_assymetric = models.ldamodel.LdaModel(corpus=corpus, 
                                          id2word=dict, 
                                          num_topics=20, 
                                          alpha='asymmetric')
lda_20_assymetric.print_topics()

[(0,
  u'0.009*"learning" + 0.008*"data" + 0.007*"error" + 0.006*"state" + 0.006*"algorithm" + 0.006*"unit" + 0.005*"method" + 0.005*"weight" + 0.004*"output" + 0.004*"training"'),
 (1,
  u'0.006*"learning" + 0.006*"unit" + 0.005*"data" + 0.005*"weight" + 0.004*"method" + 0.004*"error" + 0.004*"output" + 0.004*"feature" + 0.004*"training" + 0.004*"vector"'),
 (2,
  u'0.010*"learning" + 0.008*"data" + 0.008*"unit" + 0.005*"algorithm" + 0.005*"output" + 0.005*"error" + 0.004*"neuron" + 0.004*"image" + 0.004*"method" + 0.004*"layer"'),
 (3,
  u'0.013*"learning" + 0.008*"unit" + 0.008*"output" + 0.008*"training" + 0.007*"weight" + 0.006*"state" + 0.006*"algorithm" + 0.005*"data" + 0.005*"error" + 0.005*"pattern"'),
 (4,
  u'0.009*"learning" + 0.008*"data" + 0.008*"training" + 0.007*"output" + 0.007*"unit" + 0.006*"algorithm" + 0.005*"class" + 0.005*"hidden" + 0.004*"state" + 0.004*"vector"'),
 (5,
  u'0.011*"learning" + 0.008*"algorithm" + 0.008*"unit" + 0.007*"error" + 0.006*"weight" + 0.

In [19]:
lda_20_assymetric.log_perplexity(corpus)

-7.8586099595881613

In [20]:
coherence_20_assymetric = CoherenceModel(model=lda_20_assymetric, texts=tokens, dictionary=dict, coherence='c_v')
coherence_20_assymetric.get_coherence()

0.29291805017452177

The code below creates the visualization of the best model.

In [21]:
visualization = pyLDAvis.gensim.prepare(lda_20_auto, corpus, dict)
pyLDAvis.save_html(visualization, "visualization.html")

Below you can see the strongest topic representation for each one of the texts.

In [22]:
lda_results = lda_20_auto[corpus]

best = []
i = 0

for res in lda_results:
    bs = max(res, key=lambda item:item[1])
    best.append((bs, i))
    i+=1
best[:10]

[((4, 0.44820261), 0),
 ((8, 0.5848825), 1),
 ((5, 0.79795474), 2),
 ((9, 0.48610616), 3),
 ((4, 0.22923909), 4),
 ((4, 0.30523324), 5),
 ((4, 0.69495332), 6),
 ((4, 0.45695239), 7),
 ((4, 0.58555472), 8),
 ((19, 0.8064993), 9)]

Below we will find the paper that best represent the topic 

In [23]:
topics = [i for i in range(10)]

for item in topics:
    fmax = []
    for res in best:
        if res[0][0] == item:
            fmax.append(res)
    #print fmax
    if not fmax:
        print 'None'
    else:
        res =max(fmax, key=lambda x:x[0][1])
        print res

((0, 0.9984014), 1053)
((1, 0.96706814), 1588)
((2, 0.93216282), 526)
((3, 0.96038526), 1572)
((4, 0.98709542), 1672)
((5, 0.99657702), 1050)
((6, 0.99560124), 665)
((7, 0.99865419), 1247)
((8, 0.99478024), 696)
((9, 0.98445565), 1512)
