## Assignment 6

In [9]:
import time
import math
import re
import pandas as pd
from textblob import TextBlob
import nltk as nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim
from gensim import corpora, models

## TF-IDF on news articles

In [15]:
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [48]:
news_articles = 'webhose_cat.pkl'

In [49]:
news_df = pd.read_pickle(news_articles)

In [50]:
news_df.head(5)

Unnamed: 0,crawled,language,text,title,url
0,2018-01-30T18:28:45.012+02:00,english,Avery Dennison's (AVY) Q4 results are likely to gain on the back of solid momentum in its segmen...,IRobot downgraded to neutral from buy at Sidoti & Co.,http://omgili.com/ri/.wHSUbtEfZQRfU.5KUm1RkeXyWwa2NebtNAe6c3kk40xYUfmvvRJ0Ez90L4hcpBcEC67530Esyg-
1,2018-01-30T18:29:07.001+02:00,french,"1m95, c’est trop grand. Et sa stature, Bertrand Zibi Abeghe en paie le prix. Dans sa cellule de ...","""Bertrand Zibi Abeghe, encore prisonnier, et torturé"", un récit de Jeanne Farge",http://omgili.com/ri/.wHSUbtEfZTpzFtnXyQJIwJ.jePivduuQxl7XJliqPANqwhuqSUg_E.s9oUhmDhJMs2xv3xWR.0...
2,2018-01-30T18:29:40.000+02:00,english,Tuggers and Topper Industrial Carts Help Transport Materials Between Manufacturing Plants and Wa...,Tuggers and Topper Industrial Carts Help Transport Materials Between Manufacturing Plants and Wa...,http://omgili.com/ri/jHIAmI4hxg.zDiulpymXqU_n4je9AOsBi7.CcJmqqQi3Efu9fXRTSIPQkf8q0eN1ZnYNKk.TIPo...
3,2018-01-30T18:30:05.007+02:00,english,Currently adding the following games:\n100 (by everythingstaken) 100 Free Beetles 2000:1: A Spac...,,http://omgili.com/ri/.0rSU5LtMgyggHgoOVy9TMDWTEMJi4jQ_AfNI6u4qLL71YWM0VxMlyX.KDtAoU1g1Rx.SnmUgSf...
4,2018-01-30T18:30:05.013+02:00,english,Quote: : » Currently adding the following games:\n100 (by everythingstaken) 100 Free Beetles 200...,,http://omgili.com/ri/.0rSU5LtMgyggHgoOVy9TMDWTEMJi4jQ_AfNI6u4qLL71YWM0VxMlyX.KDtAoU1g1Rx.SnmUgSf...


In [51]:
news_df.shape

(100, 5)

In [52]:
# Filter non-English tweets
news_eng = news_df[news_df['language']=='english'].reset_index(drop=True)

In [53]:
# Remove special characters to avoid problems with analysis
news_eng['text_clean'] = news_eng['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [54]:
pd.set_option('display.max_colwidth', 100)
news_eng[['text', 'text_clean']].head(5)

Unnamed: 0,text,text_clean
0,Avery Dennison's (AVY) Q4 results are likely to gain on the back of solid momentum in its segmen...,"Avery Dennisons AVY Q4 results are likely to gain on the back of solid momentum in its segments,..."
1,Tuggers and Topper Industrial Carts Help Transport Materials Between Manufacturing Plants and Wa...,Tuggers and Topper Industrial Carts Help Transport Materials Between Manufacturing Plants and Wa...
2,Currently adding the following games:\n100 (by everythingstaken) 100 Free Beetles 2000:1: A Spac...,Currently adding the following games:100 by everythingstaken 100 Free Beetles 2000:1: A Space Fe...
3,Quote: : » Currently adding the following games:\n100 (by everythingstaken) 100 Free Beetles 200...,Quote: : Currently adding the following games:100 by everythingstaken 100 Free Beetles 2000:1: ...
4,Quote: : » Currently adding the following games:\n100 (by everythingstaken) 100 Free Beetles 200...,Quote: : Currently adding the following games:100 by everythingstaken 100 Free Beetles 2000:1: ...


In [55]:
bloblist = []
del bloblist[:]

for i in range(0,len(news_eng)):
    bloblist.append(TextBlob(news_eng['text_clean'].iloc[i]))
    
len(bloblist) 

95

In [56]:
for i, blob in enumerate(bloblist):
# Print top 5 values
    if i == 5:
        break
    print("Top words in news article {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:10]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in news article 1
	Word: ET, TF-IDF: 0.11986
	Word: On, TF-IDF: 0.0776
	Word: a.m, TF-IDF: 0.06613
	Word: Jan, TF-IDF: 0.06186
	Word: p.m, TF-IDF: 0.05373
	Word: Zacks.com, TF-IDF: 0.0508
	Word: 2018, TF-IDF: 0.03319
	Word: Oct, TF-IDF: 0.03306
	Word: 2017, TF-IDF: 0.0308
	Word: iRobot, TF-IDF: 0.02893
Top words in news article 2
	Word: Carts, TF-IDF: 0.09488
	Word: carts, TF-IDF: 0.08698
	Word: Topper, TF-IDF: 0.07248
	Word: To, TF-IDF: 0.06167
	Word: Industrial, TF-IDF: 0.04261
	Word: industrial, TF-IDF: 0.04261
	Word: operators, TF-IDF: 0.02899
	Word: handling, TF-IDF: 0.02696
	Word: Warehouses, TF-IDF: 0.02372
	Word: Transport, TF-IDF: 0.02175
Top words in news article 3
	Word: THE, TF-IDF: 0.14393
	Word: OF, TF-IDF: 0.0455
	Word: And, TF-IDF: 0.02571
	Word: Space, TF-IDF: 0.0195
	Word: Tower, TF-IDF: 0.0195
	Word: Super, TF-IDF: 0.01829
	Word: 2, TF-IDF: 0.01822
	Word: The, TF-IDF: 0.01765
	Word: Last, TF-IDF: 0.01727
	Word: adding, TF-IDF: 0.01398
Top words in news arti

### Applying LDA to news articles

In [90]:
news_list = news_eng['text_clean'].tolist()
#news_list[:2]

In [58]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [59]:
news_clean = [clean(doc).split() for doc in news_list]
len(news_clean)

95

In [60]:
print(*news_clean[:1], sep='\n\n')

['avery', 'dennisons', 'avy', 'q4', 'result', 'likely', 'gain', 'back', 'solid', 'momentum', 'segment', 'focus', 'productivity', 'acquisition', 'cost', 'control', 'effort', 'jan', '30', '2018', '340', 'am', 'et', 'zackscom', '5', 'strong', 'stock', 'company', 'still', 'led', 'founder', 'day', 'executive', 'transition', 'often', 'appease', 'shareholder', 'especially', 'tough', 'timeseven', 'mean', 'axing', 'company', 'founder', 'top', 'that', 'ceo', 'jump', 'company', 'company', 'making', 'increasingly', 'rare', 'see', 'founder', 'successfully', 'lead', 'startup', 'publicly', 'traded', 'giant', 'keep', 'top', 'gig', 'jan', '25', '2018', '1142', 'am', 'et', 'zackscom', 'rockwell', 'automation', 'rok', 'expects', 'benefit', 'expanded', 'product', 'offering', 'strong', 'customer', 'relationship', 'strategic', 'focus', 'connected', 'enterprise', 'jan', '24', '2018', '323', 'pm', 'et', 'zackscom', 'pentairs', 'pnr', 'q4', 'result', 'likely', 'gain', 'residential', 'commercial', 'sector', 'de

In [61]:
# Creating the term dictionary of the courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(news_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in news_clean]

CPU times: user 31.6 ms, sys: 1.38 ms, total: 33 ms
Wall time: 33.1 ms


#### 3 topic model

In [67]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
numtopics = 3

# Running and Trainign LDA model on the document term matrix.
%time ldamodel = Lda(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

CPU times: user 1min 49s, sys: 1.76 s, total: 1min 51s
Wall time: 59.5 s


In [68]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=3), sep='\n\n')

(0, '0.010*"inc" + 0.010*"company" + 0.009*"jan"')

(1, '0.009*"tax" + 0.007*"u" + 0.006*"city"')

(2, '0.014*"market" + 0.010*"plant" + 0.005*"case"')


In [69]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=5), sep='\n\n')

(0, '0.010*"inc" + 0.010*"company" + 0.009*"jan" + 0.009*"share" + 0.009*"caterpillar"')

(1, '0.009*"tax" + 0.007*"u" + 0.006*"city" + 0.005*"one" + 0.005*"company"')

(2, '0.014*"market" + 0.010*"plant" + 0.005*"case" + 0.005*"sphere" + 0.005*"amazon"')


In [70]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=7), sep='\n\n')

(0, '0.010*"inc" + 0.010*"company" + 0.009*"jan" + 0.009*"share" + 0.009*"caterpillar" + 0.005*"product" + 0.005*"stock"')

(1, '0.009*"tax" + 0.007*"u" + 0.006*"city" + 0.005*"one" + 0.005*"company" + 0.004*"median" + 0.004*"university"')

(2, '0.014*"market" + 0.010*"plant" + 0.005*"case" + 0.005*"sphere" + 0.005*"amazon" + 0.004*"seattle" + 0.004*"industry"')


In [71]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.010*"inc" + 0.010*"company" + 0.009*"jan" + 0.009*"share" + 0.009*"caterpillar" + 0.005*"product" + 0.005*"stock" + 0.004*"2018" + 0.004*"iot" + 0.004*"support"')

(1, '0.009*"tax" + 0.007*"u" + 0.006*"city" + 0.005*"one" + 0.005*"company" + 0.004*"median" + 0.004*"university" + 0.004*"estimate" + 0.004*"state" + 0.004*"rate"')

(2, '0.014*"market" + 0.010*"plant" + 0.005*"case" + 0.005*"sphere" + 0.005*"amazon" + 0.004*"seattle" + 0.004*"industry" + 0.004*"report" + 0.004*"also" + 0.004*"year"')


#### 5 topic model

In [72]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
numtopics = 5

# Running and Trainign LDA model on the document term matrix.
%time ldamodel = Lda(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

CPU times: user 1min 45s, sys: 1.17 s, total: 1min 46s
Wall time: 57.6 s


In [73]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=3), sep='\n\n')

(0, '0.016*"caterpillar" + 0.012*"company" + 0.012*"share"')

(1, '0.011*"sphere" + 0.010*"amazon" + 0.010*"seattle"')

(2, '0.023*"market" + 0.014*"plant" + 0.008*"case"')

(3, '0.011*"tax" + 0.009*"u" + 0.007*"city"')

(4, '0.011*"inc" + 0.010*"jan" + 0.006*"iot"')


In [74]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=5), sep='\n\n')

(0, '0.016*"caterpillar" + 0.012*"company" + 0.012*"share" + 0.008*"product" + 0.008*"industrial"')

(1, '0.011*"sphere" + 0.010*"amazon" + 0.010*"seattle" + 0.006*"2018" + 0.006*"monday"')

(2, '0.023*"market" + 0.014*"plant" + 0.008*"case" + 0.007*"industry" + 0.007*"report"')

(3, '0.011*"tax" + 0.009*"u" + 0.007*"city" + 0.006*"company" + 0.005*"median"')

(4, '0.011*"inc" + 0.010*"jan" + 0.006*"iot" + 0.005*"blade" + 0.005*"support"')


In [75]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=7), sep='\n\n')

(0, '0.016*"caterpillar" + 0.012*"company" + 0.012*"share" + 0.008*"product" + 0.008*"industrial" + 0.008*"stock" + 0.007*"cart"')

(1, '0.011*"sphere" + 0.010*"amazon" + 0.010*"seattle" + 0.006*"2018" + 0.006*"monday" + 0.006*"grand" + 0.006*"opening"')

(2, '0.023*"market" + 0.014*"plant" + 0.008*"case" + 0.007*"industry" + 0.007*"report" + 0.005*"also" + 0.005*"growth"')

(3, '0.011*"tax" + 0.009*"u" + 0.007*"city" + 0.006*"company" + 0.005*"median" + 0.005*"state" + 0.005*"estimate"')

(4, '0.011*"inc" + 0.010*"jan" + 0.006*"iot" + 0.005*"blade" + 0.005*"support" + 0.005*"skid" + 0.004*"2018"')


In [85]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.018*"company" + 0.016*"health" + 0.011*"new" + 0.010*"care" + 0.010*"employee" + 0.009*"cost" + 0.008*"healthcare" + 0.006*"may" + 0.006*"inc" + 0.006*"jpmorgan"')

(1, '0.016*"coating" + 0.016*"biopolymer" + 0.010*"year" + 0.010*"planer" + 0.010*"cold" + 0.005*"image" + 0.005*"million" + 0.004*"might" + 0.004*"like" + 0.004*"getty"')

(2, '0.014*"median" + 0.014*"city" + 0.013*"estimate" + 0.013*"university" + 0.011*"2017" + 0.010*"town" + 0.009*"state" + 0.008*"zillow" + 0.007*"major" + 0.007*"home"')

(3, '0.015*"company" + 0.014*"caterpillar" + 0.012*"share" + 0.008*"stock" + 0.008*"product" + 0.008*"iot" + 0.006*"rating" + 0.006*"blade" + 0.006*"pusher" + 0.005*"end"')

(4, '0.016*"jan" + 0.014*"sphere" + 0.014*"inc" + 0.014*"amazon" + 0.013*"seattle" + 0.009*"29" + 0.007*"monday" + 0.007*"grand" + 0.007*"opening" + 0.007*"2018"')

(5, '0.017*"tax" + 0.013*"plant" + 0.012*"u" + 0.008*"case" + 0.007*"china" + 0.006*"repatriation" + 0.006*"year" + 0.005*"one" + 0.005*"would" 

In [77]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.016*"caterpillar" + 0.012*"company" + 0.012*"share" + 0.008*"product" + 0.008*"industrial" + 0.008*"stock" + 0.007*"cart" + 0.006*"rating" + 0.006*"forklift" + 0.005*"quarter"')

(1, '0.011*"sphere" + 0.010*"amazon" + 0.010*"seattle" + 0.006*"2018" + 0.006*"monday" + 0.006*"grand" + 0.006*"opening" + 0.005*"29" + 0.005*"bezos" + 0.005*"house"')

(2, '0.023*"market" + 0.014*"plant" + 0.008*"case" + 0.007*"industry" + 0.007*"report" + 0.005*"also" + 0.005*"growth" + 0.005*"analysis" + 0.005*"wardian" + 0.005*"truck"')

(3, '0.011*"tax" + 0.009*"u" + 0.007*"city" + 0.006*"company" + 0.005*"median" + 0.005*"state" + 0.005*"estimate" + 0.005*"china" + 0.005*"university" + 0.005*"2017"')

(4, '0.011*"inc" + 0.010*"jan" + 0.006*"iot" + 0.005*"blade" + 0.005*"support" + 0.005*"skid" + 0.004*"2018" + 0.004*"pusher" + 0.004*"corp" + 0.004*"oct"')


#### 10 topic model

In [78]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
numtopics = 10

# Running and Trainign LDA model on the document term matrix.
%time ldamodel = Lda(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

CPU times: user 1min 47s, sys: 1.14 s, total: 1min 48s
Wall time: 1min


In [79]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=3), sep='\n\n')

(0, '0.018*"company" + 0.016*"health" + 0.011*"new"')

(1, '0.016*"coating" + 0.016*"biopolymer" + 0.010*"year"')

(2, '0.014*"median" + 0.014*"city" + 0.013*"estimate"')

(3, '0.015*"company" + 0.014*"caterpillar" + 0.012*"share"')

(4, '0.016*"jan" + 0.014*"sphere" + 0.014*"inc"')

(5, '0.017*"tax" + 0.013*"plant" + 0.012*"u"')

(6, '0.014*"house" + 0.005*"caterpillar" + 0.005*"like"')

(7, '0.041*"market" + 0.012*"industry" + 0.012*"report"')

(8, '0.011*"2018" + 0.009*"et" + 0.008*"jan"')

(9, '0.011*"operation" + 0.010*"share" + 0.009*"cart"')


In [80]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=5), sep='\n\n')

(0, '0.018*"company" + 0.016*"health" + 0.011*"new" + 0.010*"care" + 0.010*"employee"')

(1, '0.016*"coating" + 0.016*"biopolymer" + 0.010*"year" + 0.010*"planer" + 0.010*"cold"')

(2, '0.014*"median" + 0.014*"city" + 0.013*"estimate" + 0.013*"university" + 0.011*"2017"')

(3, '0.015*"company" + 0.014*"caterpillar" + 0.012*"share" + 0.008*"stock" + 0.008*"product"')

(4, '0.016*"jan" + 0.014*"sphere" + 0.014*"inc" + 0.014*"amazon" + 0.013*"seattle"')

(5, '0.017*"tax" + 0.013*"plant" + 0.012*"u" + 0.008*"case" + 0.007*"china"')

(6, '0.014*"house" + 0.005*"caterpillar" + 0.005*"like" + 0.005*"accounting" + 0.005*"one"')

(7, '0.041*"market" + 0.012*"industry" + 0.012*"report" + 0.009*"analysis" + 0.008*"growth"')

(8, '0.011*"2018" + 0.009*"et" + 0.008*"jan" + 0.006*"2" + 0.006*"2017"')

(9, '0.011*"operation" + 0.010*"share" + 0.009*"cart" + 0.009*"jan" + 0.008*"forklift"')


In [81]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=7), sep='\n\n')

(0, '0.018*"company" + 0.016*"health" + 0.011*"new" + 0.010*"care" + 0.010*"employee" + 0.009*"cost" + 0.008*"healthcare"')

(1, '0.016*"coating" + 0.016*"biopolymer" + 0.010*"year" + 0.010*"planer" + 0.010*"cold" + 0.005*"image" + 0.005*"million"')

(2, '0.014*"median" + 0.014*"city" + 0.013*"estimate" + 0.013*"university" + 0.011*"2017" + 0.010*"town" + 0.009*"state"')

(3, '0.015*"company" + 0.014*"caterpillar" + 0.012*"share" + 0.008*"stock" + 0.008*"product" + 0.008*"iot" + 0.006*"rating"')

(4, '0.016*"jan" + 0.014*"sphere" + 0.014*"inc" + 0.014*"amazon" + 0.013*"seattle" + 0.009*"29" + 0.007*"monday"')

(5, '0.017*"tax" + 0.013*"plant" + 0.012*"u" + 0.008*"case" + 0.007*"china" + 0.006*"repatriation" + 0.006*"year"')

(6, '0.014*"house" + 0.005*"caterpillar" + 0.005*"like" + 0.005*"accounting" + 0.005*"one" + 0.004*"phone" + 0.004*"architect"')

(7, '0.041*"market" + 0.012*"industry" + 0.012*"report" + 0.009*"analysis" + 0.008*"growth" + 0.007*"state" + 0.006*"product"')

(8, '0

In [83]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.018*"company" + 0.016*"health" + 0.011*"new" + 0.010*"care" + 0.010*"employee" + 0.009*"cost" + 0.008*"healthcare" + 0.006*"may" + 0.006*"inc" + 0.006*"jpmorgan"')

(1, '0.016*"coating" + 0.016*"biopolymer" + 0.010*"year" + 0.010*"planer" + 0.010*"cold" + 0.005*"image" + 0.005*"million" + 0.004*"might" + 0.004*"like" + 0.004*"getty"')

(2, '0.014*"median" + 0.014*"city" + 0.013*"estimate" + 0.013*"university" + 0.011*"2017" + 0.010*"town" + 0.009*"state" + 0.008*"zillow" + 0.007*"major" + 0.007*"home"')

(3, '0.015*"company" + 0.014*"caterpillar" + 0.012*"share" + 0.008*"stock" + 0.008*"product" + 0.008*"iot" + 0.006*"rating" + 0.006*"blade" + 0.006*"pusher" + 0.005*"end"')

(4, '0.016*"jan" + 0.014*"sphere" + 0.014*"inc" + 0.014*"amazon" + 0.013*"seattle" + 0.009*"29" + 0.007*"monday" + 0.007*"grand" + 0.007*"opening" + 0.007*"2018"')

(5, '0.017*"tax" + 0.013*"plant" + 0.012*"u" + 0.008*"case" + 0.007*"china" + 0.006*"repatriation" + 0.006*"year" + 0.005*"one" + 0.005*"would" 

#### 15 topic model

In [86]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
numtopics = 15

# Running and Trainign LDA model on the document term matrix.
%time ldamodel = Lda(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

CPU times: user 1min 51s, sys: 1.23 s, total: 1min 52s
Wall time: 1min 3s


In [87]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.009*"year" + 0.007*"project" + 0.006*"caterpillar" + 0.005*"phone" + 0.005*"product" + 0.005*"part" + 0.004*"facility" + 0.004*"manufacturing" + 0.004*"process" + 0.004*"6"')

(1, '0.011*"city" + 0.010*"sphere" + 0.010*"median" + 0.010*"amazon" + 0.010*"seattle" + 0.009*"estimate" + 0.009*"university" + 0.008*"2017" + 0.007*"town" + 0.007*"state"')

(2, '0.055*"market" + 0.018*"industry" + 0.016*"report" + 0.012*"growth" + 0.012*"analysis" + 0.009*"product" + 0.009*"vehicle" + 0.009*"key" + 0.008*"construction" + 0.008*"turbocharger"')

(3, '0.010*"2" + 0.006*"space" + 0.005*"tower" + 0.005*"super" + 0.005*"last" + 0.004*"dungeon" + 0.004*"game" + 0.003*"call" + 0.003*"museum" + 0.003*"adventure"')

(4, '0.038*"market" + 0.027*"biopolymer" + 0.027*"coating" + 0.016*"machinery" + 0.015*"road" + 0.014*"report" + 0.008*"study" + 0.008*"segment" + 0.007*"type" + 0.007*"region"')

(5, '0.010*"canna" + 0.009*"post" + 0.008*"accounting" + 0.007*"caterpillar" + 0.006*"van" + 0.006*"stan

#### 20 topic models

In [88]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
numtopics = 20

# Running and Trainign LDA model on the document term matrix.
%time ldamodel = Lda(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

CPU times: user 1min 49s, sys: 927 ms, total: 1min 50s
Wall time: 1min 2s


In [89]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.019*"sphere" + 0.018*"amazon" + 0.017*"seattle" + 0.010*"monday" + 0.010*"grand" + 0.010*"opening" + 0.009*"29" + 0.009*"2018" + 0.009*"bezos" + 0.009*"jan"')

(1, '0.004*"beach" + 0.004*"outdoor" + 0.004*"compatriot" + 0.004*"body" + 0.004*"legend" + 0.004*"buyer" + 0.004*"specialty" + 0.004*"science" + 0.002*"terrain" + 0.002*"monolith"')

(2, '0.012*"plate" + 0.009*"cutting" + 0.007*"torch" + 0.007*"bale" + 0.006*"crane" + 0.006*"quick" + 0.006*"cut" + 0.006*"steel" + 0.006*"grapple" + 0.006*"bucket"')

(3, '0.031*"forklift" + 0.017*"autonomous" + 0.015*"truck" + 0.013*"haul" + 0.011*"rio" + 0.010*"tire" + 0.007*"komatsu" + 0.007*"fleet" + 0.006*"tinto" + 0.006*"tintos"')

(4, '0.035*"plant" + 0.021*"case" + 0.012*"wardian" + 0.011*"house" + 0.011*"terrarium" + 0.010*"care" + 0.009*"home" + 0.009*"one" + 0.008*"water" + 0.008*"soil"')

(5, '0.012*"mn" + 0.012*"attachment" + 0.012*"steer" + 0.012*"skid" + 0.010*"tax" + 0.008*"state" + 0.008*"tree" + 0.008*"bucket" + 0.008*"lan

#### As can be seen that when we choose N = 20, there starts to have similarities between topic 4 and 7.                                               Hence we might want to choose the N = 15, in order to keep a balance between the comprehensiveness and duplicates.