# Module 6 - Topic Modelling - Exercises

1. Run the module notebook yourself without referring to the video
2. Make a different selection from the science stories used in the original notebook, and change the variables appropriately
3. Perform an LDA from scratch using your subject selection, and experiment with different values of K

In [1]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.utils import tokenize
from gensim.utils import simple_preprocess
from gensim.corpora.textcorpus import remove_stopwords
from gensim.summarization import keywords
from gensim.models.ldamodel import LdaModel
import pyLDAvis
import pyLDAvis.gensim

from pattern.en import lemma

import pandas as pd
import json

In [2]:
# load the complete dataset
with open('data/News_Category_Dataset_v2.json', 'r') as f:
    news_list = f.readlines()

# convert each line (string) to json (dict)
news_json = list(map(json.loads,news_list))

print("Number of stories: ",len(news_json))

# view the first 10 elements in the list
news_json[:20]

  and should_run_async(code)


Number of stories:  200853


[{'category': 'CRIME',
  'headline': 'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV',
  'authors': 'Melissa Jeltsen',
  'link': 'https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89',
  'short_description': 'She left her husband. He killed their children. Just another day in America.',
  'date': '2018-05-26'},
 {'category': 'ENTERTAINMENT',
  'headline': "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song",
  'authors': 'Andy McDonald',
  'link': 'https://www.huffingtonpost.com/entry/will-smith-joins-diplo-and-nicky-jam-for-the-official-2018-world-cup-song_us_5b09726fe4b0fdb2aa541201',
  'short_description': 'Of course it has a song.',
  'date': '2018-05-26'},
 {'category': 'ENTERTAINMENT',
  'headline': 'Hugh Grant Marries For The First Time At Age 57',
  'authors': 'Ron Dicker',
  'link': 'https://www.huffingtonpost.com/entry/hugh-grant-marries_us_5b09212ce4b0568a880b9a8c',
  'short_description

In [3]:
# filter the list for stories that are in the category SCIENCE
science_json = [story for story in news_json if story['category']=='SCIENCE']

# for each, create the 'story' by adding together the headline and the short_description
science_stories = [story['headline']+' - '+story['short_description'] for story in science_json]

print("Number of science stories: ",len(science_stories))

Number of science stories:  2178


  and should_run_async(code)


In [4]:
science_stories

  and should_run_async(code)


['Scientists Turn To DNA Technology To Search For Loch Ness Monster - The researchers plan to scour the Loch Ness next month for evidence of its supposed inhabitant.',
 'Unusual Asteroid Could Be An Interstellar Guest To Our Solar System - The supposed "interstellar immigrant" is located near Jupiter but has an atypical orbit.',
 "China Marks Another Milestone In Quest To Become Space Superpower - It's the first time a rocket designed by a Chinese private company has successfully entered orbit.",
 'Terrifying Clip Shows Why You Should Never Run Under A Tree During Thunderstorms - YIKES!',
 "U.S. Climate Scientists Flee For France To ‚ÄòMake Our Planet Great Again‚Äô - Some of America's top researchers will move to France to continue their research.",
 'Stephen Hawking Finished Mind-Bending Parallel Universe Paper Days Before His Death - The new treatise on the existence of parallel universes was published on Friday.',
 "Mysterious Yellowstone Geyser Eruptions Stump Scientists - Steambo

In [5]:
# Stories to DF
stories_df = pd.DataFrame(science_stories,columns=['story'])
stories_df

  and should_run_async(code)


Unnamed: 0,story
0,Scientists Turn To DNA Technology To Search Fo...
1,Unusual Asteroid Could Be An Interstellar Gues...
2,China Marks Another Milestone In Quest To Beco...
3,Terrifying Clip Shows Why You Should Never Run...
4,U.S. Climate Scientists Flee For France To ‚ÄòMa...
...,...
2173,Treating a World Without Antibiotics? - Becaus...
2174,Russian Cargo Ship Docks At International Spac...
2175,"Robots Play Catch, Starring Agile Justin And R..."
2176,Thomas Edison Voted Most Iconic Inventor In U....


In [25]:
# do this for whole dataframe
stories_df['terms'] = [remove_stopwords(simple_preprocess(story,min_len=3)) for story in stories_df['story']]
stories_df

  and should_run_async(code)


Unnamed: 0,story,terms
0,Scientists Turn To DNA Technology To Search Fo...,"[scientists, turn, dna, technology, search, lo..."
1,Unusual Asteroid Could Be An Interstellar Gues...,"[unusual, asteroid, interstellar, guest, solar..."
2,China Marks Another Milestone In Quest To Beco...,"[china, marks, milestone, quest, space, superp..."
3,Terrifying Clip Shows Why You Should Never Run...,"[terrifying, clip, shows, run, tree, thunderst..."
4,U.S. Climate Scientists Flee For France To ‚ÄòMa...,"[climate, scientists, flee, france, planet, gr..."
...,...,...
2173,Treating a World Without Antibiotics? - Becaus...,"[treating, world, antibiotics, overuse, antibi..."
2174,Russian Cargo Ship Docks At International Spac...,"[russian, cargo, ship, docks, international, s..."
2175,"Robots Play Catch, Starring Agile Justin And R...","[robots, play, catch, starring, agile, justin,..."
2176,Thomas Edison Voted Most Iconic Inventor In U....,"[thomas, edison, voted, iconic, inventor, hist..."


In [27]:
stories_df['lemma'] = [[lemma(wd) for wd in terms] for terms in stories_df['terms']]
stories_df

  and should_run_async(code)


Unnamed: 0,story,terms,lemma
0,Scientists Turn To DNA Technology To Search Fo...,"[scientists, turn, dna, technology, search, lo...","[scientist, turn, dna, technology, search, loc..."
1,Unusual Asteroid Could Be An Interstellar Gues...,"[unusual, asteroid, interstellar, guest, solar...","[unusual, asteroid, interstellar, guest, solar..."
2,China Marks Another Milestone In Quest To Beco...,"[china, marks, milestone, quest, space, superp...","[china, mark, milestone, quest, space, superpo..."
3,Terrifying Clip Shows Why You Should Never Run...,"[terrifying, clip, shows, run, tree, thunderst...","[terrify, clip, show, run, tree, thunderstorm,..."
4,U.S. Climate Scientists Flee For France To ‚ÄòMa...,"[climate, scientists, flee, france, planet, gr...","[climate, scientist, flee, france, planet, gre..."
...,...,...,...
2173,Treating a World Without Antibiotics? - Becaus...,"[treating, world, antibiotics, overuse, antibi...","[treat, world, antibiotic, overuse, antibiotic..."
2174,Russian Cargo Ship Docks At International Spac...,"[russian, cargo, ship, docks, international, s...","[russian, cargo, ship, dock, international, sp..."
2175,"Robots Play Catch, Starring Agile Justin And R...","[robots, play, catch, starring, agile, justin,...","[robot, play, catch, star, agile, justin, roll..."
2176,Thomas Edison Voted Most Iconic Inventor In U....,"[thomas, edison, voted, iconic, inventor, hist...","[thoma, edison, vote, iconic, inventor, histor..."


In [28]:
vocab = Dictionary(stories_df['lemma'])
print(vocab.token2id)



  and should_run_async(code)


In [29]:
# convert corpus to BoW format
corpus = [vocab.doc2bow(terms) for terms in stories_df['lemma']]

  and should_run_async(code)


In [39]:
# create an lda model from our corpus and vocab - we need to specify the number of topics
lda_model = LdaModel(corpus=corpus, id2word=vocab, num_topics=4)

  and should_run_async(code)


In [40]:
# view the topics in the model
for topic in lda_model.show_topics(num_topics=4,num_words=15):
    print("Topic "+str(topic[0])+"\n"+topic[1]+"\n")

Topic 0
0.014*"space" + 0.011*"video" + 0.009*"nasa" + 0.008*"photo" + 0.007*"earth" + 0.007*"new" + 0.006*"planet" + 0.006*"science" + 0.006*"scientist" + 0.005*"study" + 0.005*"like" + 0.004*"way" + 0.004*"asteroid" + 0.004*"light" + 0.004*"moon"

Topic 1
0.011*"study" + 0.010*"space" + 0.010*"planet" + 0.009*"scientist" + 0.008*"show" + 0.008*"new" + 0.007*"science" + 0.007*"nasa" + 0.006*"publish" + 0.006*"earth" + 0.006*"com" + 0.006*"follow" + 0.005*"star" + 0.005*"edt" + 0.005*"moon"

Topic 2
0.016*"video" + 0.011*"new" + 0.008*"study" + 0.007*"scientist" + 0.007*"show" + 0.006*"time" + 0.006*"robot" + 0.005*"research" + 0.005*"venu" + 0.005*"science" + 0.005*"higg" + 0.005*"human" + 0.005*"space" + 0.005*"aurora" + 0.005*"world"

Topic 3
0.012*"space" + 0.009*"say" + 0.008*"science" + 0.006*"day" + 0.006*"scientist" + 0.005*"year" + 0.005*"photo" + 0.005*"shuttle" + 0.004*"nasa" + 0.004*"life" + 0.004*"mar" + 0.004*"little" + 0.003*"fool" + 0.003*"astronaut" + 0.003*"world"



  and should_run_async(code)


In [41]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, vocab)
vis

  and should_run_async(code)
