## Topic Modelling using LDA

In [111]:
import re
import pandas as pd
# Show full cell contents instead of truncating long text columns.
# None means "no limit"; the old -1 sentinel is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

import numpy as np
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  pd.set_option('display.max_colwidth', -1)


In [22]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
# Fetch the NLTK resources this notebook relies on (no-op when already up to date).
nltk.download("stopwords")
# WordNetLemmatizer, used in the lemmatization step, needs the WordNet corpus;
# without it a fresh environment fails with a LookupError.
nltk.download("wordnet")

# English stopword list, extended with tokens that are boilerplate in newsgroup
# headers ('from', 'subject', 're', 'edu') plus the very common verb 'use'.
stop_words = stopwords.words("english")
stop_words += ['from', 'subject', 're', 'edu', 'use']

### Import the 20-Newsgroups dataset

In [11]:
# Load the 20-Newsgroups posts (one row per post) straight from the hosted JSON file.
dataset_url = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
df = pd.read_json(dataset_url)

# List the distinct newsgroup labels, then preview the first rows.
newsgroup_labels = df['target_names'].unique()
print(newsgroup_labels)
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


### Data Cleaning
- Remove noise (e-mail addresses, redundant whitespace, single quotes)
- Tokenize and remove punctuation using `gensim.utils.simple_preprocess`
- Remove stopwords and words with length <= 2
- Lemmatize

In [15]:
# One raw post (headers + body) per list element.
data = df['content'].values.tolist()

# The patterns below are raw strings: '\S' and '\s' are not valid Python string
# escapes and trigger an invalid-escape-sequence warning in a plain literal.

# Remove e-mail addresses (the non-whitespace run around an '@', plus one trailing space)
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse new lines and any other whitespace run into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("'", "", sent) for sent in data]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [18]:
# Use gensim's simple_preprocess to tokenize documents and strip punctuation.
def tokenize(list_sentence):
    """Tokenize each document in ``list_sentence``.

    Every document is passed through ``simple_preprocess`` with ``deacc=True``.

    Args:
        list_sentence: iterable of document strings.

    Returns:
        A list with one token list per input document, in input order.
    """
    return [simple_preprocess(text, deacc=True) for text in list_sentence]

# Tokenize every cleaned document, then preview the first ten tokens of the first one.
tokenized_data = tokenize(data)
tokenized_data[0][:10]

['from',
 'wheres',
 'my',
 'thing',
 'subject',
 'what',
 'car',
 'is',
 'this',
 'nntp']

In [62]:
def remove_stopwords(tokenized_data, stop_word_list=None):
    """Drop stopwords and very short tokens from every document.

    Args:
        tokenized_data: iterable of documents, each an iterable of token strings.
        stop_word_list: optional iterable of words to remove. When omitted, the
            module-level ``stop_words`` list is used.

    Returns:
        A new list with one filtered token list per document. A token is kept
        only if it is not a stopword and is longer than two characters.
    """
    if stop_word_list is None:
        stop_word_list = stop_words
    # Build the lookup once: set membership is O(1), while scanning the list
    # for every token of every document is not.
    blocked = frozenset(stop_word_list)
    return [
        [word for word in doc if word not in blocked and len(word) > 2]
        for doc in tokenized_data
    ]

def lemmatization(tokens):
    """Lemmatize every token of every document.

    Each word is lemmatized as a verb ('v'), then the result as a noun ('n'),
    then as an adjective ('a'), in that order.

    Args:
        tokens: list of documents, each a list of token strings. The input is
            left untouched.

    Returns:
        A new list of documents holding the lemmatized tokens.
    """
    # One lemmatizer is enough; creating one per document per POS pass is wasted work.
    lemmatizer = WordNetLemmatizer()

    def lemmatize_word(word):
        # Chain the POS passes so each pass sees the output of the previous one.
        for pos in ('v', 'n', 'a'):
            word = lemmatizer.lemmatize(word, pos=pos)
        return word

    return [[lemmatize_word(word) for word in doc] for doc in tokens]

# Filter out stopwords and short tokens first, then lemmatize what remains;
# print the first cleaned document as a sanity check.
clean_data = lemmatization(remove_stopwords(tokenized_data))
print(clean_data[0])

['wheres', 'thing', 'car', 'nntp', 'post', 'host', 'rac', 'wam', 'umd', 'organization', 'university', 'maryland', 'college', 'park', 'line', 'wonder', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'whatever', 'info', 'funky', 'look', 'car', 'please', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']


### Create dictionary and Bag of Words
- Dictionary: a mapping between integer ids and words
- Bag of Words: for each document, a list of (word id, term frequency) pairs

In [63]:
# Vocabulary: integer id <-> word, built from the cleaned documents.
id2word = corpora.Dictionary(clean_data)
# Prune the vocabulary with the given document-frequency thresholds.
id2word.filter_extremes(no_below=15, no_above=0.1)

# Bag of Words: one list of (word id, count) pairs per document.
corpus = list(map(id2word.doc2bow, clean_data))
corpus[0][:10]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 5),
 (5, 1),
 (6, 2),
 (7, 1),
 (8, 1),
 (9, 1)]

In [65]:
# Human-readable version of the first document's bag of words:
# swap each word id for the word itself and keep its term frequency.
doc1 = corpus[0]
term_freq = [(id2word[word_id], count) for word_id, count in doc1]
term_freq[:10]

[('addition', 1),
 ('body', 1),
 ('bring', 1),
 ('bumper', 1),
 ('car', 5),
 ('college', 1),
 ('door', 2),
 ('early', 1),
 ('engine', 1),
 ('enlighten', 1)]

### Building the topic model

In [66]:
# Train a 20-topic LDA model on the bag-of-words corpus.
# random_state is fixed so repeated runs use the same seed; per_word_topics=True
# is what makes lda_model[...] return more than the plain topic distribution.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [67]:
# Print every topic (-1 = all) with its highest-weighted keywords.
pprint(lda_model.print_topics(num_topics=-1))

[(0,
  '0.060*"drive" + 0.043*"card" + 0.026*"driver" + 0.021*"cpu" + 0.021*"disk" '
  '+ 0.020*"video" + 0.020*"bite" + 0.020*"memory" + 0.019*"machine" + '
  '0.018*"patient"'),
 (1,
  '0.192*"max" + 0.084*"scsi" + 0.064*"trade" + 0.045*"gary" + 0.041*"pen" + '
  '0.030*"stephen" + 0.027*"captain" + 0.027*"compare" + 0.026*"leaf" + '
  '0.025*"louis"'),
 (2,
  '0.031*"jim" + 0.022*"president" + 0.021*"bill" + 0.021*"national" + '
  '0.021*"press" + 0.021*"health" + 0.019*"group" + 0.018*"publish" + '
  '0.018*"meet" + 0.017*"washington"'),
 (3,
  '0.058*"image" + 0.053*"graphic" + 0.046*"color" + 0.044*"version" + '
  '0.036*"newsreader" + 0.036*"display" + 0.033*"tin" + 0.028*"screen" + '
  '0.025*"sun" + 0.023*"object"'),
 (4,
  '0.033*"program" + 0.017*"information" + 0.015*"send" + 0.015*"entry" + '
  '0.015*"source" + 0.014*"available" + 0.014*"software" + 0.014*"list" + '
  '0.013*"copy" + 0.013*"email"'),
 (5,
  '0.034*"mac" + 0.033*"power" + 0.020*"bus" + 0.017*"cool" + 0.014

In [68]:
# Score the model with the 'c_v' coherence measure, computed from the cleaned
# token lists and the same dictionary the model was trained with.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=clean_data,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.489997300009083


In [69]:
# Render the interactive pyLDAvis topic explorer inline in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis

In [109]:
# Dominant topic of every document, side by side with its true newsgroup label.

# The model was built with per_word_topics=True, so each lda_model[bow] result is a
# tuple; its first element is the list of (topic id, probability) pairs.
topic_distribution = [doc[0] for doc in lda_model[corpus]]

# Collect plain records and build the frame once: DataFrame.append in a loop is
# quadratic and no longer exists in pandas 2.x.
topic_keyword_cache = {}  # topic id -> joined keywords, computed once per topic
records = []
for topics in topic_distribution:
    if not topics:
        # Keep one row per document so rows stay aligned with df.
        records.append((None, None, None))
        continue
    # Most probable topic; on ties max() keeps the first one, as a stable
    # descending sort would.
    topic_num, topic_pct = max(topics, key=lambda pair: pair[1])
    topic_num = int(topic_num)
    if topic_num not in topic_keyword_cache:
        topic_keyword_cache[topic_num] = '.'.join(
            [word for word, p in lda_model.show_topic(topic_num)]
        )
    records.append((topic_num, round(float(topic_pct), 3), topic_keyword_cache[topic_num]))

document_topic = pd.DataFrame(records, columns=['topic_num', 'topic_pct', 'topic_keywords'])

# Attach the labels by position (row i of the corpus is row i of df), so a
# non-default df index cannot misalign them; a length mismatch raises instead.
contents = df['target_names']
document_topic['target_names'] = contents.to_numpy()

Unnamed: 0,topic_num,topic_pct,topic_keywords,target_names
0,16.0,0.483,car.price.sale.buy.sell.bike.pin.drive.pay.offer,rec.autos
1,7.0,0.469,bad.anything.little.probably.enough.course.thats.didnt.keep.actually,comp.sys.mac.hardware
2,7.0,0.207,bad.anything.little.probably.enough.course.thats.didnt.keep.actually,comp.sys.mac.hardware
3,0.0,0.393,drive.card.driver.cpu.disk.video.bite.memory.machine.patient,comp.graphics
4,4.0,0.436,program.information.send.entry.source.available.software.list.copy.email,sci.space


### Compare the generated topics (topic_keywords) to the target topics

In [112]:
# Show the first ten documents: dominant topic and keywords next to the true newsgroup.
document_topic.head(10)

Unnamed: 0,topic_num,topic_pct,topic_keywords,target_names
0,16.0,0.483,car.price.sale.buy.sell.bike.pin.drive.pay.offer,rec.autos
1,7.0,0.469,bad.anything.little.probably.enough.course.thats.didnt.keep.actually,comp.sys.mac.hardware
2,7.0,0.207,bad.anything.little.probably.enough.course.thats.didnt.keep.actually,comp.sys.mac.hardware
3,0.0,0.393,drive.card.driver.cpu.disk.video.bite.memory.machine.patient,comp.graphics
4,4.0,0.436,program.information.send.entry.source.available.software.list.copy.email,sci.space
5,11.0,0.674,gun.issue.control.discussion.weapon.person.self.purpose.death.support,talk.politics.guns
6,8.0,0.479,apr.internet.message.gmt.slow.test.bank.brain.group.week,sci.med
7,0.0,0.296,drive.card.driver.cpu.disk.video.bite.memory.machine.patient,comp.sys.ibm.pc.hardware
8,8.0,0.47,apr.internet.message.gmt.slow.test.bank.brain.group.week,comp.os.ms-windows.misc
9,5.0,0.317,mac.power.bus.cool.port.service.supply.mount.build.digital,comp.sys.mac.hardware
