In [45]:
#import packages

import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import json
import ast
import nltk
#nltk.download()
import string
from nltk.corpus import stopwords 
from nltk.corpus import wordnet 
from nltk.stem import WordNetLemmatizer 
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from pprint import pprint
from sklearn.model_selection import GridSearchCV

In [46]:
#load dataset
df_all=pd.read_csv('wikihowAll.csv')
df_all.reset_index(drop=True, inplace=True)
#drop rows with null values (0.4% of the datset)
df_all=df_all.dropna(axis=0)

# Data Exploration and Statistics

In [47]:
# data exploration: find out token situation
def get_stats(column):
    
    #simple tokenization and processing
    def tok(text):
        #lower case
        text=text.lower()
        #remove punctuation
        text = text.translate(str.maketrans('','',string.punctuation))
        text = text.translate(str.maketrans('','','1234567890'))
        text = re.sub("[^a-zA-Z]+", " ", text)
        #tokenisation
        tokens = nltk.word_tokenize(text)
        return tokens
    tokens=column.apply(lambda x: tok(x))
    len_list=[]
    for x in tokens:
        length=len(x)
        len_list.append(length)
    avg=np.mean(len_list)
    max_l=np.max(len_list)
    min_l=np.min(len_list)
    return("Average:%d, Max:%d, Min%d"&(avg,max_l,min_l))
#get_stats(df_all['text'])
#get_stats(df_all['headline'])

# Preprocessing

In [48]:
#complete preprocessing used for topic modelling
def Text_Processing(text):
    #lower case
    text=text.lower()
    #remove punctuation
    text = text.translate(str.maketrans('','',string.punctuation))
    text = text.translate(str.maketrans('','','1234567890'))
    text = re.sub("[^a-zA-Z]+", " ", text)
    #tokenisation
    tokens = nltk.word_tokenize(text)
    #stop words
    stop_list = set(stopwords.words('english')) 
    filtered_tokens=[word for word in tokens if word not in stop_list]
    #lemmatisation
    wnl = WordNetLemmatizer()
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN    
    def lemmatize_with_pos(token):
        pos = nltk.pos_tag(token)
        lemm_words = [wnl.lemmatize(sw[0], get_wordnet_pos(sw[1])) for sw in pos]
        return lemm_words
    final_token=lemmatize_with_pos(filtered_tokens)
    return final_token



In [None]:
corpus=df_all['text']

no_features = 1000

#create docuement-word matrix

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df = 0.99,min_df = 0.01,tokenizer = Text_Processing,max_features=no_features)
tfidf = tfidf_vectorizer.fit_transform(corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
cv_vectorizer = CountVectorizer(max_df = 0.99,min_df = 0.01,tokenizer = Text_Processing, max_features=no_features)
cv = tf_vectorizer.fit_transform(corpus)
cv_feature_names = tf_vectorizer.get_feature_names()

# Topic Modelling

In [None]:
#run topic modelling


# Run NMF
#nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
#lda_model = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(cv)


In [None]:
#Find best para for topic modelling

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(cv)

# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(cv))

In [None]:
# Get Log Likelyhoods from Grid Search Output
n_topics = [10, 15, 20, 25, 30]
log_likelyhoods_5 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.5]
log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.7]
log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.9]

# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

In [24]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic%d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [28]:
no_top_words = 10
#display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(best_lda_model, cv_feature_names, no_top_words)

Topic0:
like say ask talk conversation know friend something make tell
Topic1:
cat toy treat breed breeder may play dog food get
Topic2:
hamster cage food hand treat dwarf may vet wheel bite
Topic3:
pig guinea cage vet hay vitamin c treat vegetable pellet
Topic4:
horse bridle halter bit rope hoof mouth tie strap head
Topic5:
crush person someone feeling people youre friend around dance see
Topic6:
litter box use cat scoop clean bathroom waste urine may
Topic7:
relationship friend feel feeling person time ex people thing may
Topic8:
goat milk hoof kid fence trim feed foot mother baby
Topic9:
kiss lip eye mouth want move touch close partner first
Topic10:
cage pet use water make clean bed sure food toy
Topic11:
coat color tail breed ear eye pattern body round fur
Topic12:
saddle leather soap stirrup oil pad inch bar dry back
Topic13:
dont youre guy he doesnt hell thing youll make think
Topic14:
girl shes like guy look may woman show shell flirt
Topic0:
time make relationship feel want fr