In [1]:

from multiprocessing import Pool
from tqdm import tqdm
from spacy.lang.en import English
import gensim.corpora as corpora
from gensim.models import LdaMulticore
import multiprocessing as mp
import json
import re
import numpy as np
import pandas as pd
from pprint import pprint
import pickle
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
from nltk.corpus import wordnet as wn
#nltk.download('stopwords')
#nltk.download('wordnet')
# spacy for lemmatization
import spacy
from spacy.lang.en import English
parser = English()
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim
#Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
# Plotting tools

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
def tokenize(doc):
    tokens = doc.split(" ")
    tokens = [word for word in tokens if len(word.strip()) > 0]
    return tokens

print("Loading tech corpus")
with open("../data/tech_review_word_corpus.pkl","rb") as f:
    tech_review_corpus = pickle.load(f)
    
reviews = pd.DataFrame(tech_review_corpus).review.tolist()
print("Tokenize the corpus")

with open(( "../data/stop_words.json"), "r") as f:
    stop_words = json.load(f)

vectorizer = CountVectorizer(min_df=3, max_df=.90, tokenizer=tokenize, stop_words=stop_words, ngram_range=(1, 2))



Loading tech corpus
Tokenize the corpus


In [3]:
X = vectorizer.fit_transform(reviews)
print("Total Vocab Size", len(vectorizer.vocabulary_))

Total Vocab Size 56590


In [5]:
result = []
if __name__ == "__main__":
    topics_range = range(15, 6, -1)
    alpha = list(np.arange(0.01, 1, 0.3))
    #alpha.append(None)
    beta = list(np.arange(0.01, 1, 0.3))
    #beta.append(None)

    parameters = []
    for k in topics_range:
        for a in alpha:
            for b in beta:
                parameters.append({
                    "k": k, "alpha": a, "beta": b
                })

    print("Total parameter values to train", len(parameters))

    for param in tqdm(parameters):
        lda = LatentDirichletAllocation(
                learning_method="batch",
                random_state=100,
                n_components=param["k"],
                doc_topic_prior=param["alpha"],
                topic_word_prior=param["beta"],
                n_jobs=-2
            )
        lda.fit(X)
        score = metric_coherence_gensim(measure='u_mass', 
                        top_n=50, 
                        topic_word_distrib=lda.components_, 
                        dtm=X, 
                        vocab=np.array([x for x in vectorizer.vocabulary_.keys()]),
                        return_mean=True)
        result.append({
        "k":param["k"],
        "alpha":param["alpha"],
        "beta":param["beta"],
        "score":score
    })
            
    print("Done!")


  0%|                                                                                          | 0/144 [00:00<?, ?it/s]

Total parameter values to train 144


100%|█████████████████████████████████████████████████████████████████████████████| 144/144 [7:44:31<00:00, 193.55s/it]

Done!





In [6]:
pd.DataFrame(result).to_csv("../data/word_lda_umass.csv")

In [12]:
pd.DataFrame(result).sort_values(by=['score','k'],ascending=False)

Unnamed: 0,k,alpha,beta,score
129,7,0.01,0.31,-2.101103
130,7,0.01,0.61,-2.104211
128,7,0.01,0.01,-2.155435
131,7,0.01,0.91,-2.210235
81,10,0.01,0.31,-2.235606
...,...,...,...,...
19,14,0.01,0.91,-5.391617
11,15,0.61,0.91,-5.395929
7,15,0.31,0.91,-5.644338
35,13,0.01,0.91,-5.715389


### NMF

In [7]:
topics_range = range(15, 6, -1)
alpha = list(np.arange(0.01, 1, 0.3))
#alpha.append(None)
#beta = list(np.arange(0.01, 1, 0.3))
#beta.append(None)

parameters = []
for k in topics_range:
    for a in alpha:
        parameters.append({
                    "k": k, "alpha": a
                })


In [8]:
nmf_result = []
for param in tqdm(parameters):
    nmf = NMF(n_components=param["k"]
                  ,init='nndsvd'
                 ,random_state=100,
                 alpha=param["alpha"])

    nmf.fit(X)
    score = metric_coherence_gensim(measure='u_mass', 
                        top_n=100, 
                        topic_word_distrib=nmf.components_, 
                        dtm=X, 
                        vocab=np.array([x for x in vectorizer.vocabulary_.keys()]),
                        return_mean=True)
    
    nmf_result.append({
    "k":param["k"],
    "alpha":param["alpha"],
    "score":score
    })
            
print("Done!")

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [22:12<00:00, 37.01s/it]

Done!





In [9]:
pd.DataFrame(nmf_result).to_csv("../data/word_nmf_umass.csv")

In [13]:
pd.DataFrame(nmf_result).sort_values(by=['score','k'],ascending=False)

Unnamed: 0,k,alpha,score
34,7,0.61,-2.463561
35,7,0.91,-2.467459
32,7,0.01,-2.475708
33,7,0.31,-2.487996
29,8,0.31,-2.511817
30,8,0.61,-2.525201
31,8,0.91,-2.533713
28,8,0.01,-2.53492
26,9,0.61,-2.571295
27,9,0.91,-2.577505


In [10]:
from gensim.corpora.dictionary import Dictionary
def vect2gensim(vectorizer, dtmatrix):
     # transform sparse matrix into gensim corpus and dictionary
    corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtmatrix, documents_columns=False)
    dictionary = Dictionary.from_corpus(corpus_vect_gensim,
        id2word=dict((id, word) for word, id in vectorizer.vocabulary_.items()))

    return (corpus_vect_gensim, dictionary)
corpus, id2word = vect2gensim(vectorizer, X)

In [15]:
def compute_coherence_values(param):
    lda_model = LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=param["k"], 
                                           random_state=100,
                                           chunksize=1000,
                                           workers = 6,
                                           passes=10,
                                           alpha=param["alpha"],
                                           eta=param["beta"],
                                           per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='u_mass')
    
    param["coherence"] = coherence_model_lda.get_coherence()

    return param
if __name__ == "__main__":
    processed_docs = list(tqdm(map(tokenize, reviews), total=len(reviews)))
    grid = {}
    grid['Validation_Set'] = {}
    # Topics range
    min_topics = 7
    max_topics = 15
    step_size = 1
    topics_range = range(max_topics, min_topics, -1)
    # Alpha parameter
    alpha = list(np.arange(0.01, 1, 0.3))
    alpha.append('symmetric')
    alpha.append('asymmetric')
    # Beta parameter
    beta = list(np.arange(0.01, 1, 0.3))
    beta.append('symmetric')

    parameters = []
    for k in topics_range:
        for a in alpha:
            for b in beta:
                parameters.append({
                        "k":k
                        ,"alpha":a
                        ,"beta":b
                        ,"workers":4
                    })

    print("Running modeling")
    print("Total Paramters", len(parameters))

    results = list(map(compute_coherence_values, tqdm(parameters)))
    
    pd.DataFrame(results).to_csv("../Data/word_gensim_umass.csv",index = False)

100%|█████████████████████████████████████████████████████████████████████████| 45251/45251 [00:00<00:00, 55582.03it/s]
  0%|                                                                                          | 0/240 [00:00<?, ?it/s]

Running modeling
Total Paramters 240


100%|████████████████████████████████████████████████████████████████████████████| 240/240 [44:50:01<00:00, 672.50s/it]


In [21]:
pd.DataFrame(results).sort_values(by=['coherence','k'],ascending=False)

Unnamed: 0,k,alpha,beta,workers,coherence
88,13,asymmetric,0.91,4,-4.008926
58,14,asymmetric,0.91,4,-4.134551
233,8,symmetric,0.91,4,-4.239821
224,8,0.61,symmetric,4,-4.273529
232,8,symmetric,0.61,4,-4.364422
...,...,...,...,...,...
43,14,0.61,0.91,4,-6.831389
138,11,0.91,0.91,4,-6.839150
38,14,0.31,0.91,4,-7.155207
68,13,0.31,0.91,4,-7.174638
