In [1]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
import pickle
from multiprocessing import Pool
from tqdm import tqdm
import multiprocessing as mp
import sys
import os
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

In [13]:


# Disabled project-root lookup (presumably because `__file__` is not defined
# inside a notebook kernel -- TODO confirm); the cells below use relative paths.
#project_path = os.path.join(os.path.dirname(__file__), "..")
#print(project_path)
# CPU count as reported by multiprocessing; in this section it is only printed.
num_cpus = mp.cpu_count()


def tokenize(doc):
    """Split ``doc`` on single spaces and drop blank pieces.

    A piece is kept when it still has characters after stripping whitespace;
    kept pieces are returned unmodified (they are not stripped themselves).
    """
    return [piece for piece in doc.split(" ") if piece.strip()]

# One dict per trained LDA model: {"k", "alpha", "beta", "score"}.
result = []
if __name__ == "__main__":

    print("Loading tech corpus")
    # SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
    with open( "../data/tech_review_sent_corpus.pkl", "rb") as f:
        tech_review_corpus = pickle.load(f)
    reviews = pd.DataFrame(tech_review_corpus).review.tolist()

    print("Total workers:", num_cpus)

    print("Tokenize the corpus")

    with open("../data/stop_words.json", "r") as f:
        stop_words = json.load(f)

    # Unigram + bigram counts; terms seen in fewer than 3 documents or in more
    # than 90% of documents are dropped.
    vectorizer = CountVectorizer(
        min_df=3, max_df=.90, tokenizer=tokenize, stop_words=stop_words, ngram_range=(1, 2))
    X = vectorizer.fit_transform(reviews)
    print("Total Vocab Size", len(vectorizer.vocabulary_))

    # Corpus-wide frequency of every term, looked up through its column index.
    sum_words = X.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vectorizer.vocabulary_.items()]
    print(sorted(words_freq, key=lambda x: x[1], reverse=True)[:50])

    # `vocabulary_` maps term -> column index, and its key order is not the
    # column order. Sort the terms by their index so that vocab[i] names column i
    # of X (and of lda.components_). Built once: it does not change per model.
    vocab_by_column = np.array(
        sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

    topics_range = range(15, 6, -1)
    alpha = list(np.arange(0.01, 1, 0.3))
    #alpha.append(None)
    beta = list(np.arange(0.01, 1, 0.3))
    #beta.append(None)

    # Full Cartesian grid over (k, alpha, beta), k varying slowest.
    parameters = [
        {"k": k, "alpha": a, "beta": b}
        for k in topics_range
        for a in alpha
        for b in beta
    ]

    print("Total parameter values to train", len(parameters))

    for param in tqdm(parameters):
        lda = LatentDirichletAllocation(
                learning_method="batch",
                random_state=100,
                n_components=param["k"],
                doc_topic_prior=param["alpha"],
                topic_word_prior=param["beta"],
                n_jobs=-2
            )
        lda.fit(X)
        # Mean u_mass coherence over the top 25 words of each topic.
        score = metric_coherence_gensim(measure='u_mass',
                        top_n=25,
                        topic_word_distrib=lda.components_,
                        dtm=X,
                        vocab=vocab_by_column,
                        return_mean=True)
        result.append({
            "k": param["k"],
            "alpha": param["alpha"],
            "beta": param["beta"],
            "score": score
        })

    print("Done!")


Loading tech corpus
Total workers: 8
Tokenize the corpus


  0%|                                                                                          | 0/225 [00:00<?, ?it/s]

Total Vocab Size 51828
[('work', 61850), ('good', 30138), ('wa', 17335), ('learn', 16445), ('management', 13353), ('place', 12317), ('great', 12001), ('job', 10333), ('employee', 8264), ('lot', 8210), ('environment', 7966), ('project', 7571), ('people', 7264), ('time', 7242), ('culture', 7189), ('life', 7123), ('opportunity', 6678), ('technology', 6068), ('part', 5862), ('balance', 5563), ('manager', 5515), ('place work', 5469), ('get', 5448), ('experience', 5350), ('good work', 5228), ('enjoy', 5183), ('work life', 5138), ('life balance', 4801), ('thing', 4389), ('year', 4388), ('ha', 4194), ('provide', 4187), ('many', 4104), ('like', 4060), ('one', 3966), ('benefit', 3892), ('train', 3857), ('also', 3852), ('skill', 3842), ('hard', 3771), ('work culture', 3552), ('well', 3422), ('co', 3409), ('friendly', 3385), ('support', 3357), ('fun', 3234), ('client', 3197), ('work environment', 3196), ('development', 3123), ('would', 3118)]
Total parameter values to train 225


100%|████████████████████████████████████████████████████████████████████████████| 225/225 [19:37:23<00:00, 313.97s/it]

Done!





In [14]:
import pandas as pd

# Write the LDA grid-search scores to disk (the row index is written as well).
pd.DataFrame(result).to_csv(
    path_or_buf="../data/lda_umass.csv",
)

In [58]:
# `result` is built as a plain list of dicts by the grid-search cell, and a list
# has no sort_values; wrap it in a DataFrame (harmless if it already is one).
# Best score first, ties broken by larger k.
pd.DataFrame(result).sort_values(by=['score', 'k'], ascending=False)

Unnamed: 0,k,alpha,beta,score
200,7,0.01,0.01,-3.448244
202,7,0.01,0.61,-3.451348
204,7,0.01,,-3.472829
203,7,0.01,0.91,-3.476927
201,7,0.01,0.31,-3.495599
...,...,...,...,...
63,13,0.61,0.91,-4.360425
12,15,0.61,0.61,-4.378900
7,15,0.31,0.61,-4.417690
8,15,0.31,0.91,-4.519471


### NMF

In [4]:
# Grid for the NMF runs: topic counts 15 down to 7, crossed with the alpha
# values produced by np.arange(0.01, 1, 0.3). Only k and alpha are varied here.
topics_range = range(15, 6, -1)
alpha = list(np.arange(0.01, 1, 0.3))

parameters = [
    {"k": n_topics, "alpha": alpha_value}
    for n_topics in topics_range
    for alpha_value in alpha
]



In [6]:
from sklearn.decomposition import NMF

# `vocabulary_` maps term -> column index, and its key order is not the column
# order. Sort the terms by their index so that vocab[i] names column i of X
# (and of nmf.components_). Built once: it does not change per model.
vocab_by_column = np.array(
    sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

# One dict per trained NMF model: {"k", "alpha", "score"}.
nmf_result = []
for param in tqdm(parameters):
    # NOTE(review): newer scikit-learn releases replaced NMF's `alpha` keyword
    # with `alpha_W` / `alpha_H` -- confirm against the installed version.
    nmf = NMF(n_components=param["k"],
              init='nndsvd',
              random_state=100,
              alpha=param["alpha"])

    nmf.fit(X)
    # Mean u_mass coherence over the top 100 words of each topic.
    # NOTE(review): the LDA run uses top_n=25 -- confirm the difference is
    # intended before comparing scores across the two models.
    score = metric_coherence_gensim(measure='u_mass',
                        top_n=100,
                        topic_word_distrib=nmf.components_,
                        dtm=X,
                        vocab=vocab_by_column,
                        return_mean=True)

    nmf_result.append({
        "k": param["k"],
        "alpha": param["alpha"],
        "score": score
    })

print("Done!")


100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [33:03<00:00, 55.10s/it]

Done!





In [7]:
# Write the NMF grid-search scores to disk (the row index is written as well).
pd.DataFrame(nmf_result).to_csv(
    path_or_buf="../data/nmf_umass.csv",
)

In [8]:
# Rebind nmf_result from the list of per-run dicts built by the NMF loop to a
# DataFrame, which the sort_values call in the following cell requires.
nmf_result = pd.DataFrame(nmf_result)

In [9]:
# Rank the NMF runs from highest to lowest u_mass score; ties are ordered by
# larger k first, since ascending=False applies to both keys.
nmf_result.sort_values(by=['score','k'],ascending=False)

Unnamed: 0,k,alpha,score
32,7,0.01,-3.623963
33,7,0.31,-3.623963
35,7,0.91,-3.624224
34,7,0.61,-3.626523
29,8,0.31,-3.696048
31,8,0.91,-3.698922
30,8,0.61,-3.699271
28,8,0.01,-3.722513
24,9,0.01,-3.865992
25,9,0.31,-3.865992


### GENSIM

In [1]:
from multiprocessing import Pool
from tqdm import tqdm
from spacy.lang.en import English
import gensim.corpora as corpora
from gensim.models import LdaMulticore
import multiprocessing as mp

import re
import numpy as np
import pandas as pd
from pprint import pprint
import pickle
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
from nltk.corpus import wordnet as wn
#nltk.download('stopwords')
#nltk.download('wordnet')
# spacy for lemmatization
import spacy
from spacy.lang.en import English
# spaCy English language object; tokenize() in the next cell calls it on raw text.
parser = English()

# Plotting tools

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Use all but one CPU for LdaMulticore, but never fewer than one worker:
# on a single-CPU host `mp.cpu_count() - 1` would be 0.
num_cpus = max(1, mp.cpu_count() - 1)

parser = English()

# NOTE(review): the directory is spelled "Data" here but "data" in the sklearn
# cells -- confirm which one exists on case-sensitive filesystems.
# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
with open("../Data/tech_review_sent_corpus.pkl","rb") as f:
    tech_review_corpus = pickle.load(f)

# One raw review string per entry, taken from the `review` column.
reviews = pd.DataFrame(tech_review_corpus).review.tolist()

def tokenize(text):
    """Return the lower-cased tokens of ``text``, skipping whitespace-only ones.

    ``text`` is run through the module-level ``parser``; every token whose
    ``orth_`` is not pure whitespace contributes its ``lower_`` form, in order.
    """
    return [
        token.lower_
        for token in parser(text)
        if not token.orth_.isspace()
    ]


In [3]:
def compute_coherence_values(param):
    """Train one gensim LDA model for a grid point and score it with u_mass.

    Parameters
    ----------
    param : dict
        Grid point with keys "k" (number of topics), "alpha" and "beta"
        (passed as gensim's ``alpha`` and ``eta``). An optional "workers"
        key sets the LdaMulticore worker count; when absent, the
        module-level ``num_cpus`` is used.

    Returns
    -------
    dict
        A new dict holding every entry of ``param`` plus "coherence".
        The input dict is left unmodified.

    Reads the module-level ``corpus``, ``id2word`` and ``processed_docs``.
    """
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=param["k"],
        random_state=100,
        chunksize=1000,
        # Honour the worker count carried by the grid point, so the value
        # written to the results file is the one that was actually used.
        workers=param.get("workers", num_cpus),
        passes=10,
        alpha=param["alpha"],
        eta=param["beta"],
        per_word_topics=True,
    )

    coherence_model_lda = CoherenceModel(
        model=lda_model, texts=processed_docs, dictionary=id2word, coherence='u_mass')

    # Return a copy rather than writing into the caller's dict.
    return {**param, "coherence": coherence_model_lda.get_coherence()}

if __name__ == "__main__":

    print("Total workers:", num_cpus)

    print("Tokenize the corpus")
    processed_docs = [tokenize(review) for review in tqdm(reviews)]

    # Token <-> id mapping over the whole corpus.
    id2word = corpora.Dictionary(processed_docs)
    print("Create a Dictionary")
    # Bag-of-words (token id, count) representation of every document.
    corpus = [id2word.doc2bow(text) for text in tqdm(processed_docs)]

    # Not read anywhere in the visible cells; kept in case a later cell uses it.
    grid = {}
    grid['Validation_Set'] = {}

    # Topic counts, iterated from max_topics down to min_topics + 1.
    min_topics = 6
    max_topics = 15
    step_size = 1
    topics_range = range(max_topics, min_topics, -step_size)
    # Document-topic prior: numeric values plus gensim's named presets.
    alpha = list(np.arange(0.01, 1, 0.3))
    alpha.append('symmetric')
    alpha.append('asymmetric')
    # Topic-word prior.
    beta = list(np.arange(0.01, 1, 0.3))
    beta.append('symmetric')

    # Full Cartesian grid over (k, alpha, beta), k varying slowest.
    parameters = [
        {"k": k, "alpha": a, "beta": b, "workers": 4}
        for k in topics_range
        for a in alpha
        for b in beta
    ]

    print("Running modeling")
    print("Total Paramters", len(parameters))

    results = [compute_coherence_values(param) for param in tqdm(parameters)]

    # DataFrame.to_csv returns None when given a path, so build the frame
    # first and keep a reference to it for the ranking cell that follows.
    gensim_umass = pd.DataFrame(results)
    gensim_umass.to_csv("../Data/gensim_umass.csv", index=False)

  0%|▏                                                                          | 325/156988 [00:00<00:48, 3219.12it/s]

Total workers: 7
Tokenize the corpus


100%|███████████████████████████████████████████████████████████████████████| 156988/156988 [00:09<00:00, 15890.33it/s]
  4%|██▋                                                                      | 5897/156988 [00:00<00:02, 58400.85it/s]

Create a Dictionary


100%|███████████████████████████████████████████████████████████████████████| 156988/156988 [00:02<00:00, 54435.28it/s]
  0%|                                                                                          | 0/300 [00:00<?, ?it/s]

Running modeling
Total Paramters 300


100%|████████████████████████████████████████████████████████████████████████████| 300/300 [29:09:34<00:00, 349.92s/it]


In [11]:
# The grid-search cell assigns the return value of to_csv(path) -- which is
# None -- to `gensim_umass`, so rank the scores from `results` directly.
pd.DataFrame(results).sort_values(by=['coherence'], ascending=False)

Unnamed: 0,k,alpha,beta,workers,coherence
212,8,0.01,0.61,4,-2.325531
241,7,0.01,0.31,4,-2.334568
244,7,0.01,symmetric,4,-2.353571
240,7,0.01,0.01,4,-2.359256
242,7,0.01,0.61,4,-2.384681
...,...,...,...,...,...
83,13,symmetric,0.91,4,-5.702495
118,12,asymmetric,0.91,4,-5.882181
58,14,asymmetric,0.91,4,-7.272288
88,13,asymmetric,0.91,4,-7.395710
