In [25]:
import json
import gensim
from gensim.utils import simple_preprocess

data = json.load(open('example1.json'))['res']

In [26]:
words = [
    simple_preprocess(document) \
    for document in data
]

In [27]:
bigrams = gensim.models.Phrases(words,min_count=5,threshold=100)
trigrams = gensim.models.Phrases(bigrams[words],threshold=100)

bigram_mod = gensim.models.phrases.Phraser(bigrams)
trigram_mod = gensim.models.phrases.Phraser(trigrams)

In [28]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/will/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [30]:
import spacy

# Remove Stop Words
data_words_nostops = remove_stopwords(words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [31]:
data_lemmatized

[['cache_invalidation',
  'really',
  'hard',
  'problem',
  'computer',
  'science',
  'time',
  'recommend',
  'read',
  'original',
  'article',
  'https',
  'netflixtechblog',
  'com',
  'see',
  'hardware',
  'counter',
  'journey',
  'threefold',
  'performance',
  'increase',
  'hard',
  'problem',
  'error',
  'cache_invalidation',
  'question',
  'supertype',
  'cache',
  'share',
  'thread',
  'seem',
  'ideal',
  'candidate',
  'make',
  'thread',
  'local',
  'get',
  'rid',
  'cache_invalidation',
  'problem',
  'hard',
  'cache_invalidation',
  'name',
  'thing',
  'wonder',
  'much',
  'problem',
  'mitigate',
  'large',
  'cache',
  'work',
  'high',
  'level',
  'server',
  'slow',
  'remote',
  'block',
  'storage',
  'service',
  'several',
  'terabyte',
  'datum',
  'store',
  'expect',
  'ssd',
  'really',
  'difficult',
  'avoid',
  'cache',
  'miss',
  'cache',
  'large',
  'err',
  'side',
  'cache',
  'datum',
  'need',
  'cache',
  'quite',
  'easy',
  'manage

In [8]:


import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [23]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=6, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [24]:
lda_model.print_topics()

[(0,
  '0.001*"use" + 0.001*"cart" + 0.001*"make" + 0.001*"cache" + 0.001*"go" + 0.001*"get" + 0.001*"well" + 0.001*"try" + 0.001*"thread" + 0.001*"problem"'),
 (1,
  '0.001*"problem" + 0.001*"use" + 0.001*"cart" + 0.001*"cache" + 0.001*"make" + 0.001*"thread" + 0.001*"get" + 0.001*"well" + 0.001*"go" + 0.001*"https"'),
 (2,
  '0.015*"food" + 0.014*"https" + 0.012*"go" + 0.012*"good" + 0.012*"try" + 0.012*"com" + 0.010*"get" + 0.010*"escondido" + 0.009*"yellow" + 0.009*"deli"'),
 (3,
  '0.073*"problem" + 0.020*"com" + 0.014*"solve" + 0.014*"list" + 0.014*"island" + 0.014*"graph" + 0.012*"number" + 0.012*"link" + 0.010*"work" + 0.008*"love"'),
 (4,
  '0.001*"use" + 0.001*"problem" + 0.001*"cache" + 0.001*"make" + 0.001*"cart" + 0.001*"go" + 0.001*"well" + 0.001*"get" + 0.001*"time" + 0.001*"terpene"'),
 (5,
  '0.026*"cart" + 0.023*"use" + 0.017*"get" + 0.016*"make" + 0.012*"terpene" + 0.010*"go" + 0.009*"gram" + 0.008*"good" + 0.008*"well" + 0.008*"try"'),
 (6,
  '0.022*"cache" + 0.018*

In [48]:
from gensim.models import CoherenceModel
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [49]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
    pbar.close()

100%|██████████| 540/540 [49:23<00:00,  5.59s/it]

NameError: name 'pd' is not defined

In [51]:
import pandas as pd
pd.DataFrame(model_results).to_csv('./lda_tuning_results.csv', index=False)




In [55]:
df = pd.DataFrame(model_results)
df.sort_values('Coherence',ascending=False)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
148,75% Corpus,6,asymmetric,0.91,0.528213
478,100% Corpus,8,asymmetric,0.91,0.496492
248,75% Corpus,10,0.31,0.91,0.496396
263,75% Corpus,10,symmetric,0.91,0.486931
208,75% Corpus,8,asymmetric,0.91,0.482464
...,...,...,...,...,...
35,75% Corpus,3,0.31,0.01,0.247398
40,75% Corpus,3,0.61,0.01,0.246355
50,75% Corpus,3,symmetric,0.01,0.246355
45,75% Corpus,3,0.91,0.01,0.243320


In [56]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis


  from imp import reload


ModuleNotFoundError: No module named 'pyLDAvis.gensim'

In [32]:
import gsdmm

mgp = gsdmm.MovieGroupProcess(K=10, alpha=0.1, beta=.01, n_iters=30)
tot = set(x for d in texts for x in d)

mgp.fit(texts,len(tot))

def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(
            mgp.cluster_word_distribution[cluster].items(),
            key=lambda k: k[1],
            reverse=True,
        )[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — — ')

doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)


# Show the top 10 words in term frequency for each cluster 
top_words(mgp.cluster_word_distribution, top_index, 10)

In stage 0: transferred 9 clusters with 3 clusters populated
In stage 1: transferred 3 clusters with 3 clusters populated
In stage 2: transferred 2 clusters with 3 clusters populated
In stage 3: transferred 3 clusters with 3 clusters populated
In stage 4: transferred 3 clusters with 3 clusters populated
In stage 5: transferred 3 clusters with 3 clusters populated
In stage 6: transferred 2 clusters with 3 clusters populated
In stage 7: transferred 1 clusters with 3 clusters populated
In stage 8: transferred 3 clusters with 3 clusters populated
In stage 9: transferred 3 clusters with 3 clusters populated
In stage 10: transferred 3 clusters with 3 clusters populated
In stage 11: transferred 3 clusters with 3 clusters populated
In stage 12: transferred 3 clusters with 3 clusters populated
In stage 13: transferred 3 clusters with 3 clusters populated
In stage 14: transferred 3 clusters with 3 clusters populated
In stage 15: transferred 3 clusters with 3 clusters populated
In stage 16: trans

NameError: name 'np' is not defined

In [20]:
import pickle
pickle.dump(data_lemmatized,open('lemm_data.pkl','wb+'))