In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import time

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

import pyLDAvis

### Data Ingestion

In [2]:
# data needed for coherence calculation

# import entire dataset
f = open('coherence_vars20.sav', 'rb')

[corpus, id2word, docs] = pickle.load(f)
f.close()

# corpus - word frequency in docs
# id2word - dictionary
# docs - df["final_frqwds_removed"]

In [3]:
# input needed for LDA, NMF and LSA (all from Scikit-Learn) is one string per document (not a list of strings)

text = []

for abstract in docs:
    text.append(" ".join(abstract))

### NMF

In [4]:
# used for NMF

tfidf_vectorizer = TfidfVectorizer(max_df=0.6, min_df=20, lowercase=False, max_features=int(len(docs)/2))
tf_idf = tfidf_vectorizer.fit_transform(text)

In [5]:
# create model

num_topics = 5

t1 = time.time()
nmf_model = NMF(n_components=num_topics, random_state = 0)
doc_topic_dist = nmf_model.fit_transform(tf_idf)
t2 = time.time()
print(f"  Model time: {t2-t1}")

  Model time: 84.02853178977966


### Print Results

In [6]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i])  # printing out words corresponding to indices found in next line
                        #for i in topic.argsort()[:-top_n - 1:-1]])  # finding indices of top words in topic
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
        

In [7]:
print_topics(nmf_model, tfidf_vectorizer, 10)


Topic 0:
('cell', 6.9915583586220755)
('protein', 2.6336031174852836)
('gene', 2.0723343248964556)
('mouse', 1.7153221969646284)
('human', 1.2583518953120105)
('immune', 1.0954853045654442)
('dna', 1.0646155957095604)
('rna', 1.0410576496628265)
('receptor', 1.0303559897508985)
('tumor', 1.0030853450081831)

Topic 1:
('student', 3.616743843339292)
('science', 1.7502078072763467)
('faculty', 1.1919964662700873)
('graduate', 1.1111207712856004)
('career', 0.9677280034470301)
('school', 0.9666707000042173)
('undergraduate', 0.9543603089249312)
('engineering', 0.9502783611690483)
('education', 0.8921940748259414)
('university', 0.7950440350077744)

Topic 2:
('cancer', 6.148654072687778)
('tumor', 1.8757266040929046)
('breast', 1.7013216122057029)
('prostate', 1.1314854007146335)
('clinical', 1.0137939182891083)
('patient', 0.8543946757724442)
('lung', 0.5614307402284137)
('therapy', 0.5584461366038792)
('member', 0.4826841009291323)
('spore', 0.43750758316665106)

Topic 3:
('health', 1.94

### Us pyLDAvis to visualize results

In [8]:
topic_term_dist = nmf_model.components_
doc_len = docs.apply(len)
words = tfidf_vectorizer.get_feature_names()

In [9]:
# term_frequency is the number of times each term appears in the corpus

vectorizer = CountVectorizer(max_df=0.6, min_df=20, lowercase=False, max_features=int(len(docs)/2))
dtm = vectorizer.fit_transform(text)

# tfidf and count vectorizers produce same term lists 
#words == vectorizer.get_feature_names()  # True

In [10]:
# sum the columns of the doc-term matrix to get term_frequencies in corpus

term_freq = dtm.sum(axis=0).tolist()[0]

In [45]:
vis_data = pyLDAvis.prepare(topic_term_dists = topic_term_dist, doc_topic_dists = temp, 
                 doc_lengths = doc_len, vocab = words, term_frequency = term_freq, 
                 R=30, n_jobs=-1, sort_topics=False)

ValidationError: 
 * Not all rows (distributions) in doc_topic_dists sum to 1.

In [None]:
pyLDAvis.display(vis_data)

In [22]:
row_sums = np.array(doc_topic_dist.sum(axis=1))
row_sums.shape

(690814,)

In [24]:
row_sums

array([0.01248707, 0.01357924, 0.01195322, ..., 0.03435767, 0.02243898,
       0.01931037])

In [41]:
temp = doc_topic_dist / row_sums.reshape(-1,1)

In [37]:
sum(row_sums < 0.2)

690814

In [38]:
row_sums[row_sums == 0] = 1

In [40]:
sum(row_sums == 1)

21

In [42]:
temp[0]

array([0.18635153, 0.        , 0.07309583, 0.71103431, 0.02951833])

In [43]:
sum(temp[0])

1.0

In [44]:
doc_topic_dist[0]

array([0.00232698, 0.        , 0.00091275, 0.00887873, 0.0003686 ])

In [46]:
s = temp.sum(axis=1)

In [47]:
sum(s == 1)

535574

In [48]:
sum(s != 1)

155240

In [50]:
temp2 = s[s != 1]

In [54]:
temp2[3]

1.0000000000000002

In [1]:
# NMF is not probabilistic -- LDAvis is based on a probabilistic model, so NMF in LDAvis will not work