* This is a demo of topic modeling on text data.

* The data are downloaded from https://archive.ics.uci.edu/ml/datasets/bag+of+words ; I preprocessed them and saved the word counts as a sparse matrix (Matrix Market `.mtx` format).

In [1]:
from scipy import sparse
from scipy.io import mmread
import numpy as np
import time
from sklearn.decomposition import NMF
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn's convergence warnings
# for the short (max_iter=10) NMF run further down — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the preprocessed bag-of-words files. This is a
# machine-specific absolute path; it keeps its trailing "/" because the file
# names below are appended to it by plain string concatenation when loading.
data_dir = "/project/mstephens/zihao/text_dataset/"

# Corpus to analyse; it selects both the count matrix and the vocabulary file.
data_name = "nips"
count_name = f"docword.{data_name}.mtx"
vocab_name = f"vocab.{data_name}.txt"

## load counts and vocab

In [3]:
## load counts
# Read the document-by-word count matrix from its Matrix Market file and
# report how long the read took plus basic shape / sparsity statistics.
print("load counts")
start = time.time()
counts = mmread("{}{}".format(data_dir, count_name))
runtime = time.time() - start
# Fixed-point format: a bare "{:.3}" on a float falls back to scientific
# notation (e.g. "1.23e+02") once the elapsed time reaches 100 seconds.
print("data loaded after {:.2f}".format(runtime))
(n, p) = counts.shape
print("data shape: n_sample= {}; n_feature={}".format(n, p))
nnzero = len(np.nonzero(counts)[0])
# NOTE: despite the "percentage" label, the printed value is a fraction in [0, 1].
print("percentage of nonzero: {:.3}\n".format(nnzero/(n*p)))

## load vocab
# One word per line; only the first whitespace-separated token is kept.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped
# instead of raising IndexError on `line.split()[0]`.
print("load vocab")
with open("{}{}".format(data_dir, vocab_name), "r") as f:
    vocab = [line.split()[0] for line in f if line.strip()]
print("vocab size :{}".format(len(vocab)))

# vocab[j] is used later as the label of column j of `counts`, so the two
# sizes must agree; fail loudly here rather than mislabel words downstream.
assert len(vocab) == p, \
    "vocab has {} words but the count matrix has {} columns".format(len(vocab), p)

load counts
data loaded after 1.24
data shape: n_sample= 1500; n_feature=12419
percentage of nonzero: 0.0401

load vocab
vocab size :12419


## Fitting data with NMF

In [4]:
K = 5  # number of NMF components, i.e. topics
# Poisson-type NMF: KL-divergence loss with the multiplicative-update solver.
# random_state is fixed so that repeated runs give the same factorisation.
# NOTE(review): max_iter=10 keeps the demo fast, but the solver has presumably
# not converged by then — raise it for a real analysis.
model = NMF(n_components=K, init="nndsvda", beta_loss="kullback-leibler",
            solver="mu", max_iter=10, random_state=0)

start = time.time()
# fit_transform returns the loadings that were estimated jointly with the
# factors. Calling fit() and then transform() would solve for the loadings a
# second time (with the factors held fixed), doubling the work and returning
# a re-estimate instead of the fitted loadings.
L = model.fit_transform(counts)   # loadings: one row per document, K columns
F = model.components_.T           # factors: one row per word, K columns
runtime = time.time() - start
# Fixed-point format: a bare "{:.3}" switches to scientific notation once the
# elapsed time reaches 100 seconds.
print("fit finished after {:.2f} seconds".format(runtime))

def poisson2multinom(F, L):
    """Rescale a Poisson NMF fit (F, L) into multinomial topic-model form.

    Parameters
    ----------
    F : 2-D array whose columns are the factors (one column per topic).
    L : 2-D array whose columns are the matching loadings (one column per topic).

    Returns
    -------
    (Fnew, Lnew) where every column of ``Fnew`` sums to 1 and every row of
    ``Lnew`` sums to 1. The column totals removed from ``F`` are multiplied
    into the corresponding columns of ``L`` before its rows are normalised.
    """
    topic_totals = np.sum(F, axis=0)

    # Each factor column becomes a distribution over its rows.
    factors = F / topic_totals

    # Move the factor scale into the loadings, then normalise each row.
    scaled = L * topic_totals
    row_totals = np.sum(scaled, axis=1)
    loadings = scaled / row_totals[:, np.newaxis]

    return factors, loadings

# Rescale the fitted factors and loadings with poisson2multinom. F and L are
# overwritten, so from here on they hold the normalised (multinomial) versions.
print("turn poisson model to multinomial")
F,L = poisson2multinom(F,L)

fit finished after 5.03 seconds
turn poisson model to multinomial


## Displaying topics

In [5]:
# For every topic (column of F), list the words with the largest weights.
n_top = 10  # how many words to show per topic
for k, weights in enumerate(F.T):
    # argsort is ascending, so reverse it and keep the first n_top indices.
    ranked = np.argsort(weights)[::-1][:n_top]
    print(f"top words for topic {k + 1}:")
    for i in ranked:
        print(f"{vocab[i]}   :   {F[i, k]:.4}")
    print("\n")

top words for topic 1:
network   :   0.02581
neural   :   0.0113
input   :   0.01067
training   :   0.01013
set   :   0.01009
function   :   0.0089
unit   :   0.008733
error   :   0.008581
output   :   0.007797
weight   :   0.007779


top words for topic 2:
model   :   0.02404
data   :   0.01633
algorithm   :   0.01607
distribution   :   0.0102
function   :   0.008665
parameter   :   0.008155
method   :   0.007298
gaussian   :   0.006611
probability   :   0.006529
set   :   0.005862


top words for topic 3:
model   :   0.015
cell   :   0.01354
neuron   :   0.01255
input   :   0.009278
visual   :   0.008689
system   :   0.006589
signal   :   0.00641
response   :   0.005726
activity   :   0.005607
field   :   0.00554


top words for topic 4:
learning   :   0.02693
function   :   0.01956
neuron   :   0.01434
action   :   0.009677
circuit   :   0.00891
optimal   :   0.007701
chip   :   0.00739
algorithm   :   0.007196
current   :   0.007131
policy   :   0.007113


top words for topic 5:
le