# Non-negative Matrix Factorization (NMF)

- NYT corpus
- NMF vs. SVD 
    - NMF is more parallelizable, can handle missing values, lets you define your own loss function, and restricts the factors to non-negative values

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the pickled articles table; its 'content' column is what gets vectorized below.
# NOTE(review): read_pickle can execute arbitrary code while unpickling -- only load trusted files.
df = pd.read_pickle('data/articles.pkl')

In [4]:
# Bag-of-words term counts over the article text: English stop words are dropped
# and the vocabulary is capped at 5000 features.
cvectorizer = CountVectorizer(
    max_features=5000,
    stop_words='english',
)
X = cvectorizer.fit_transform(df['content'].values)

In [5]:
# Vocabulary lookup: feature_words[i] is the word behind column i of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), so use whichever this version provides.
if hasattr(cvectorizer, 'get_feature_names_out'):
    feature_words = cvectorizer.get_feature_names_out()
else:
    feature_words = cvectorizer.get_feature_names()

## My NMF implementation

In [14]:
from numpy.random import rand, RandomState
from numpy import array, matrix, linalg

In [15]:
def reconst_mse(target, left, right):
    """Return the mean squared error between `target` and `left.dot(right)`.

    The element-wise residual is converted with `array(...)` before squaring,
    so the square is applied per element even if the subtraction yields a
    matrix-like object.
    """
    approximation = left.dot(right)
    residual = array(target - approximation)
    squared_residual = residual ** 2
    return squared_residual.mean()

In [16]:
def my_nmf(document_term_mat, n_components=15, n_iterations=50, eps=1e-6,
           random_state=None):
    """Approximate `document_term_mat` as `W.dot(H)` by alternating least squares.

    Each iteration solves an unconstrained least-squares problem for H (with W
    fixed) and then for W (with H fixed), clipping every entry from below at
    `eps` so that both factors stay strictly positive.

    Parameters
    ----------
    document_term_mat : sparse matrix or array-like, shape (n_rows, n_cols)
        Matrix to factorize. Anything exposing `.toarray()` is densified with
        it; everything else is converted with `array(...)`.
    n_components : int
        Inner dimension of the factorization.
    n_iterations : int
        Number of alternating H/W updates.
    eps : float
        Lower bound applied to every entry of H and W after each solve.
    random_state : None, int or numpy.random.RandomState
        Source of the random initial factors. `None` (the default) draws from
        NumPy's global RNG, as before; an int seeds a private `RandomState`;
        a `RandomState` instance is used as-is.

    Returns
    -------
    (W, H) : tuple of ndarrays with shapes (n_rows, n_components) and
        (n_components, n_cols).
    """
    # linalg.lstsq doesn't work on sparse mats; also accept inputs that are
    # already dense, and use plain ndarrays rather than np.matrix.
    if hasattr(document_term_mat, "toarray"):
        dense_document_term_mat = document_term_mat.toarray()
    else:
        dense_document_term_mat = array(document_term_mat)
    n_rows, n_cols = dense_document_term_mat.shape

    if random_state is None:
        draw = rand
    elif isinstance(random_state, RandomState):
        draw = random_state.rand
    else:
        draw = RandomState(random_state).rand

    W = draw(n_rows, n_components)
    # H is overwritten by the first update; the random value is only returned
    # when n_iterations == 0.
    H = draw(n_components, n_cols)

    for _ in range(n_iterations):
        H = linalg.lstsq(W, dense_document_term_mat)[0].clip(eps)
        W = linalg.lstsq(H.T, dense_document_term_mat.T)[0].clip(eps).T
    return array(W), array(H)

In [17]:
def describe_nmf_results(document_term_mat, W, H, n_top_words=15, feature_names=None):
    """Print the reconstruction error and the highest-weighted words per topic.

    Parameters
    ----------
    document_term_mat : matrix that was factorized; passed to `reconst_mse`
        as the reconstruction target.
    W, H : the two factors, passed to `reconst_mse` as left and right. Each
        row of H is treated as one topic whose entries are indexed by
        vocabulary position.
    n_top_words : int
        Number of words printed for each topic.
    feature_names : sequence of str, optional
        Vocabulary indexed by column of H. Defaults to the notebook-level
        `feature_words`.
    """
    if feature_names is None:
        feature_names = feature_words
    # Format inside the call: `print("...") % value` only works as a Python 2
    # print statement; in Python 3 it applies % to print's None return value.
    print("Reconstruction error: %f" % reconst_mse(document_term_mat, W, H))
    for topic_num, topic in enumerate(H):
        print("Topic %d:" % topic_num)
        # argsort is ascending, so walking backwards from the end yields the
        # n_top_words largest weights, heaviest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_indices]))

In [18]:
# Number of topics (factorization components) used by both the hand-written
# and the scikit-learn NMF runs below.
n_topics = 7

In [19]:
# my_nmf starts from random factors drawn from NumPy's global RNG; seed it so
# that rerunning the notebook reproduces the same topics.
np.random.seed(1)
my_W, my_H = my_nmf(X, n_components=n_topics, n_iterations=50, eps=1e-6)

In [20]:
# Report reconstruction error and top words per topic for the hand-written factorization.
describe_nmf_results(X, my_W, my_H)

Reconstruction error: 0.582826
Topic 0:
gun child firearm year death law state said accidental time shooting accident old safe adult
Topic 1:
game season team said year player time yankee league play run win point yard hit
Topic 2:
republican house government health law care party shutdown senate obama president congress democrat federal insurance
Topic 3:
said year government official percent people company group country united state attack american day 000
Topic 4:
new work like year davis company york city people ms woman state time story world
Topic 5:
mr said party like year music case reid court night netanyahu leader rouhani political time
Topic 6:
iran united nuclear rouhani state obama president syria weapon nation iranian israel netanyahu american chemical


## Sklearn's NMF implementation

In [121]:
from sklearn.decomposition import NMF

In [22]:
# Fit scikit-learn's NMF on the same count matrix with the same number of
# topics; the seed is fixed so the fit is repeatable.
nmf = NMF(
    random_state=1,
    n_components=n_topics,
)
# fit_transform returns the left factor; components_ holds the right factor
# whose rows are the topics.
W = nmf.fit_transform(X)
H = nmf.components_

In [23]:
# Same report for the scikit-learn factorization, for side-by-side comparison.
describe_nmf_results(X, W, H)

Reconstruction error: 0.581763
Topic 0:
mr said like party year case political time leader new music night work court member
Topic 1:
game season team year player said time yankee league play run win point like yard
Topic 2:
republican government house health law care party shutdown president senate obama congress democrat federal vote
Topic 3:
gun child year firearm death said law state accidental time shooting accident old safe adult
Topic 4:
new work year like company york people city ms davis state woman time job world
Topic 5:
said year government official people percent group country day state united 000 company attack month
Topic 6:
iran united nuclear rouhani state obama president nation iranian syria weapon israel netanyahu american country


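#### The results are similar

Both implementations recover the same seven topics (in a different order) with nearly identical reconstruction error (0.5828 vs. 0.5818).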