In [2]:
%matplotlib inline
import numpy as np
import sklearn.decomposition
import sklearn.feature_extraction
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import bottleneck as bn

In [3]:
# arbitrary 6x3 matrix (constructed from an example at http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html with a random column concatenated)
M = np.concatenate((np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]),map(lambda x : [x], np.round(np.random.random_sample(6),1)+1)),1)
M
H = sklearn.decomposition.NMF(n_components=2).fit(M).components_
W = sklearn.decomposition.NMF(n_components=2).fit_transform(M)
print H
print
print W

[[ 3.07482336  0.45068959  0.77361709]
 [ 0.          0.97165209  1.21900555]]

[[ 0.32248028  0.71847941]
 [ 0.65056118  0.73251539]
 [ 0.9764969   0.82779544]
 [ 1.30330612  0.56177196]
 [ 1.62465347 -0.        ]
 [ 1.95092579  0.09280544]]


In [4]:
# a 20x4 example in which there are explicit types (that are unknown to the algorithm)
n = 20
types = np.trunc(np.random.random_sample(n)*3)
means = [[1,2,3,4],[1,5,4,1],[10,1,2,8]]
stddevs = 0.5 * np.ones([3,4])
def gen(t):
    mean = means[t]
    sd = stddevs[t]
    sz = len(mean)
    return list(np.random.randn(sz) * sd + mean)
# generate a data matrix
M = np.round(map(gen, map(int, types)),3)

M_tfidf = sklearn.feature_extraction.text.TfidfTransformer().fit_transform(M).toarray()
print M
print
print M_tfidf
print
print

# run NMF
fit = sklearn.decomposition.NMF(n_components=3).fit(M)
H = fit.components_
W = fit.transform(M)
#print H
results = list(np.concatenate((10*W,map(lambda x:[x],types)), 1))
results.sort(key = lambda row : row[3])
results = np.array(map(lambda row : list(np.round(row,3)), results))
print results
#print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), results)
# predicted clusters
print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), W)
# true clusters
print map(int, types)
# use fit from above to classify a new point
print fit.transform([[1,2,4,5]])

print
print

fit = sklearn.decomposition.NMF(n_components=3).fit(M_tfidf)
H = fit.components_
W = fit.transform(M_tfidf)
#print H
results = list(np.concatenate((10*W,map(lambda x:[x],types)), 1))
results.sort(key = lambda row : row[3])
results = np.array(map(lambda row : list(np.round(row,3)), results))
print results
#print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), results)
# predicted clusters
print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), W)
# true clusters
print map(int, types)

[[  9.765   1.056   2.006   7.364]
 [  1.093   1.709   3.668   3.843]
 [  1.073   2.829   2.908   3.373]
 [ 10.009   0.776   1.864   7.537]
 [  9.836   0.746   2.556   8.405]
 [  9.506   0.247   1.486   7.809]
 [  0.907   2.303   2.934   4.137]
 [  9.924   0.81    1.19    8.159]
 [  0.68    1.667   2.758   3.854]
 [  8.266   0.659   1.633   8.18 ]
 [ 11.108   1.151   1.266   7.804]
 [ -0.012   5.182   4.274   0.99 ]
 [  9.99    1.43    1.487   7.603]
 [  0.958   4.137   4.366   1.021]
 [  0.613   1.511   4.101   4.063]
 [  1.13    2.417   2.68    3.508]
 [  1.138   2.017   3.046   3.967]
 [  9.685   0.622   3.102   7.705]
 [  1.146   1.745   2.604   3.718]
 [  0.805   2.019   3.369   3.315]]

[[ 0.78504574  0.08489588  0.16127002  0.59202015]
 [ 0.19220391  0.30052743  0.64501733  0.67579106]
 [ 0.19929177  0.52543936  0.54011228  0.62647824]
 [ 0.78866512  0.06114538  0.14687499  0.59388241]
 [ 0.74463763  0.05647618  0.19350283  0.6363033 ]
 [ 0.76697782  0.01992884  0.11989575  0.63

ValueError: Negative values in data passed to NMF.fit

In [5]:
%%time
# read in data from csv
M = np.loadtxt("mat.csv", delimiter = ",")

Wall time: 1min 17s


In [6]:
%%time
# use TF-IDF to scale each document's vector to have norm 1 
M_tfidf = sklearn.feature_extraction.text.TfidfTransformer().fit_transform(M).toarray()

Wall time: 746 ms


In [7]:
%%time
fit = sklearn.decomposition.NMF(n_components=14).fit(M_tfidf)
H = fit.components_
W = fit.transform(M)

Wall time: 1min 33s




In [24]:
# classify each document into the category that fits it best
clusters = [(i,j) for i,j in enumerate(map(np.argmax, W))]
print 'Number of documents per category:', [sum([x[1]==i for x in clusters]) for i in range(14)]
print
print [(x,y) for x,y in clusters if y == 10]

Number of documents per category: [301, 94, 99, 151, 304, 57, 218, 116, 141, 35, 65, 41, 30, 52]

[(2, 10), (23, 10), (99, 10), (107, 10), (125, 10), (144, 10), (196, 10), (207, 10), (242, 10), (254, 10), (310, 10), (326, 10), (342, 10), (382, 10), (393, 10), (429, 10), (435, 10), (454, 10), (473, 10), (481, 10), (505, 10), (532, 10), (541, 10), (543, 10), (610, 10), (641, 10), (668, 10), (682, 10), (770, 10), (844, 10), (869, 10), (870, 10), (883, 10), (887, 10), (897, 10), (917, 10), (960, 10), (969, 10), (1008, 10), (1021, 10), (1042, 10), (1057, 10), (1097, 10), (1119, 10), (1139, 10), (1140, 10), (1148, 10), (1153, 10), (1163, 10), (1178, 10), (1254, 10), (1283, 10), (1286, 10), (1358, 10), (1406, 10), (1410, 10), (1428, 10), (1442, 10), (1489, 10), (1543, 10), (1582, 10), (1611, 10), (1647, 10), (1653, 10), (1677, 10)]


In [115]:
%%time
width = len(M[0])
best_words = np.zeros(14)
for i in range(14):
    w = np.ones(width) * 5000
    #w_val = np.ones(width)
    for j in range(width):
        for k in range(14):
            if i <> k and H[k][j] > 0:
                if w[j] >= float(H[i][j])/H[k][j]:
                    w[j] = float(H[i][j])/H[k][j]
    best_words[i] = np.argmax(w)
print map(int, best_words)

[407, 580, 371, 374, 915, 1053, 10, 124, 81, 483, 80, 594, 2037, 207]
Wall time: 5.62 s


In [10]:
# output the indices of the 5 most important words for each category
#print map(np.argmax, H)
num_best = 5
print map(lambda v : list(bn.argpartsort(-v,num_best)[0:num_best]), H)

[[12504, 4478, 11714, 1097, 11506], [9864, 580, 11520, 4206, 11309], [13365, 11700, 13170, 5498, 1283], [4503, 955, 4504, 11167, 9748], [6666, 12504, 3721, 13610, 10459], [12786, 5053, 7679, 10869, 13263], [11137, 183, 12504, 1631, 11264], [2032, 11264, 11202, 8839, 6830], [4503, 8705, 720, 6028, 13198], [10541, 4206, 4494, 13834, 1828], [12372, 7580, 8577, 612, 11224], [11214, 5098, 2172, 12622, 9520], [7304, 355, 10144, 10983, 6712], [11463, 12171, 2625, 4860, 6120]]
