In [1]:
%matplotlib inline
import numpy as np
import sklearn.decomposition
import sklearn.feature_extraction
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import bottleneck as bn

In [2]:
# arbitrary 6x3 matrix (constructed from an example at http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html with a random column concatenated)
M = np.concatenate((np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]),map(lambda x : [x], np.round(np.random.random_sample(6),1)+1)),1)
M
H = sklearn.decomposition.NMF(n_components=2).fit(M).components_
W = sklearn.decomposition.NMF(n_components=2).fit_transform(M)
print H
print
print W

[[ 3.04082109  0.50448234  0.91513758]
 [ 0.          0.95532141  0.90909083]]

[[ 0.33143314  0.92619924]
 [ 0.66168676  0.78120864]
 [ 0.97507615  0.49745405]
 [ 1.31486536  0.33994089]
 [ 1.64976882  0.08157003]
 [ 1.97286978 -0.        ]]


In [3]:
# a 20x4 example in which there are explicit types (that are unknown to the algorithm)
n = 20
types = np.trunc(np.random.random_sample(n)*3)
means = [[1,2,3,4],[1,5,4,1],[10,1,2,8]]
stddevs = 0.5 * np.ones([3,4])
def gen(t):
    mean = means[t]
    sd = stddevs[t]
    sz = len(mean)
    return list(np.random.randn(sz) * sd + mean)
# generate a data matrix
M = np.round(map(gen, map(int, types)),3)

M_tfidf = sklearn.feature_extraction.text.TfidfTransformer().fit_transform(M).toarray()
print M
print
print M_tfidf
print
print

# run NMF
fit = sklearn.decomposition.NMF(n_components=3).fit(M)
H = fit.components_
W = fit.transform(M)
#print H
results = list(np.concatenate((10*W,map(lambda x:[x],types)), 1))
results.sort(key = lambda row : row[3])
results = np.array(map(lambda row : list(np.round(row,3)), results))
print results
#print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), results)
# predicted clusters
print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), W)
# true clusters
print map(int, types)
# use fit from above to classify a new point
print fit.transform([[1,2,4,5]])

print
print

fit = sklearn.decomposition.NMF(n_components=3).fit(M_tfidf)
H = fit.components_
W = fit.transform(M_tfidf)
#print H
results = list(np.concatenate((10*W,map(lambda x:[x],types)), 1))
results.sort(key = lambda row : row[3])
results = np.array(map(lambda row : list(np.round(row,3)), results))
print results
#print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), results)
# predicted clusters
print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), W)
# true clusters
print map(int, types)

[[ -0.076   2.453   2.624   3.525]
 [  1.378   2.172   1.877   4.466]
 [  1.341   2.703   2.927   3.301]
 [  0.577   2.142   2.858   4.498]
 [  1.133   1.996   2.054   4.222]
 [ 10.546  -0.106   1.892   8.022]
 [  0.533   4.966   4.098   0.895]
 [  1.135   4.198   5.239   0.605]
 [  1.285   5.075   3.301   0.059]
 [  0.661   5.202   4.288   0.935]
 [ 10.801   0.461   2.413   7.35 ]
 [  9.816   0.594   1.737   7.998]
 [  1.542   5.036   5.683   0.576]
 [ -0.515   5.379   3.726   1.052]
 [  1.785   5.362   4.026   0.962]
 [  0.593   1.659   3.477   3.508]
 [  0.441   4.303   3.743   1.513]
 [  0.577   2.626   2.868   4.019]
 [  1.      4.923   4.458   1.311]
 [  1.274   2.745   3.452   4.477]]

[[-0.01509947  0.4873554   0.52132922  0.70033746]
 [ 0.2512326   0.39599217  0.3422087   0.81422699]
 [ 0.25089134  0.50571162  0.54762039  0.61759307]
 [ 0.09995743  0.37107248  0.49510979  0.77921757]
 [ 0.21679732  0.38193067  0.39302886  0.8078714 ]
 [ 0.7878897  -0.00791924  0.14135097  0.59

ValueError: Negative values in data passed to NMF.fit

In [4]:
%%time
# read in data from csv
noun_train_mat = np.loadtxt("noun_train_mat.csv", delimiter = ",")
# use TF-IDF to scale each document's vector to have norm 1 and place a lower weight on very common words
tf_idf_fit = sklearn.feature_extraction.text.TfidfTransformer().fit(noun_train_mat)
noun_train_mat = tf_idf_fit.transform(noun_train_mat).toarray()

Wall time: 45.2 s


In [122]:
%%time
# compute NMF fit
NMF_fit = sklearn.decomposition.NMF(n_components=14).fit(noun_train_mat)
H = NMF_fit.components_
W = NMF_fit.transform(noun_train_mat)
# contains a tuple (i,j) if document i is in cluster j, for each document
clusters = map(np.argmax, W)
# list of the documents in each cluster
cluster_lists = [[i for i,j in enumerate(clusters) if j==cluster] for cluster in range(14)]

Wall time: 30.2 s


In [42]:
# classify each document into the category that fits it best
print 'Number of documents per category:', [sum([x[1]==i for x in enumerate(clusters)]) for i in range(14)]

Number of documents per category: [110, 81, 58, 29, 167, 64, 54, 84, 84, 64, 38, 126, 115, 77]


In [43]:
# load vocab from csv
noun_vocab = np.loadtxt("noun_vocab.csv", delimiter=",", dtype="str")
noun_vocab = [(int(i),j) for i,j in noun_vocab]
id2noun = dict(noun_vocab)

In [44]:
# find the 5 most important words for each category
num_best = 5
best_indices = map(lambda v : list(bn.argpartsort(-v,num_best)[0:num_best]), H)
best_words = [[id2noun[i] for i in lst] for lst in best_indices]
best_words

[['court', 'appeal', 'district', 'jurisdiction', 'judgment'],
 ['employee', 'board', 'labor', 'union', 'relation'],
 ['commerce', 'tax', 'income', 'revenue', 'taxpayer'],
 ['decree', 'master', 'orig ', 'entry', 'boundary'],
 ['amendment', 'court', 'law', 'state', 'statute'],
 ['sentence', 'offense', 'conviction', 'guideline', 'death'],
 ['water', 'land', 'tribe', 'act', 'congres'],
 ['search', 'officer', 'warrant', 'respondent', 'police'],
 ['commission', 'carrier', 'commerce', 'gas', 'act'],
 ['property', 'bankruptcy', 'lien', 'debtor', 'bank'],
 ['child', 'school', 'board', 'student', 'children'],
 ['act', 'employee', 'respondent', 'action', 'liability'],
 ['trial', 'petitioner', 'jury', 'counsel', 'evidence'],
 ['plan', 'benefit', 'security', 'provision', 'insurance']]

In [113]:
compare_mat = map(lambda r : map(int, r), np.zeros((14,14)))
for i,j in zip(clusters, noun_train_issue_areas - 1):
    compare_mat[i][j] += 1
compare_mat = map(lambda row : map(float,row) / sum(row), np.array(compare_mat))

assignments = -1 * np.ones(14)
compare_mat_flat = [x for lst in compare_mat for x in lst]
while min(assignments) == -1:
    max_ind = np.argmax(compare_mat_flat)
    row = max_ind / 14
    if assignments[row] == -1:
        column = max_ind - 14 * row
        assignments[row] = column
    else:
        compare_mat_flat[max_ind] = -1
#5 / 2
print map(int, assignments)
print map(np.argmax, compare_mat)
print map(np.argmax, np.array(compare_mat).T)

compare_mat

[8, 6, 7, 10, 2, 0, 7, 0, 7, 7, 1, 7, 0, 7]
[8, 6, 7, 10, 2, 0, 7, 0, 7, 7, 1, 7, 0, 7]
[5, 10, 10, 9, 1, 0, 1, 8, 0, 3, 3, 2, 6, 0]


[array([ 0.2       ,  0.19090909,  0.04545455,  0.        ,  0.00909091,
         0.03636364,  0.00909091,  0.1       ,  0.39090909,  0.01818182,
         0.        ,  0.        ,  0.        ,  0.        ]),
 array([ 0.02469136,  0.01234568,  0.02469136,  0.01234568,  0.02469136,
         0.01234568,  0.61728395,  0.0617284 ,  0.11111111,  0.09876543,
         0.        ,  0.        ,  0.        ,  0.        ]),
 array([ 0.12068966,  0.06896552,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.4137931 ,  0.06896552,  0.03448276,
         0.        ,  0.27586207,  0.01724138,  0.        ]),
 array([ 0.        ,  0.10344828,  0.03448276,  0.03448276,  0.        ,
         0.        ,  0.        ,  0.10344828,  0.10344828,  0.20689655,
         0.4137931 ,  0.        ,  0.        ,  0.        ]),
 array([ 0.13173653,  0.18562874,  0.20958084,  0.04790419,  0.01796407,
         0.01796407,  0.        ,  0.08982036,  0.16766467,  0.1257485 ,
         0.        ,  

In [116]:
# can probably ignore this stuff -- it's a different way to figure out the best words in each category, but it's not working
# and produces nonsense
width = len(M[0])
best_words = np.zeros(14)
for i in range(14):
    # arbitrary high initial value
    w = np.ones(width) * 5000
    #w_val = np.ones(width)
    for j in range(width):
        for k in range(14):
            if i <> k and H[k][j] > 0:
                if w[j] >= float(H[i][j])/H[k][j]:
                    w[j] = float(H[i][j])/H[k][j]
    best_words[i] = np.argmax(w)
print map(lambda x : id2noun[int(x)], best_words)

['narcotic', 'middleman', 'middleman', 'city the', 'city the', 'middleman', 'city the', 'narcotic', 'fdca', 'city the', 'city the', 'middleman', 'city the', 'fdca']


In [123]:
%%time
# read in test data from csv
noun_test_mat = np.loadtxt("noun_test_mat.csv", delimiter = ",")
# use TF-IDF to scale each document's vector to have norm 1 and place a lower weight on very common words
noun_test_mat = tf_idf_fit.transform(noun_test_mat).toarray()

Wall time: 17.8 s


In [131]:
# use NMF fit from training data to cluster test observations
W_test = NMF_fit.transform(noun_test_mat)
clusters_test = map(np.argmax, W_test)
cluster_lists_test = [[i for i,j in enumerate(clusters_test) if j==cluster] for cluster in range(14)]

In [130]:
print cluster_lists_test

[[0, 16, 29, 37, 40, 55, 69, 75, 76, 77, 79, 81, 88, 89, 99, 105, 118, 138, 164, 168, 179, 184, 188, 189, 201, 208, 218, 233, 250, 267, 271, 275, 284, 292, 321, 325, 336, 341, 349, 355, 358, 389, 396, 407], [28, 35, 38, 109, 113, 124, 125, 152, 157, 159, 177, 195, 203, 212, 219, 240, 246, 291, 302, 303, 310, 312, 319, 327, 360, 379, 416], [5, 8, 21, 24, 34, 57, 87, 104, 121, 145, 146, 174, 200, 254, 264, 268, 269, 272, 317, 335, 426, 428, 439, 441], [15, 82, 92, 123, 181, 241, 427], [3, 14, 22, 25, 30, 32, 42, 44, 46, 52, 71, 73, 91, 100, 101, 112, 114, 116, 130, 132, 141, 150, 158, 160, 165, 167, 169, 171, 186, 187, 194, 198, 204, 210, 221, 223, 225, 227, 234, 236, 239, 251, 257, 259, 261, 280, 308, 315, 318, 320, 330, 339, 340, 342, 345, 378, 382, 385, 394, 397, 399, 406, 412, 414, 422, 423, 429, 431, 438, 440], [9, 26, 58, 78, 97, 131, 149, 178, 190, 199, 202, 209, 222, 237, 258, 260, 265, 279, 298, 304, 348, 366, 372, 386, 408, 410], [11, 49, 102, 110, 129, 139, 276, 299, 324, 332,