In [1]:
%matplotlib inline
import numpy as np
import sklearn.decomposition
import sklearn.feature_extraction
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import bottleneck as bn

In [2]:
# arbitrary 6x3 matrix (constructed from an example at http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html with a random column concatenated)
M = np.concatenate((np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]),map(lambda x : [x], np.round(np.random.random_sample(6),1)+1)),1)
M
H = sklearn.decomposition.NMF(n_components=2).fit(M).components_
W = sklearn.decomposition.NMF(n_components=2).fit_transform(M)
print H
print
print W

[[  3.09686070e+00   4.75751791e-01   6.94660147e-01]
 [  1.94409500e-06   8.45813280e-01   1.44194643e+00]]

[[ 0.32279775  1.01766758]
 [ 0.64653046  0.7001715 ]
 [ 0.96883113  0.85673611]
 [ 1.29088327  0.58274922]
 [ 1.61418045  0.10173439]
 [ 1.93796563 -0.        ]]


In [3]:
# a 20x4 example in which there are explicit types (that are unknown to the algorithm)
n = 20
types = np.trunc(np.random.random_sample(n)*3)
means = [[1,2,3,4],[1,5,4,1],[10,1,2,8]]
stddevs = 0.5 * np.ones([3,4])
def gen(t):
    mean = means[t]
    sd = stddevs[t]
    sz = len(mean)
    return list(np.random.randn(sz) * sd + mean)
# generate a data matrix
M = np.round(map(gen, map(int, types)),3)

M_tfidf = sklearn.feature_extraction.text.TfidfTransformer().fit_transform(M).toarray()
print M
print
print M_tfidf
print
print

# run NMF
fit = sklearn.decomposition.NMF(n_components=3).fit(M)
H = fit.components_
W = fit.transform(M)
#print H
results = list(np.concatenate((10*W,map(lambda x:[x],types)), 1))
results.sort(key = lambda row : row[3])
results = np.array(map(lambda row : list(np.round(row,3)), results))
print results
#print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), results)
# predicted clusters
print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), W)
# true clusters
print map(int, types)
# use fit from above to classify a new point
print fit.transform([[1,2,4,5]])

print
print

fit = sklearn.decomposition.NMF(n_components=3).fit(M_tfidf)
H = fit.components_
W = fit.transform(M_tfidf)
#print H
results = list(np.concatenate((10*W,map(lambda x:[x],types)), 1))
results.sort(key = lambda row : row[3])
results = np.array(map(lambda row : list(np.round(row,3)), results))
print results
#print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), results)
# predicted clusters
print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), W)
# true clusters
print map(int, types)

[[  9.421   0.407   2.343   7.722]
 [  8.856   0.957   2.105   7.88 ]
 [  0.593   2.335   3.139   3.23 ]
 [ 10.032   0.466   1.51    8.066]
 [  1.455   4.472   3.368   1.036]
 [  0.547   1.294   2.509   4.802]
 [  0.278   2.185   2.713   3.662]
 [  9.516   0.362   1.385   8.91 ]
 [ 10.439   0.974   2.46    7.899]
 [  0.955   1.937   2.043   3.288]
 [  9.447   0.332   1.315   8.366]
 [ 10.663   0.817   2.296   7.643]
 [  0.505   1.58    4.762   3.843]
 [  1.041   5.203   3.839   1.218]
 [  0.843   4.794   4.028   0.647]
 [ 10.037   0.591   1.634   8.088]
 [  1.174   2.9     2.486   4.455]
 [ 10.132   0.787   1.935   7.821]
 [  1.944   4.886   3.992   1.132]
 [  0.766   3.906   3.798   1.102]]

[[ 0.75906768  0.03279275  0.18877991  0.62217605]
 [ 0.73325427  0.07923717  0.17428865  0.65244395]
 [ 0.11609589  0.45713982  0.61454471  0.63236044]
 [ 0.77352845  0.03593144  0.11643022  0.62193784]
 [ 0.24759895  0.76100516  0.57313627  0.17629726]
 [ 0.09772855  0.23118967  0.44826498  0.85

In [4]:
%%time
# read in data from csv
noun_train_mat = np.loadtxt("noun_train_mat.csv", delimiter = ",")
# use TF-IDF to scale each document's vector to have norm 1 and place a lower weight on very common words
tf_idf_fit = sklearn.feature_extraction.text.TfidfTransformer().fit(noun_train_mat)
noun_train_mat = tf_idf_fit.transform(noun_train_mat).toarray()

Wall time: 41.8 s


In [5]:
%%time
# compute NMF fit
NMF_fit = sklearn.decomposition.NMF(n_components=14).fit(noun_train_mat)
H = NMF_fit.components_
W = NMF_fit.transform(noun_train_mat)
# contains a tuple (i,j) if document i is in cluster j, for each document
clusters = map(np.argmax, W)
# list of the documents in each cluster
cluster_lists = [[i for i,j in enumerate(clusters) if j==cluster] for cluster in range(14)]

Wall time: 1min 20s




In [6]:
# classify each document into the category that fits it best
print 'Number of documents per category:', [sum([x[1]==i for x in enumerate(clusters)]) for i in range(14)]

Number of documents per category: [97, 185, 73, 59, 68, 146, 41, 68, 88, 55, 35, 12, 26, 157]


In [7]:
# load vocab from csv
noun_vocab = np.loadtxt("noun_vocab.csv", delimiter=",", dtype="str")
noun_vocab = [(int(i),j) for i,j in noun_vocab]
id2noun = dict(noun_vocab)

In [8]:
# find the 5 most important words for each category
num_best = 5
best_indices = map(lambda v : list(bn.argpartsort(-v,num_best)[0:num_best]), H)
best_words = [[id2noun[i] for i in lst] for lst in best_indices]
best_words

[['appeal', 'court', 'district', 'petitioner', 'habea'],
 ['action', 'act', 'respondent', 'court', 'title'],
 ['union', 'labor', 'employee', 'board', 'employer'],
 ['search', 'warrant', 'police', 'officer', 'petitioner'],
 ['tax', 'income', 'revenue', 'property', 'busines'],
 ['jury', 'trial', 'sentence', 'defendant', 'offense'],
 ['carrier', 'railroad', 'icc', 'rate', 'commerce'],
 ['attorney', 'alien', 'brief', 'general', 'cause'],
 ['commission', 'act', 'price', 'sale', 'company'],
 ['contract', 'government', 'arbitration', 'agreement', 'contractor'],
 ['student', 'school', 'plan', 'board', 'education'],
 ['patent', 'art', 'invention', 'claim', 'royalty'],
 ['decree', 'orig ', 'master', 'entry', 'boundary'],
 ['court', 'law', 'statute', 'state', 'amendment']]

In [13]:
# read in SC Database's issue areas from csv
noun_train_issue_areas = np.loadtxt("noun_train_issue_areas.csv", delimiter = ",", dtype="int")

In [14]:
compare_mat = map(lambda r : map(int, r), np.zeros((14,14)))
for i,j in zip(clusters, noun_train_issue_areas - 1):
    compare_mat[i][j] += 1
compare_mat = map(lambda row : map(float,row) / sum(row), np.array(compare_mat))

assignments = -1 * np.ones(14)
compare_mat_flat = [x for lst in compare_mat for x in lst]
while min(assignments) == -1:
    max_ind = np.argmax(compare_mat_flat)
    row = max_ind / 14
    if assignments[row] == -1:
        column = max_ind - 14 * row
        assignments[row] = column
    else:
        compare_mat_flat[max_ind] = -1
#5 / 2
print map(int, assignments)
print map(np.argmax, compare_mat)
print map(np.argmax, np.array(compare_mat).T)

compare_mat

[8, 7, 6, 0, 7, 0, 7, 1, 7, 7, 1, 7, 10, 2]
[8, 7, 6, 0, 7, 0, 7, 1, 7, 7, 1, 7, 10, 2]
[3, 10, 13, 13, 13, 7, 2, 11, 0, 12, 12, 4, 4, 0]


[array([ 0.26804124,  0.16494845,  0.06185567,  0.03092784,  0.        ,
         0.        ,  0.01030928,  0.06185567,  0.36082474,  0.03092784,
         0.        ,  0.01030928,  0.        ,  0.        ]),
 array([ 0.03783784,  0.27567568,  0.05405405,  0.04324324,  0.01621622,
         0.        ,  0.01621622,  0.36756757,  0.15675676,  0.02702703,
         0.        ,  0.00540541,  0.        ,  0.        ]),
 array([ 0.01369863,  0.01369863,  0.09589041,  0.01369863,  0.01369863,
         0.        ,  0.61643836,  0.04109589,  0.08219178,  0.10958904,
         0.        ,  0.        ,  0.        ,  0.        ]),
 array([ 0.89830508,  0.05084746,  0.        ,  0.01694915,  0.01694915,
         0.        ,  0.        ,  0.        ,  0.01694915,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ]),
 array([ 0.13235294,  0.04411765,  0.        ,  0.        ,  0.        ,
         0.01470588,  0.        ,  0.38235294,  0.04411765,  0.02941176,
         0.        ,  

In [15]:
# can probably ignore this stuff -- it's a different way to figure out the best words in each category, but it's not working
# and produces nonsense
width = len(M[0])
best_words = np.zeros(14)
for i in range(14):
    # arbitrary high initial value
    w = np.ones(width) * 5000
    #w_val = np.ones(width)
    for j in range(width):
        for k in range(14):
            if i <> k and H[k][j] > 0:
                if w[j] >= float(H[i][j])/H[k][j]:
                    w[j] = float(H[i][j])/H[k][j]
    best_words[i] = np.argmax(w)
print map(lambda x : id2noun[int(x)], best_words)

['narcotic', 'fdca', 'middleman', 'narcotic', 'city the', 'middleman', 'city the', 'middleman', 'fdca', 'narcotic', 'city the', 'fdca', 'city the', 'city the']


In [16]:
%%time
# read in test data from csv
noun_test_mat = np.loadtxt("noun_test_mat.csv", delimiter = ",")
# use TF-IDF to scale each document's vector to have norm 1 and place a lower weight on very common words
noun_test_mat = tf_idf_fit.transform(noun_test_mat).toarray()

Wall time: 12.4 s


In [17]:
# use NMF fit from training data to cluster test observations
W_test = NMF_fit.transform(noun_test_mat)
clusters_test = map(np.argmax, W_test)
cluster_lists_test = [[i for i,j in enumerate(clusters_test) if j==cluster] for cluster in range(14)]

In [18]:
print cluster_lists_test

[[21, 32, 40, 41, 68, 76, 88, 91, 97, 98, 100, 101, 102, 107, 110, 130, 175, 205, 221, 224, 225, 226, 227, 254, 270, 271, 273, 288, 289, 312, 321, 324, 327, 335, 340, 364, 369, 381, 383, 391, 404, 423, 446, 457, 459], [3, 10, 13, 24, 30, 43, 48, 50, 75, 77, 80, 92, 104, 108, 112, 116, 135, 136, 139, 145, 152, 154, 165, 169, 179, 210, 233, 239, 242, 245, 246, 247, 264, 266, 267, 275, 276, 279, 295, 301, 305, 314, 315, 318, 320, 334, 337, 338, 343, 344, 345, 360, 371, 372, 374, 375, 378, 386, 387, 390, 394, 401, 403, 408, 409, 428, 431, 451, 458, 465, 470, 471, 473, 480, 481, 482, 484], [12, 19, 26, 59, 62, 64, 103, 120, 144, 149, 180, 182, 193, 194, 199, 240, 265, 272, 291, 304, 307, 309, 313, 319, 380, 389, 454, 460, 461, 467, 476], [22, 36, 54, 122, 132, 160, 162, 219, 228, 241, 244, 310, 347, 356, 368, 405, 407, 412, 424, 432, 453, 472], [6, 11, 23, 25, 33, 35, 38, 58, 83, 114, 115, 118, 119, 126, 129, 147, 148, 150, 166, 191, 197, 209, 211, 232, 252, 253, 269, 285, 290, 303, 339, 37