In [1]:
%matplotlib inline
import numpy as np
import sklearn.decomposition
import sklearn.feature_extraction
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import bottleneck as bn

### Fake Testing

In [2]:
# arbitrary 6x3 matrix (constructed from an example at http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html with a random column concatenated)
M = np.concatenate((np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]),map(lambda x : [x], np.round(np.random.random_sample(6),1)+1)),1)
M
H = sklearn.decomposition.NMF(n_components=2).fit(M).components_
W = sklearn.decomposition.NMF(n_components=2).fit_transform(M)
print H
print
print W

[[ 3.07519602  0.44741457  0.77929221]
 [ 0.          0.96806042  1.1249227 ]]

[[ 0.328143    1.01084916]
 [ 0.64770545  0.61618696]
 [ 0.97154435  0.61351523]
 [ 1.30079584  0.43170723]
 [ 1.62395529 -0.        ]
 [ 1.95505003  0.29763123]]


In [3]:
# a 20x4 example in which there are explicit types (that are unknown to the algorithm)
n = 20
# hidden type of every row: 0., 1. or 2. (kept as floats, converted with int() where needed)
types = np.trunc(np.random.random_sample(n)*3)
# per-type mean and standard deviation of each of the 4 features
means = [[1,2,3,4],[1,5,4,1],[10,1,2,8]]
stddevs = 0.5 * np.ones([3,4])
def gen(t):
    """Draw one non-negative sample row for type `t`.

    t -- integer index into `means` / `stddevs`.

    Returns a list with one value per feature, drawn from a normal
    distribution with the type's mean and standard deviation and clamped
    at zero.
    """
    mean = means[t]
    sd = stddevs[t]
    sz = len(mean)
    sample = np.random.randn(sz) * sd + mean
    # With mean 1 and sd 0.5 a draw is occasionally negative; NMF only accepts
    # non-negative input, so clamp at zero instead of letting the cell fail
    # on an unlucky run.
    return list(np.maximum(sample, 0))
# generate a data matrix
M = np.round([gen(int(t)) for t in types], 3)

M_tfidf = sklearn.feature_extraction.text.TfidfTransformer().fit_transform(M).toarray()
print M
print
print M_tfidf
print
print

# run NMF
fit = sklearn.decomposition.NMF(n_components=3).fit(M)
H = fit.components_
W = fit.transform(M)
#print H
results = list(np.concatenate((10*W,map(lambda x:[x],types)), 1))
results.sort(key = lambda row : row[3])
results = np.array(map(lambda row : list(np.round(row,3)), results))
print results
#print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), results)
# predicted clusters
print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), W)
# true clusters
print map(int, types)
# use fit from above to classify a new point
print fit.transform([[1,2,4,5]])

print
print

fit = sklearn.decomposition.NMF(n_components=3).fit(M_tfidf)
H = fit.components_
W = fit.transform(M_tfidf)
#print H
results = list(np.concatenate((10*W,map(lambda x:[x],types)), 1))
results.sort(key = lambda row : row[3])
results = np.array(map(lambda row : list(np.round(row,3)), results))
print results
#print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), results)
# predicted clusters
print map(lambda row : np.argmax(np.round(list(row[0:3]),3)), W)
# true clusters
print map(int, types)

[[  0.072   4.956   4.011   0.802]
 [  9.865   0.675   2.673   8.355]
 [ 10.05    0.036   2.364   8.224]
 [  0.35    1.633   2.786   3.934]
 [  0.723   4.566   4.447   1.472]
 [  9.926   0.302   1.444   7.64 ]
 [  0.912   5.126   3.681   0.96 ]
 [  1.92    4.893   3.625   1.508]
 [  1.621   5.348   4.19    1.373]
 [  9.765   1.166   1.95    7.319]
 [  0.592   1.612   2.369   4.295]
 [  0.587   2.06    2.634   3.073]
 [  0.784   3.238   3.197   4.753]
 [  0.74    1.644   3.028   3.965]
 [ 10.597   1.569   2.334   8.535]
 [  1.112   2.592   3.101   3.911]
 [  1.872   1.559   2.914   3.758]
 [  0.638   5.422   3.827   0.295]
 [ 10.114   0.53    1.249   7.767]
 [  0.774   3.164   3.612   4.013]]

[[ 0.01120381  0.77119543  0.62414545  0.12479797]
 [ 0.74631094  0.05106537  0.20221887  0.63207581]
 [ 0.7613932   0.00272738  0.17909786  0.6230545 ]
 [ 0.06860459  0.32008939  0.5460925   0.77111554]
 [ 0.1098565   0.69378255  0.67570106  0.22366358]
 [ 0.78700628  0.02394478  0.11449094  0.60

### Fitting Model with Training Data

In [4]:
%%time
# TF-IDF scales each document's vector to have norm 1 and places a lower weight on very common words.
# The fitted transformer is kept in tf_idf_fit so it can be applied to other data later.
tf_idf_fit = sklearn.feature_extraction.text.TfidfTransformer()
# read in data from csv, fit the transformer on it and keep the weighted matrix as a dense array
noun_train_mat = tf_idf_fit.fit_transform(
    np.loadtxt("noun_train_mat.csv", delimiter = ",")
).toarray()

Wall time: 49.6 s


In [5]:
%%time
# compute NMF fit with 14 components
NMF_fit = sklearn.decomposition.NMF(n_components=14, init='nndsvda')
NMF_fit.fit(noun_train_mat)
H = NMF_fit.components_
W = NMF_fit.transform(noun_train_mat)
# clusters[i] is the index of the component with the largest weight for document i
clusters = list(np.argmax(W, axis=1))
# list of the documents in each cluster
cluster_lists = [[] for cluster in range(14)]
for doc, cluster in enumerate(clusters):
    cluster_lists[cluster].append(doc)

Wall time: 1min 52s




In [6]:
# classify each document into the category that fits it best
print 'Number of documents per category:', [sum([x==i for x in clusters]) for i in range(14)]

Number of documents per category: [100, 185, 74, 59, 68, 145, 44, 68, 88, 53, 35, 13, 26, 152]


### Key Words for Each Cluster

In [7]:
# load vocab from csv: every row is an (id, noun) pair that is read as strings
noun_vocab = [
    (int(word_id), noun)
    for word_id, noun in np.loadtxt("noun_vocab.csv", delimiter=",", dtype="str")
]
# lookup table from word id to noun
id2noun = {word_id: noun for word_id, noun in noun_vocab}

In [8]:
# find the 5 most important words for each category
num_best = 5
# for every row of H: ids of the num_best largest weights
# (partial sort on the negated weights, so the ids are not ordered among themselves)
best_indices = [list(bn.argpartsort(-weights, num_best)[:num_best]) for weights in H]
# translate the ids into nouns
best_words = []
for indices in best_indices:
    best_words.append([id2noun[index] for index in indices])
best_words

[['court', 'district', 'petitioner', 'appeal', 'habea'],
 ['act', 'respondent', 'action', 'court', 'title'],
 ['labor', 'union', 'board', 'employee', 'employer'],
 ['warrant', 'police', 'search', 'officer', 'petitioner'],
 ['property', 'tax', 'revenue', 'income', 'busines'],
 ['jury', 'sentence', 'defendant', 'trial', 'offense'],
 ['carrier', 'railroad', 'icc', 'rate', 'commerce'],
 ['attorney', 'alien', 'general', 'brief', 'cause'],
 ['commission', 'price', 'act', 'sale', 'company'],
 ['contract', 'arbitration', 'agreement', 'government', 'contractor'],
 ['plan', 'board', 'school', 'student', 'education'],
 ['patent', 'invention', 'art', 'claim', 'royalty'],
 ['master', 'decree', 'orig ', 'entry', 'boundary'],
 ['state', 'court', 'statute', 'law', 'amendment']]

In [9]:
# can probably ignore this stuff -- it's a different way to figure out the best words in each category, but it's not working
# and produces nonsense
width = len(M[0])
best_words = np.zeros(14)
for i in range(14):
    # arbitrary high initial value
    w = np.ones(width) * 5000
    #w_val = np.ones(width)
    for j in range(width):
        for k in range(14):
            if i <> k and H[k][j] > 0:
                if w[j] >= float(H[i][j])/H[k][j]:
                    w[j] = float(H[i][j])/H[k][j]
    best_words[i] = np.argmax(w)
print map(lambda x : id2noun[int(x)], best_words)

['narcotic', 'fdca', 'middleman', 'narcotic', 'city the', 'middleman', 'city the', 'middleman', 'fdca', 'narcotic', 'city the', 'fdca', 'city the', 'city the']


### Comparing with Supreme Court Database's Topic Areas

In [10]:
# read in SC Database's issue areas from csv
noun_train_issue_areas = np.loadtxt("noun_train_issue_areas.csv", delimiter = ",", dtype="int")
# zero-index the array
noun_train_issue_areas = noun_train_issue_areas - 1
print 'Number of documents per category:', [sum([x==i for x in noun_train_issue_areas]) for i in range(14)]

noun_train_issue_areas_dummy = np.array(map(lambda area : np.eye(1,14,area)[0], noun_train_issue_areas))

Number of documents per category: [238, 188, 96, 36, 10, 8, 64, 242, 131, 56, 10, 30, 1, 0]


In [13]:
compare_mat = map(lambda r : map(int, r), np.zeros((14,14)))
for i,j in zip(clusters, noun_train_issue_areas):
    compare_mat[i][j] += 1
compare_mat = map(lambda row : map(float,row) / sum(row), np.array(compare_mat))

assignments = -1 * np.ones(14)
compare_mat_flat = [x for lst in compare_mat for x in lst]
while min(assignments) == -1:
    max_ind = np.argmax(compare_mat_flat)
    row = max_ind / 14
    if assignments[row] == -1:
        column = max_ind - 14 * row
        assignments[row] = column
    else:
        compare_mat_flat[max_ind] = -1
#5 / 2
assignments = map(int, assignments)
print assignments
print map(np.argmax, compare_mat)
print map(np.argmax, np.array(compare_mat).T)

compare_mat

[8, 7, 6, 0, 7, 0, 7, 1, 7, 7, 1, 7, 10, 2]
[8, 7, 6, 0, 7, 0, 7, 1, 7, 7, 1, 7, 10, 2]
[3, 10, 13, 13, 13, 7, 2, 11, 0, 12, 12, 4, 4, 0]


[array([ 0.27,  0.17,  0.07,  0.03,  0.  ,  0.  ,  0.01,  0.06,  0.35,
         0.03,  0.  ,  0.01,  0.  ,  0.  ]),
 array([ 0.03783784,  0.27567568,  0.05945946,  0.04324324,  0.01621622,
         0.        ,  0.01621622,  0.36216216,  0.15675676,  0.02702703,
         0.        ,  0.00540541,  0.        ,  0.        ]),
 array([ 0.01351351,  0.01351351,  0.10810811,  0.01351351,  0.01351351,
         0.        ,  0.60810811,  0.04054054,  0.08108108,  0.10810811,
         0.        ,  0.        ,  0.        ,  0.        ]),
 array([ 0.89830508,  0.05084746,  0.        ,  0.01694915,  0.01694915,
         0.        ,  0.        ,  0.        ,  0.01694915,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ]),
 array([ 0.13235294,  0.04411765,  0.        ,  0.        ,  0.        ,
         0.01470588,  0.        ,  0.38235294,  0.04411765,  0.02941176,
         0.        ,  0.33823529,  0.01470588,  0.        ]),
 array([ 0.73103448,  0.12413793,  0.03448276,  0.03

In [12]:
### NOTE: probably shouldn't run this, since it has very low accuracy
# use Hungarian algorithm (computing assignments with cost minimization) to produce optimal assignments. 
import munkres
m = munkres.Munkres()
assignments = dict(m.compute(-np.array(compare_mat)))
print assignments

{0: 8, 1: 4, 2: 6, 3: 0, 4: 11, 5: 13, 6: 7, 7: 5, 8: 12, 9: 3, 10: 1, 11: 9, 12: 10, 13: 2}


In [14]:
new_clusters = map(lambda cluster : assignments[cluster], clusters)
correct = map(lambda (c1,c2) : c1==c2, zip(new_clusters, noun_train_issue_areas))
print float(sum(correct)) / len(correct)

0.490990990991


### Applying Model to Test Data

In [15]:
%%time
# read in test data from csv and weight it with the TF-IDF transformer held in tf_idf_fit
# (scales each document's vector to have norm 1 and places a lower weight on very common words)
noun_test_mat = tf_idf_fit.transform(
    np.loadtxt("noun_test_mat.csv", delimiter = ",")
).toarray()

Wall time: 17.8 s


In [16]:
# use NMF fit from training data to cluster test observations
W_test = NMF_fit.transform(noun_test_mat)
# clusters_test[i] is the index of the component with the largest weight for test document i
clusters_test = list(np.argmax(W_test, axis=1))
# list of the test documents in each cluster
cluster_lists_test = [[] for cluster in range(14)]
for doc, cluster in enumerate(clusters_test):
    cluster_lists_test[cluster].append(doc)

In [17]:
# show, for each of the 14 clusters, the indices of the test documents assigned to it
print cluster_lists_test

[[21, 32, 40, 41, 47, 68, 76, 88, 91, 97, 98, 100, 101, 102, 107, 110, 130, 175, 181, 205, 221, 224, 225, 226, 227, 254, 267, 270, 271, 273, 288, 289, 312, 321, 324, 327, 335, 340, 364, 369, 381, 383, 388, 391, 404, 423, 446, 457, 459], [3, 10, 13, 24, 30, 43, 44, 48, 50, 75, 77, 80, 92, 104, 108, 112, 116, 135, 136, 139, 145, 152, 154, 165, 169, 179, 199, 210, 233, 239, 242, 245, 246, 247, 264, 266, 275, 276, 279, 295, 301, 305, 314, 315, 318, 320, 334, 337, 338, 343, 344, 345, 360, 371, 372, 374, 378, 386, 387, 390, 394, 401, 403, 408, 409, 428, 431, 451, 458, 465, 470, 471, 473, 480, 481, 482, 484], [12, 19, 26, 59, 62, 64, 103, 120, 144, 149, 180, 182, 193, 194, 240, 265, 272, 291, 304, 307, 309, 313, 319, 380, 389, 454, 460, 461, 467, 476], [22, 36, 54, 122, 132, 160, 162, 219, 228, 241, 244, 310, 347, 356, 368, 405, 407, 412, 424, 432, 453, 472], [6, 11, 23, 25, 33, 35, 38, 58, 83, 114, 115, 118, 119, 126, 129, 147, 148, 150, 166, 191, 197, 209, 211, 232, 252, 253, 269, 285, 290,

### Compare Test Data Results to SC Database Categories

In [18]:
# read in SC Database's issue areas from csv
noun_test_issue_areas = np.loadtxt("noun_test_issue_areas.csv", delimiter = ",", dtype="int")
# zero-index the array
noun_test_issue_areas = noun_test_issue_areas - 1
print 'Number of documents per category:', [sum([x==i for x in noun_test_issue_areas]) for i in range(14)]

Number of documents per category: [104, 95, 26, 22, 3, 7, 18, 96, 63, 25, 5, 17, 4, 0]


In [19]:
new_clusters_test = map(lambda cluster : assignments[cluster], clusters_test)
correct_test = map(lambda (c1,c2) : c1==c2, zip(new_clusters_test, noun_test_issue_areas))
print float(sum(correct_test)) / len(correct_test)

0.455670103093
