In [1]:
import csv
import numpy as np
from scipy import sparse
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer

## Read vocabulary file

In [2]:
def read_vocab(vocab_file_name):
    return [w.strip() for w in open(vocab_file_name)]

In [3]:
vocab = read_vocab('vocab.kos.txt')

## Read docword.txt into a document x word matrix

In [4]:
def read_docword(file_name):

    file_handle = open(file_name)
    reader = csv.reader(file_handle, delimiter=' ')
    D = int(next(reader)[0])
    W = int(next(reader)[0])
    N = int(next(reader)[0])

    #create DxW numpy matrix
    m = np.empty(shape=[D,W], dtype='int8')
    #instead of creating a sparse matrix and then fill it up, create a numpy matrix
    #and then later convert it to csr -> SparseEfficiencyWarning
    #m = sparse.csr_matrix( (D,W), dtype='int8')

    for row in reader:
        D_i = int(row[0])-1
        W_i = int(row[1])-1
        count = int(row[2])
        m[D_i, W_i] = count

    m = sparse.csr_matrix(m)

    return D,W,N,m

In [5]:
D,W,N,docword = read_docword('docword.kos.txt')

##  TF-IDF: term frequency inverse document frequency
### It's a more reliable metric than plain frequency because it normalizes frequency across documents.  Very common (and semantically meaningless) words like articles ('the', 'a', 'an' ...), prepositions, etc... are in this way given less weight and filtered out 


In [6]:
tfidf_transformer = TfidfTransformer()
docword_tfidf = tfidf_transformer.fit_transform(docword)

## Initialize K-Means object

In [7]:
k = 20
km = KMeans(algorithm='auto',
            copy_x=True,
            init='k-means++',
            max_iter=300,
            n_clusters=k,
            n_init=10,
            n_jobs=1,
            precompute_distances='auto',
            random_state=None,
            tol=0.0001,
            verbose=0)

## Compute KMeans on the TFIDF matrix

In [8]:
%time km.fit(docword_tfidf)
clusters = km.labels_.tolist()

CPU times: user 27.9 s, sys: 0 ns, total: 27.9 s
Wall time: 27.9 s


### Sort cluster centers by proximity to centroid

In [9]:
k_centers = km.cluster_centers_ #Coordinates of cluster centers
order_centroids = k_centers.argsort()[:, ::-1] #argsort returns the indices that would sort an array

### Print Cluster index and n words (closest to centroid) associated

In [11]:
for c in range(k):
    print "Cluster %i: " % c + \
            ','.join([vocab[i] for i in [ix for ix in order_centroids[c, :5]]])

Cluster 0: november,account,electoral,turnout,governor
Cluster 1: edwards,dean,clark,kerry,primary
Cluster 2: bush,kerry,campaign,general,media
Cluster 3: november,voting,vote,account,electoral
Cluster 4: herseth,district,diedrich,house,seat
Cluster 5: bush,tax,court,administration,government
Cluster 6: kerry,bush,poll,voters,general
Cluster 7: bush,administration,iraq,president,cheney
Cluster 8: november,voting,account,electoral,governor
Cluster 9: percent,voters,bush,states,state
Cluster 10: million,raised,money,campaign,bush
Cluster 11: carson,coburn,knowles,oklahoma,murkowski
Cluster 12: senate,elections,races,house,race
Cluster 13: delay,donors,ethics,morrison,house
Cluster 14: dean,deans,unions,campaign,democratic
Cluster 15: clark,dean,lieberman,gephardt,poll
Cluster 16: party,nader,ballot,democratic,republican
Cluster 17: iowa,dean,gephardt,primary,edwards
Cluster 18: iraq,war,iraqi,military,soldiers
Cluster 19: marriage,gay,amendment,musgrave,samesex
