In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
%matplotlib inline

In [2]:
wiki=pd.read_csv('people_wiki.csv')

In [3]:
def load_sparse_csr(filename):
    loader=np.load(filename)
    data=loader['data']
    indices=loader['indices']
    indptr=loader['indptr']
    shape=loader['shape']
    return csr_matrix( (data, indices, indptr), shape)

tf_idf = load_sparse_csr('people_wiki_tf_idf.npz')

In [12]:
map_index=pd.read_json('people_wiki_map_index_to_word.json',typ='series')

In [14]:
tf_idf=normalize(tf_idf)

In [20]:
def bipartition(cluster, maxiter=400, num_runs=4, seed=None):
    '''cluster: should be a dictionary containing the following keys
                * dataframe: original dataframe
                * matrix:    same data, in matrix format
                * centroid:  centroid for this particular cluster'''
    
    data_matrix = cluster['matrix']
    dataframe   = cluster['dataframe']
    
    # Run k-means on the data matrix with k=2. We use scikit-learn here to simplify workflow.
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs, random_state=seed, n_jobs=1)    
    kmeans_model.fit(data_matrix)
    centroids, cluster_assignment = kmeans_model.cluster_centers_, kmeans_model.labels_
    
    # Divide the data matrix into two parts using the cluster assignments.
    data_matrix_left_child, data_matrix_right_child = data_matrix[cluster_assignment==0], \
                                                      data_matrix[cluster_assignment==1]
    
    # Divide the dataframe into two parts, again using the cluster assignments.
    cluster_assignment_sa = np.array(cluster_assignment) # minor format conversion
    dataframe_left_child, dataframe_right_child     = dataframe[cluster_assignment_sa==0], \
                                                      dataframe[cluster_assignment_sa==1]
        
    
    # Package relevant variables for the child clusters
    cluster_left_child  = {'matrix': data_matrix_left_child,
                           'dataframe': dataframe_left_child,
                           'centroid': centroids[0]}
    cluster_right_child = {'matrix': data_matrix_right_child,
                           'dataframe': dataframe_right_child,
                           'centroid': centroids[1]}
    
    return (cluster_left_child, cluster_right_child)

In [21]:
wiki_data = {'matrix': tf_idf, 'dataframe': wiki} # no 'centroid' for the root cluster
left_child, right_child = bipartition(wiki_data, maxiter=100, num_runs=6, seed=1)
print (left_child)
print (right_child)

{'matrix': <11510x547979 sparse matrix of type '<class 'numpy.float64'>'
	with 1885831 stored elements in Compressed Sparse Row format>, 'dataframe':                                                      URI  \
0            <http://dbpedia.org/resource/Digby_Morrell>   
17     <http://dbpedia.org/resource/Paddy_Dunne_(Gael...   
21           <http://dbpedia.org/resource/Ceiron_Thomas>   
22            <http://dbpedia.org/resource/Adel_Sellimi>   
25             <http://dbpedia.org/resource/Vic_Stasiuk>   
28            <http://dbpedia.org/resource/Leon_Hapgood>   
30               <http://dbpedia.org/resource/Dom_Flora>   
33               <http://dbpedia.org/resource/Bob_Reece>   
41     <http://dbpedia.org/resource/Bob_Adams_(Americ...   
48              <http://dbpedia.org/resource/Marc_Logan>   
49          <http://dbpedia.org/resource/Corey_Woolfolk>   
63              <http://dbpedia.org/resource/Alan_Roper>   
75      <http://dbpedia.org/resource/Vladimir_Yurchenko>   
78        

In [36]:
def display_single_tf_idf_cluster(cluster, map_index_to_word):
    '''map_index_to_word: SFrame specifying the mapping betweeen words and column indices'''
    
    wiki_subset   = cluster['dataframe']
    tf_idf_subset = cluster['matrix']
    centroid      = cluster['centroid']
    
    # Print top 5 words with largest TF-IDF weights in the cluster
    idx = centroid.argsort()[::-1]
    for i in range(5):
        print('{0:s}:{1:.3f}'.format(map_index.index[idx[i]], centroid[idx[i]])),
    print('')
    
    # Compute distances from the centroid to all data points in the cluster.
    distances = pairwise_distances(tf_idf_subset, [centroid], metric='euclidean').flatten()
    # compute nearest neighbors of the centroid within the cluster.
    nearest_neighbors = distances.argsort()
    # For 8 nearest neighbors, print the title as well as first 180 characters of text.
    # Wrap the text at 80-character mark.
    for i in range(8):
        text = ' '.join(wiki_subset.iloc[nearest_neighbors[i]]['text'].split(None, 25)[0:25])
        print('* {0:50s} {1:.5f}\n  {2:s}\n  {3:s}'.format(wiki_subset.iloc[nearest_neighbors[i]]['name'],
              distances[nearest_neighbors[i]], text[:90], text[90:180] if len(text) > 90 else ''))
    print('')

In [37]:
display_single_tf_idf_cluster(left_child, map_index)
display_single_tf_idf_cluster(right_child, map_index)

19771992according:0.040
sibinki:0.036
gonino:0.029
anchoragearea:0.029
ngandu:0.028

* Todd Williams                                      0.95468
  todd michael williams born february 13 1971 in syracuse new york is a former major league 
  baseball relief pitcher he attended east syracuseminoa high school
* Gord Sherven                                       0.95622
  gordon r sherven born august 21 1963 in gravelbourg saskatchewan and raised in mankota sas
  katchewan is a retired canadian professional ice hockey forward who played
* Justin Knoedler                                    0.95639
  justin joseph knoedler born july 17 1980 in springfield illinois is a former major league 
  baseball catcherknoedler was originally drafted by the st louis cardinals
* Chris Day                                          0.95648
  christopher nicholas chris day born 28 july 1975 is an english professional footballer who
   plays as a goalkeeper for stevenageday started his career at tottenham
* T

In [38]:
athletes = left_child
non_athletes = right_child
left_child_athletes, right_child_athletes = bipartition(athletes, maxiter=100, num_runs=6, seed=1)

In [39]:
display_single_tf_idf_cluster(left_child_athletes, map_index)

qc:0.111
19771992according:0.103
aulas:0.051
guitarscordray:0.046
sibinki:0.045

* Steve Springer                                     0.89344
  steven michael springer born february 11 1961 is an american former professional baseball 
  player who appeared in major league baseball as a third baseman and
* Dave Ford                                          0.89598
  david alan ford born december 29 1956 is a former major league baseball pitcher for the ba
  ltimore orioles born in cleveland ohio ford attended lincolnwest
* Todd Williams                                      0.89823
  todd michael williams born february 13 1971 in syracuse new york is a former major league 
  baseball relief pitcher he attended east syracuseminoa high school
* Justin Knoedler                                    0.90097
  justin joseph knoedler born july 17 1980 in springfield illinois is a former major league 
  baseball catcherknoedler was originally drafted by the st louis cardinals
* Kevin Nicholson (ba

In [40]:
display_single_tf_idf_cluster(right_child_athletes, map_index)

sibinki:0.034
anchoragearea:0.033
gonino:0.031
19771992according:0.029
ngandu:0.027

* Gord Sherven                                       0.95562
  gordon r sherven born august 21 1963 in gravelbourg saskatchewan and raised in mankota sas
  katchewan is a retired canadian professional ice hockey forward who played
* Ashley Prescott                                    0.95656
  ashley prescott born 11 september 1972 is a former australian rules footballer he played w
  ith the richmond and fremantle football clubs in the afl between
* Chris Day                                          0.95656
  christopher nicholas chris day born 28 july 1975 is an english professional footballer who
   plays as a goalkeeper for stevenageday started his career at tottenham
* Jason Roberts (footballer)                         0.95658
  jason andre davis roberts mbe born 25 january 1978 is a former professional footballer and
   now a football punditborn in park royal london roberts was
* Todd Curley      

In [41]:
baseball            = left_child_athletes
ice_hockey_football = right_child_athletes

In [43]:
left_child_ihs, right_child_ihs = bipartition(ice_hockey_football, maxiter=100, num_runs=6, seed=1)
display_single_tf_idf_cluster(left_child_ihs, map_index)
display_single_tf_idf_cluster(right_child_ihs, map_index)

anchoragearea:0.048
sibinki:0.043
19771992according:0.041
ngandu:0.036
ssls:0.034

* Todd Curley                                        0.94578
  todd curley born 14 january 1973 is a former australian rules footballer who played for co
  llingwood and the western bulldogs in the australian football league
* Tony Smith (footballer, born 1957)                 0.94606
  anthony tony smith born 20 february 1957 is a former footballer who played as a central de
  fender in the football league in the 1970s and
* Chris Day                                          0.94623
  christopher nicholas chris day born 28 july 1975 is an english professional footballer who
   plays as a goalkeeper for stevenageday started his career at tottenham
* Ashley Prescott                                    0.94632
  ashley prescott born 11 september 1972 is a former australian rules footballer he played w
  ith the richmond and fremantle football clubs in the afl between
* Jason Roberts (footballer)            

In [44]:
left_child_non_athletes, right_child_non_athletes = bipartition(non_athletes, maxiter=100, num_runs=6, seed=1)

display_single_tf_idf_cluster(left_child_non_athletes, map_index)
display_single_tf_idf_cluster(right_child_non_athletes, map_index)

gan:0.013
33story:0.012
efovi:0.011
allmvfc:0.010
ipfw:0.009

* Wilson McLean                                      0.97870
  wilson mclean born 1937 is a scottish illustrator and artist he has illustrated primarily 
  in the field of advertising but has also provided cover art
* Julian Knowles                                     0.97938
  julian knowles is an australian composer and performer specialising in new and emerging te
  chnologies his creative work spans the fields of composition for theatre dance
* James A. Joseph                                    0.98042
  james a joseph born 1935 is an american former diplomatjoseph is professor of the practice
   of public policy studies at duke university and founder of
* Barry Sullivan (lawyer)                            0.98054
  barry sullivan is a chicago lawyer and as of july 1 2009 the cooney conway chair in advoca
  cy at loyola university chicago school of law
* Archie Brown                                       0.98081
  archib

In [45]:
male_non_athletes = left_child_non_athletes
female_non_athletes = right_child_non_athletes