In [1]:
import pandas as pd                                  # see below for install instruction
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans                # we'll be using scikit-learn's KMeans for this assignment
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
%matplotlib inline

In [3]:
# Load the people_wiki.csv dataset into a DataFrame; later cells read its
# 'name' and 'text' columns when printing cluster summaries.
wiki = pd.read_csv('people_wiki.csv')

In [4]:
def load_sparse_csr(filename):
    '''Load a CSR sparse matrix stored as a NumPy .npz archive.

    filename: path to an archive holding the arrays 'data', 'indices',
              'indptr' and 'shape'.

    Returns a scipy.sparse.csr_matrix rebuilt from those four arrays.
    '''
    # np.load on an .npz archive keeps the underlying file open until the
    # returned object is closed, so read everything inside a `with` block
    # to release the file handle deterministically.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # The shape is stored as an array; hand csr_matrix a plain tuple of ints.
        shape = tuple(int(dim) for dim in loader['shape'])

    return csr_matrix((data, indices, indptr), shape=shape)

# Load the pre-computed TF-IDF matrix (stored in CSR form) from its .npz archive.
tf_idf = load_sparse_csr('people_wiki_tf_idf.npz')
# Load the vocabulary mapping as a pandas Series.
# NOTE(review): despite its name, this Series appears to map word -> column index
# (it is inverted before words are looked up by index) — confirm against the JSON file.
map_index_to_word = pd.read_json('people_wiki_map_index_to_word.json', typ='series', orient='records')

In [61]:
# Invert map_index_to_word (swap keys and values) so that an entry can be
# looked up by what was originally its value.
inv_map = {v: k for k, v in map_index_to_word.items()}

In [5]:
# Rescale the TF-IDF matrix with scikit-learn's normalize using its default
# arguments (L2 norm, applied per row).
# Note: this rebinds tf_idf, so the un-normalized matrix is no longer
# reachable under this name afterwards.
tf_idf = normalize(tf_idf)

In [47]:
def bipartition(cluster, maxiter=400, num_runs=4, seed=None):
    '''Split a cluster into two child clusters using k-means with k=2.

    cluster:  should be a dictionary containing the following keys
                * dataframe: original dataframe
                * matrix:    same data, in matrix format
                * centroid:  centroid for this particular cluster
              (only 'matrix' and 'dataframe' are read here, so the root
               cluster may omit 'centroid')
    maxiter:  forwarded to KMeans as max_iter
    num_runs: forwarded to KMeans as n_init
    seed:     forwarded to KMeans as random_state

    Returns a tuple (cluster_left_child, cluster_right_child). Each child is a
    dictionary with the keys 'matrix', 'dataframe' and 'centroid', holding the
    rows that k-means assigned to label 0 (left) and label 1 (right).
    '''

    data_matrix = cluster['matrix']
    dataframe   = cluster['dataframe']

    # Run k-means on the data matrix with k=2. We use scikit-learn here to simplify workflow.
    # n_jobs is deliberately not passed: it was deprecated in scikit-learn 0.23
    # and removed in 1.0, where supplying it raises a TypeError.
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs, random_state=seed)
    kmeans_model.fit(data_matrix)
    centroids, cluster_assignment = kmeans_model.cluster_centers_, kmeans_model.labels_

    # Boolean row masks, one per k-means label; the same mask is applied to the
    # matrix and to the dataframe so both stay aligned row-for-row.
    in_left  = cluster_assignment == 0
    in_right = cluster_assignment == 1

    # Package relevant variables for the child clusters
    cluster_left_child  = {'matrix': data_matrix[in_left],
                           'dataframe': dataframe[in_left],
                           'centroid': centroids[0]}
    cluster_right_child = {'matrix': data_matrix[in_right],
                           'dataframe': dataframe[in_right],
                           'centroid': centroids[1]}
    return (cluster_left_child, cluster_right_child)

In [48]:
# Root cluster: the whole TF-IDF matrix paired with the whole dataframe.
wiki_data = {'matrix': tf_idf, 'dataframe': wiki} # no 'centroid' for the root cluster
# Split the root into two children; seed is passed through to KMeans' random_state.
left_child, right_child = bipartition(wiki_data, maxiter=100, num_runs=6, seed=1)

In [62]:
def display_single_tf_idf_cluster(cluster, map_index_to_word, num_words=5, num_neighbors=8):
    '''Print a summary of one cluster: top centroid words and nearest articles.

    cluster:           dictionary with the keys
                         * dataframe: rows of this cluster ('name' and 'text' columns are read)
                         * matrix:    TF-IDF rows of this cluster
                         * centroid:  1-D array of centroid weights
    map_index_to_word: mapping (dict or pandas Series) whose .items() yields
                       (word, column index) pairs
    num_words:         how many of the heaviest centroid words to print
    num_neighbors:     how many articles closest to the centroid to print

    If the centroid has fewer than num_words entries, or the cluster has fewer
    than num_neighbors rows, everything available is printed instead of
    raising an IndexError.
    '''

    wiki_subset   = cluster['dataframe']
    tf_idf_subset = cluster['matrix']
    centroid      = cluster['centroid']

    # Build the column-index -> word lookup from the argument itself, so the
    # function no longer depends on a notebook-level global being defined.
    index_to_word = {index: word for word, index in map_index_to_word.items()}

    # Print the words with the largest TF-IDF weights in the centroid, one per line.
    idx = centroid.argsort()[::-1]
    for word_index in idx[:num_words]:
        print('{0:s}:{1:.3f}'.format(index_to_word[word_index], centroid[word_index]))
    print('')

    # Compute distances from the centroid to all data points in the cluster.
    distances = pairwise_distances(tf_idf_subset, [centroid], metric='euclidean').flatten()
    # Row positions ordered from closest to farthest from the centroid.
    nearest_neighbors = distances.argsort()
    # For the nearest neighbors, print the title, the distance, and the first
    # 25 words of the text (capped at 180 characters, wrapped at the 90-character mark).
    for row in nearest_neighbors[:num_neighbors]:
        record = wiki_subset.iloc[row]
        text = ' '.join(record['text'].split(None, 25)[0:25])
        # text[90:180] is already '' when the text is 90 characters or shorter.
        print('* {0:50s} {1:.5f}\n  {2:s}\n  {3:s}'.format(record['name'],
              distances[row], text[:90], text[90:180]))
    print('')

In [63]:
# Print the top centroid words and nearest articles for each child of the root split.
display_single_tf_idf_cluster(left_child, map_index_to_word)
display_single_tf_idf_cluster(right_child, map_index_to_word)

league:0.040
season:0.036
team:0.029
football:0.029
played:0.028

* Todd Williams                                      0.95468
  todd michael williams born february 13 1971 in syracuse new york is a former major league 
  baseball relief pitcher he attended east syracuseminoa high school
* Gord Sherven                                       0.95622
  gordon r sherven born august 21 1963 in gravelbourg saskatchewan and raised in mankota sas
  katchewan is a retired canadian professional ice hockey forward who played
* Justin Knoedler                                    0.95639
  justin joseph knoedler born july 17 1980 in springfield illinois is a former major league 
  baseball catcherknoedler was originally drafted by the st louis cardinals
* Chris Day                                          0.95648
  christopher nicholas chris day born 28 july 1975 is an english professional footballer who
   plays as a goalkeeper for stevenageday started his career at tottenham
* Tony Smith (football

In [51]:
# Label the two root children based on their printed summaries
# (the left child's top words are league/season/team/football/played).
athletes = left_child
non_athletes = right_child

In [52]:
# Bipartition the cluster of athletes, using the same k-means settings and
# seed as the root split.
left_child_athletes, right_child_athletes = bipartition(athletes, maxiter=100, num_runs=6, seed=1)

In [64]:
# Summarize the left child of the athletes split.
display_single_tf_idf_cluster(left_child_athletes, map_index_to_word)

league:0.054
season:0.043
football:0.038
played:0.035
coach:0.030

* Tony Smith (footballer, born 1957)                 0.94677
  anthony tony smith born 20 february 1957 is a former footballer who played as a central de
  fender in the football league in the 1970s and
* Justin Knoedler                                    0.94746
  justin joseph knoedler born july 17 1980 in springfield illinois is a former major league 
  baseball catcherknoedler was originally drafted by the st louis cardinals
* Chris Day                                          0.94849
  christopher nicholas chris day born 28 july 1975 is an english professional footballer who
   plays as a goalkeeper for stevenageday started his career at tottenham
* Todd Williams                                      0.94882
  todd michael williams born february 13 1971 in syracuse new york is a former major league 
  baseball relief pitcher he attended east syracuseminoa high school
* Todd Curley                                    

In [65]:
# Summarize the right child of the athletes split.
display_single_tf_idf_cluster(right_child_athletes, map_index_to_word)

championships:0.045
tour:0.043
championship:0.035
world:0.031
won:0.031

* Alessandra Aguilar                                 0.93880
  alessandra aguilar born 1 july 1978 in lugo is a spanish longdistance runner who specialis
  es in marathon running she represented her country in the event
* Heather Samuel                                     0.93999
  heather barbara samuel born 6 july 1970 is a retired sprinter from antigua and barbuda who
   specialized in the 100 and 200 metres in 1990
* Viola Kibiwot                                      0.94037
  viola jelagat kibiwot born december 22 1983 in keiyo district is a runner from kenya who s
  pecialises in the 1500 metres kibiwot won her first
* Ayelech Worku                                      0.94052
  ayelech worku born june 12 1979 is an ethiopian longdistance runner most known for winning
   two world championships bronze medals on the 5000 metres she
* Krisztina Papp                                     0.94105
  krisztina papp 

In [66]:
# NOTE(review): this repeats the call made in the preceding cell on
# right_child_athletes and prints the same summary — likely redundant.
display_single_tf_idf_cluster(right_child_athletes, map_index_to_word)

championships:0.045
tour:0.043
championship:0.035
world:0.031
won:0.031

* Alessandra Aguilar                                 0.93880
  alessandra aguilar born 1 july 1978 in lugo is a spanish longdistance runner who specialis
  es in marathon running she represented her country in the event
* Heather Samuel                                     0.93999
  heather barbara samuel born 6 july 1970 is a retired sprinter from antigua and barbuda who
   specialized in the 100 and 200 metres in 1990
* Viola Kibiwot                                      0.94037
  viola jelagat kibiwot born december 22 1983 in keiyo district is a runner from kenya who s
  pecialises in the 1500 metres kibiwot won her first
* Ayelech Worku                                      0.94052
  ayelech worku born june 12 1979 is an ethiopian longdistance runner most known for winning
   two world championships bronze medals on the 5000 metres she
* Krisztina Papp                                     0.94105
  krisztina papp 

In [56]:
# NOTE(review): these names do not match the cluster summaries printed earlier:
# left_child_athletes has top words league/season/football/played/coach, while
# right_child_athletes has championships/tour/championship/world/won with
# runners as its nearest articles. Consider renaming both aliases (and every
# later cell that uses them) to reflect the actual content.
baseball            = left_child_athletes
ice_hockey_football = right_child_athletes

In [67]:
# Split the cluster bound to ice_hockey_football (an alias of
# right_child_athletes) once more, then summarize both halves.
left_child_ihs, right_child_ihs = bipartition(ice_hockey_football, maxiter=100, num_runs=6, seed=1)
display_single_tf_idf_cluster(left_child_ihs, map_index_to_word)
display_single_tf_idf_cluster(right_child_ihs, map_index_to_word)

championships:0.064
world:0.039
she:0.038
metres:0.038
olympics:0.037

* Heather Samuel                                     0.91590
  heather barbara samuel born 6 july 1970 is a retired sprinter from antigua and barbuda who
   specialized in the 100 and 200 metres in 1990
* Krisztina Papp                                     0.91672
  krisztina papp born 17 december 1982 in eger is a hungarian long distance runner she is th
  e national indoor record holder over 5000 mpapp began
* Ayelech Worku                                      0.91892
  ayelech worku born june 12 1979 is an ethiopian longdistance runner most known for winning
   two world championships bronze medals on the 5000 metres she
* Viola Kibiwot                                      0.91906
  viola jelagat kibiwot born december 22 1983 in keiyo district is a runner from kenya who s
  pecialises in the 1500 metres kibiwot won her first
* Alessandra Aguilar                                 0.91955
  alessandra aguilar born 1 j

In [68]:
# Bipartition the cluster of non-athletes, using the same k-means settings
# and seed as the earlier splits.
left_child_non_athletes, right_child_non_athletes = bipartition(non_athletes, maxiter=100, num_runs=6, seed=1)

# Summarize both halves of the non-athletes split.
display_single_tf_idf_cluster(left_child_non_athletes, map_index_to_word)
display_single_tf_idf_cluster(right_child_non_athletes, map_index_to_word)

university:0.016
he:0.013
she:0.013
law:0.012
served:0.012

* Barry Sullivan (lawyer)                            0.97227
  barry sullivan is a chicago lawyer and as of july 1 2009 the cooney conway chair in advoca
  cy at loyola university chicago school of law
* Kayee Griffin                                      0.97444
  kayee frances griffin born 6 february 1950 is an australian politician and former australi
  an labor party member of the new south wales legislative council serving
* Christine Robertson                                0.97450
  christine mary robertson born 5 october 1948 is an australian politician and former austra
  lian labor party member of the new south wales legislative council serving
* James A. Joseph                                    0.97464
  james a joseph born 1935 is an american former diplomatjoseph is professor of the practice
   of public policy studies at duke university and founder of
* David Anderson (British Columbia politician)       0.97492
 

In [70]:
# NOTE(review): the 'male'/'female' labels are not clearly supported by the
# printed summary — left_child_non_athletes lists both 'he' and 'she' among
# its top centroid words. Confirm the split really follows gender, or rename.
male_non_athletes = left_child_non_athletes
female_non_athletes = right_child_non_athletes

In [72]:
# Split the cluster bound to male_non_athletes (an alias of
# left_child_non_athletes) into two children.
left_child_male, right_child_male = bipartition(male_non_athletes, maxiter=100, num_runs=6, seed=1)

# Summarize both halves of that split.
display_single_tf_idf_cluster(left_child_male, map_index_to_word)
display_single_tf_idf_cluster(right_child_male, map_index_to_word)

party:0.039
election:0.036
minister:0.033
she:0.028
elected:0.026

* Kayee Griffin                                      0.95170
  kayee frances griffin born 6 february 1950 is an australian politician and former australi
  an labor party member of the new south wales legislative council serving
* Marcelle Mersereau                                 0.95417
  marcelle mersereau born february 14 1942 in pointeverte new brunswick is a canadian politi
  cian a civil servant for most of her career she also served
* Lucienne Robillard                                 0.95453
  lucienne robillard pc born june 16 1945 is a canadian politician and a member of the liber
  al party of canada she sat in the house
* Maureen Lyster                                     0.95590
  maureen anne lyster born 10 september 1943 is an australian politician she was an australi
  an labor party member of the victorian legislative assembly from 1985
* Liz Cunningham                                     0.95690
  eli

In [73]:
# Split the cluster bound to female_non_athletes (an alias of
# right_child_non_athletes) into two children.
left_child_female, right_child_female = bipartition(female_non_athletes, maxiter=100, num_runs=6, seed=1)

# Summarize both halves of that split.
display_single_tf_idf_cluster(left_child_female, map_index_to_word)
display_single_tf_idf_cluster(right_child_female, map_index_to_word)

music:0.027
film:0.023
album:0.017
band:0.016
art:0.015

* Julian Knowles                                     0.96904
  julian knowles is an australian composer and performer specialising in new and emerging te
  chnologies his creative work spans the fields of composition for theatre dance
* Peter Combe                                        0.97080
  peter combe born 20 october 1948 is an australian childrens entertainer and musicianmusica
  l genre childrens musiche has had 22 releases including seven gold albums two
* Craig Pruess                                       0.97121
  craig pruess born 1950 is an american composer musician arranger and gold platinum record 
  producer who has been living in britain since 1973 his career
* Ceiri Torjussen                                    0.97169
  ceiri torjussen born 1976 is a composer who has contributed music to dozens of film and te
  levision productions in the ushis music was described by
* Brenton Broadstock                       