# W6_Modeling Text Data with a Hierarchy of Clusters

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans                
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
%matplotlib inline

In [2]:
# Load the people dataset from CSV and preview the first rows. The preview below
# shows URI, name and text columns plus an 'Unnamed: 0' column
# (presumably the row index saved into the CSV — TODO confirm).
data = pd.read_csv('people_wiki.csv')
data.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [3]:
def load_sparse_csr(filename):
    '''
    Load a sparse matrix stored in CSR form inside a NumPy .npz archive.

    filename: path of an archive holding the arrays
              'data', 'indices', 'indptr' and 'shape'.

    Returns a scipy.sparse.csr_matrix built from those four arrays.
    '''
    # np.load keeps the archive open until it is closed; the context manager
    # releases the file handle once the arrays have been read into memory.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # 'shape' is stored as an array; hand scipy a plain tuple of ints.
        shape = tuple(int(dim) for dim in loader['shape'])
    return csr_matrix((data, indices, indptr), shape=shape)

# Load the precomputed TF-IDF matrix, then rescale it with sklearn's normalize
# using its default settings.
tf_idf_matrix = load_sparse_csr('people_wiki_tf_idf.npz')
tf_idf_matrix = normalize(tf_idf_matrix)
tf_idf_matrix # sparse matrix with 59071 data points, 547979 features

<59071x547979 sparse matrix of type '<class 'numpy.float64'>'
	with 10379283 stored elements in Compressed Sparse Row format>

In [4]:
# Build a lookup table from word index to word.
# The JSON is read as a Series; the steps below treat its index as the words and
# its values as the matching word indices.
ser = pd.read_json('people_wiki_map_index_to_word.json', typ='series')
map_index_to_word = pd.DataFrame(ser, columns=['word_index'])
# Copy the words out of the index into a regular 'word' column ...
map_index_to_word['word'] = map_index_to_word.index
# ... then re-index the table by word_index so .loc[i] looks a word up by its index.
map_index_to_word.index = map_index_to_word['word_index']
# word_index now lives in the index, so the duplicate column is dropped (in place).
map_index_to_word.drop('word_index', axis=1, inplace=True)
map_index_to_word.head()

Unnamed: 0_level_0,word
word_index,Unnamed: 1_level_1
540315,0
536260,0
535641,0
83348,0
81527,0


In [5]:
def bipartition(cluster, maxiter=400, num_runs=4, seed=None):
    '''
    Split one cluster into two children by running k-means with k=2.

    cluster:  dictionary containing the following keys
                * dataframe: original dataframe
                * matrix:    same data, in matrix format; the same row mask is
                             applied to both, so rows must line up with dataframe
              A 'centroid' entry may be present but is not read here (the root
              cluster has none).
    maxiter:  passed to KMeans as max_iter
    num_runs: passed to KMeans as n_init
    seed:     passed to KMeans as random_state

    Returns a tuple (cluster_left_child, cluster_right_child); each child is a
    dictionary with the keys 'matrix', 'dataframe' and 'centroid'.
    '''

    dataframe = cluster['dataframe']
    data_matrix = cluster['matrix']

    # run k-means on the data matrix with k=2
    # (n_jobs is deliberately not passed: it was deprecated in scikit-learn 0.23
    #  and removed in 1.0, where passing it raises a TypeError)
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs,
                          random_state=seed).fit(data_matrix)
    centroids, cluster_assignment = kmeans_model.cluster_centers_, kmeans_model.labels_

    # row masks for the two k-means labels; label 0 becomes the left child
    in_left = cluster_assignment == 0
    in_right = cluster_assignment == 1

    # divide the data matrix and the dataframe with the same masks so that the
    # rows of each child stay aligned
    cluster_left_child  = {'matrix': data_matrix[in_left],
                           'dataframe': dataframe[in_left],
                           'centroid': centroids[0]}
    cluster_right_child = {'matrix': data_matrix[in_right],
                           'dataframe': dataframe[in_right],
                           'centroid': centroids[1]}

    return (cluster_left_child, cluster_right_child)

In [6]:
# Root of the hierarchy: the whole corpus, split once into two child clusters.
wiki_data = {'matrix': tf_idf_matrix, 'dataframe': data} # no centroid for the root cluster
left_child, right_child = bipartition(wiki_data, maxiter=100, num_runs=6, seed=1)

In [7]:
def display_single_tf_idf_cluster(cluster, map_index_to_word):
    '''
    Print a short summary of one cluster.

    cluster:           dictionary with the keys
                         * dataframe: rows of this cluster, with 'name' and 'text' columns
                         * matrix:    same rows, in matrix format
                         * centroid:  centroid for this particular cluster
    map_index_to_word: dataframe indexed by word index with a 'word' column

    Prints the (up to) 5 words with the largest centroid weights, followed by the
    (up to) 5 rows closest to the centroid in Euclidean distance: name, distance
    and the first 180 characters of the first 25 words of the text.
    Returns None.
    '''

    data_subset   = cluster['dataframe']
    tf_idf_subset = cluster['matrix']
    centroid      = cluster['centroid']

    # print top 5 words with largest TF-IDF weights in the cluster
    idx = centroid.argsort()[::-1]
    print('==============================')
    print('TOP 5 WORDS IN THE CLUSTER')
    # slicing (rather than range(5)) keeps this safe when fewer than 5 entries exist
    for word_id in idx[:5]:
        print('{0:s}:{1:.3f}'.format(map_index_to_word.loc[word_id, 'word'], centroid[word_id]))
    print('')

    # print 5 nearest neighbors with the title and first 180 characters of text
    distances = pairwise_distances(tf_idf_subset, [centroid], metric='euclidean').flatten()
    nearest_neighbors = distances.argsort()
    print('===============================')
    print('TITLE AND TEXT FOR 5 NEAREST NEIGHBORS')
    for row in nearest_neighbors[:5]:
        article = data_subset.iloc[row]
        text = ' '.join(article['text'].split(None, 25)[0:25])
        print('**', article['name'])
        print(distances[row])
        # print the excerpt as one piece: printing text[:90] and text[90:180] as
        # two print() arguments inserted a space at character 90, splitting words
        print(text[:180])
    print('')

In [8]:
# Summarize the left child of the root split: top centroid words and nearest articles.
display_single_tf_idf_cluster(left_child, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
league:0.040
season:0.036
team:0.029
football:0.029
played:0.028

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Todd Williams
0.954683324219
todd michael williams born february 13 1971 in syracuse new york is a former major league  baseball relief pitcher he attended east syracuseminoa high school
** Gord Sherven
0.956223740771
gordon r sherven born august 21 1963 in gravelbourg saskatchewan and raised in mankota sas katchewan is a retired canadian professional ice hockey forward who played
** Justin Knoedler
0.956390306216
justin joseph knoedler born july 17 1980 in springfield illinois is a former major league  baseball catcherknoedler was originally drafted by the st louis cardinals
** Chris Day
0.956475380544
christopher nicholas chris day born 28 july 1975 is an english professional footballer who  plays as a goalkeeper for stevenageday started his career at tottenham
** Tony Smith (footballer, born 1957)
0.956533909709
anthony tony smith born 20 february 19

In [9]:
# Summarize the right child of the root split: top centroid words and nearest articles.
display_single_tf_idf_cluster(right_child, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
she:0.025
her:0.017
music:0.012
he:0.011
university:0.011

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Anita Kunz
0.974007672741
anita e kunz oc born 1956 is a canadianborn artist and illustratorkunz has lived in london  new york and toronto contributing to magazines and working
** Janet Jackson
0.974716000044
janet damita jo jackson born may 16 1966 is an american singer songwriter and actress know n for a series of sonically innovative socially conscious and
** Madonna (entertainer)
0.974747958518
madonna louise ciccone tkoni born august 16 1958 is an american singer songwriter actress  and businesswoman she achieved popularity by pushing the boundaries of lyrical
** %C3%81ine Hyland
0.975358023829
ine hyland ne donlon is emeritus professor of education and former vicepresident of univer sity college cork ireland she was born in 1942 in athboy co
** Jane Fonda
0.976212312188
jane fonda born lady jayne seymour fonda december 21 1937 is an american actress writ

In [10]:
# Name the two root children after what their summaries above show: the left
# child's top words are sports terms (league, season, team, football, played).
athletes = left_child
non_athletes = right_child

In [11]:
# Split the athletes cluster one level further, with the same k-means settings as the root split.
left_child_athletes, right_child_athletes = bipartition(athletes, maxiter=100, num_runs=6, seed=1)

In [12]:
# Summarize the left child of the athletes split.
display_single_tf_idf_cluster(left_child_athletes, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
baseball:0.111
league:0.103
major:0.051
games:0.046
season:0.045

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Steve Springer
0.893441737353
steven michael springer born february 11 1961 is an american former professional baseball  player who appeared in major league baseball as a third baseman and
** Dave Ford
0.895977080271
david alan ford born december 29 1956 is a former major league baseball pitcher for the ba ltimore orioles born in cleveland ohio ford attended lincolnwest
** Todd Williams
0.898227954339
todd michael williams born february 13 1971 in syracuse new york is a former major league  baseball relief pitcher he attended east syracuseminoa high school
** Justin Knoedler
0.900967414566
justin joseph knoedler born july 17 1980 in springfield illinois is a former major league  baseball catcherknoedler was originally drafted by the st louis cardinals
** Kevin Nicholson (baseball)
0.906073480139
kevin ronald nicholson born march 29 1976 is a canadian ba

In [13]:
# Summarize the right child of the athletes split.
display_single_tf_idf_cluster(right_child_athletes, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
season:0.034
football:0.033
team:0.031
league:0.029
played:0.027

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Gord Sherven
0.955619813753
gordon r sherven born august 21 1963 in gravelbourg saskatchewan and raised in mankota sas katchewan is a retired canadian professional ice hockey forward who played
** Ashley Prescott
0.956555828493
ashley prescott born 11 september 1972 is a former australian rules footballer he played w ith the richmond and fremantle football clubs in the afl between
** Chris Day
0.956560487379
christopher nicholas chris day born 28 july 1975 is an english professional footballer who  plays as a goalkeeper for stevenageday started his career at tottenham
** Jason Roberts (footballer)
0.956577195994
jason andre davis roberts mbe born 25 january 1978 is a former professional footballer and  now a football punditborn in park royal london roberts was
** Todd Curley
0.957431080848
todd curley born 14 january 1973 is a former australian rules fo

In [14]:
# Label the athlete sub-clusters from the summaries above: the left child's top
# words are baseball terms; the right child's nearest articles mix ice hockey
# and football players.
baseball = left_child_athletes
ice_hockey_football = right_child_athletes

In [15]:
# Split the ice-hockey/football cluster once more and summarize both halves.
left_child_ihs, right_child_ihs = bipartition(ice_hockey_football, maxiter=100, num_runs=6, seed=1)
display_single_tf_idf_cluster(left_child_ihs, map_index_to_word)
display_single_tf_idf_cluster(right_child_ihs, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
football:0.048
season:0.043
league:0.041
played:0.036
coach:0.034

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Todd Curley
0.945779211229
todd curley born 14 january 1973 is a former australian rules footballer who played for co llingwood and the western bulldogs in the australian football league
** Tony Smith (footballer, born 1957)
0.946064950782
anthony tony smith born 20 february 1957 is a former footballer who played as a central de fender in the football league in the 1970s and
** Chris Day
0.946229532006
christopher nicholas chris day born 28 july 1975 is an english professional footballer who  plays as a goalkeeper for stevenageday started his career at tottenham
** Ashley Prescott
0.946320666883
ashley prescott born 11 september 1972 is a former australian rules footballer he played w ith the richmond and fremantle football clubs in the afl between
** Jason Roberts (footballer)
0.946332977023
jason andre davis roberts mbe born 25 january 1978 is a form

In [16]:
# Split the non-athletes cluster one level further, with the same k-means settings as the root split.
left_child_non_athletes, right_child_non_athletes = bipartition(non_athletes, maxiter=100, num_runs=6, seed=1)

In [17]:
# Summarize the left child of the non-athletes split.
display_single_tf_idf_cluster(left_child_non_athletes, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
he:0.013
music:0.012
university:0.011
film:0.010
his:0.009

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Wilson McLean
0.978700581621
wilson mclean born 1937 is a scottish illustrator and artist he has illustrated primarily  in the field of advertising but has also provided cover art
** Julian Knowles
0.979381857718
julian knowles is an australian composer and performer specialising in new and emerging te chnologies his creative work spans the fields of composition for theatre dance
** James A. Joseph
0.980418925525
james a joseph born 1935 is an american former diplomatjoseph is professor of the practice  of public policy studies at duke university and founder of
** Barry Sullivan (lawyer)
0.980536900664
barry sullivan is a chicago lawyer and as of july 1 2009 the cooney conway chair in advoca cy at loyola university chicago school of law
** Archie Brown
0.98080902171
archibald haworth brown cmg fba commonly known as archie brown born 10 may 1938 is a briti sh 

In [18]:
# Summarize the right child of the non-athletes split.
display_single_tf_idf_cluster(right_child_non_athletes, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
she:0.126
her:0.082
film:0.013
actress:0.012
music:0.012

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Janet Jackson
0.938081949011
janet damita jo jackson born may 16 1966 is an american singer songwriter and actress know n for a series of sonically innovative socially conscious and
** Lauren Royal
0.938666907546
lauren royal born march 3 circa 1965 is a book writer from california royal has written bo th historic and novelistic booksa selfproclaimed angels baseball fan
** Barbara Hershey
0.939414848732
barbara hershey born barbara lynn herzstein february 5 1948 once known as barbara seagull  is an american actress in a career spanning nearly 50 years
** Jane Fonda
0.941018061293
jane fonda born lady jayne seymour fonda december 21 1937 is an american actress writer po litical activist former fashion model and fitness guru she is
** Alexandra Potter
0.941899827845
alexandra potter born 1970 is a british author of romantic comediesborn in bradford yorksh ire eng

In [19]:
# Label the non-athlete sub-clusters from the summaries above: the right child's
# top words are 'she' and 'her', while the left child's include 'he' and 'his'.
male_non_athletes = left_child_non_athletes
female_non_athletes = right_child_non_athletes

In [20]:
# Split the male non-athletes cluster one level further.
left_child_male_non_athletes, right_child_male_non_athletes = bipartition(male_non_athletes, maxiter=100, num_runs=6, seed=1)

In [21]:
# Split the female non-athletes cluster one level further.
left_child_female_non_athletes, right_child_female_non_athletes = bipartition(female_non_athletes, maxiter=100, num_runs=6, seed=1)

In [22]:
# Summarize the left child of the male non-athletes split.
display_single_tf_idf_cluster(left_child_male_non_athletes, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
university:0.017
he:0.015
law:0.013
served:0.013
research:0.013

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Barry Sullivan (lawyer)
0.970752543673
barry sullivan is a chicago lawyer and as of july 1 2009 the cooney conway chair in advoca cy at loyola university chicago school of law
** James A. Joseph
0.973440408591
james a joseph born 1935 is an american former diplomatjoseph is professor of the practice  of public policy studies at duke university and founder of
** David Anderson (British Columbia politician)
0.973832432984
david a anderson pc oc born august 16 1937 in victoria british columbia is a former canadi an cabinet minister educated at victoria college in victoria
** Sven Erik Holmes
0.974690686243
sven erik holmes is a former federal judge and currently the vice chairman legal risk and  regulatory and chief legal officer for kpmg llp a
** Andrew Fois
0.975579551709
andrew fois is an attorney living and working in washington dc as of april 9 2012 he

In [23]:
# Summarize the right child of the male non-athletes split.
display_single_tf_idf_cluster(right_child_male_non_athletes, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
music:0.023
film:0.020
album:0.014
band:0.014
art:0.013

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Julian Knowles
0.971919901286
julian knowles is an australian composer and performer specialising in new and emerging te chnologies his creative work spans the fields of composition for theatre dance
** Peter Combe
0.972920007076
peter combe born 20 october 1948 is an australian childrens entertainer and musicianmusica l genre childrens musiche has had 22 releases including seven gold albums two
** Craig Pruess
0.973464007095
craig pruess born 1950 is an american composer musician arranger and gold platinum record  producer who has been living in britain since 1973 his career
** Ceiri Torjussen
0.974196112424
ceiri torjussen born 1976 is a composer who has contributed music to dozens of film and te levision productions in the ushis music was described by
** Wilson McLean
0.974552991101
wilson mclean born 1937 is a scottish illustrator and artist he has illustrat

In [24]:
# Summarize the left child of the female non-athletes split.
display_single_tf_idf_cluster(left_child_female_non_athletes, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
she:0.121
her:0.100
actress:0.031
film:0.030
music:0.028

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Janet Jackson
0.92374494211
janet damita jo jackson born may 16 1966 is an american singer songwriter and actress know n for a series of sonically innovative socially conscious and
** Barbara Hershey
0.925242387778
barbara hershey born barbara lynn herzstein february 5 1948 once known as barbara seagull  is an american actress in a career spanning nearly 50 years
** Madonna (entertainer)
0.927528972684
madonna louise ciccone tkoni born august 16 1958 is an american singer songwriter actress  and businesswoman she achieved popularity by pushing the boundaries of lyrical
** Cher
0.929092276278
cher r born cherilyn sarkisian may 20 1946 is an american singer actress and television ho st described as embodying female autonomy in a maledominated industry
** Candice Bergen
0.932657639481
candice patricia bergen born may 9 1946 is an american actress and former fashio

In [25]:
# Summarize the right child of the female non-athletes split.
display_single_tf_idf_cluster(right_child_female_non_athletes, map_index_to_word)

TOP 5 WORDS IN THE CLUSTER
she:0.130
her:0.072
women:0.014
miss:0.014
university:0.013

TITLE AND TEXT FOR 5 NEAREST NEIGHBORS
** Lauren Royal
0.939389752369
lauren royal born march 3 circa 1965 is a book writer from california royal has written bo th historic and novelistic booksa selfproclaimed angels baseball fan
** %C3%81ine Hyland
0.939399884114
ine hyland ne donlon is emeritus professor of education and former vicepresident of univer sity college cork ireland she was born in 1942 in athboy co
** Dorothy E. Smith
0.941132059201
dorothy edith smithborn july 6 1926 is a canadian sociologist with research interests besi des in sociology in many disciplines including womens studies psychology and educational
** Kayee Griffin
0.941620752446
kayee frances griffin born 6 february 1950 is an australian politician and former australi an labor party member of the new south wales legislative council serving
** Janine Shepherd
0.942516297403
janine lee shepherd am born 1962 is an australian p