In [162]:
import graphlab

## Load data

In [163]:
# Load the people dataset from the local 'people_wiki.gl' path into an SFrame.
people = graphlab.SFrame('people_wiki.gl')

In [164]:
# Preview the first rows to see which columns were loaded (URI, name, text).
people.head()

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


## 1 Get the word count for Elton John

In [165]:
# Keep only the row(s) whose 'name' is exactly 'Elton John'.
elton = people[ people['name'] == 'Elton John']

In [166]:
# Count the words of the article text and store the result as a new
# 'word_count' column on the filtered `elton` frame.
elton['word_count'] = graphlab.text_analytics.count_words(elton['text'])

Sort the word count

In [167]:
# Unpack the 'word_count' column into one (word, count) row per entry and
# order the rows from the highest count to the lowest.
elton_word_count = (
    elton[['word_count']]
    .stack('word_count', new_column_name=['word', 'count'])
    .sort('count', ascending=False)
)

In [168]:
# Show the top of the table; it was sorted by descending count in the previous
# cell, so these are the most frequent words.
elton_word_count.head()

word,count
the,27
in,18
and,15
of,13
a,10
has,9
he,7
john,7
on,6
since,5


## 2 Top TF-IDF words for Elton John

In [169]:
# Word counts for every article (not just Elton John); they feed the TF-IDF
# step and the raw-word-count nearest-neighbours model further down.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])

In [170]:
# Derive TF-IDF weights from the per-article word counts of the whole table.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])

In [171]:
# Attach the TF-IDF result to the main table as the 'tfidf' column.
people['tfidf'] = tfidf

In [172]:
# Re-select Elton John: the earlier `elton` was filtered before `people`
# gained its 'word_count' and 'tfidf' columns.
elton = people[ people['name'] == 'Elton John' ]

In [173]:
# Unpack the 'tfidf' column into (word, tfidf) rows and display them from the
# highest weight to the lowest.
(
    elton[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
overallelton,10.9864953892
tonightcandle,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


## 3 The cosine distance between the 'Elton John' and 'Victoria Beckham' articles (represented with TF-IDF)

In [174]:
# Select both articles by exact name, then compare the TF-IDF value of the
# first matching row of each. A smaller cosine distance means more similar.
elton = people[ people['name'] == 'Elton John' ]
victoria = people[ people['name'] == 'Victoria Beckham' ]
graphlab.distances.cosine(elton['tfidf'][0], victoria['tfidf'][0])

0.9567006376655429

## 4 The cosine distance between the 'Elton John' and 'Paul McCartney' articles (represented with TF-IDF)

In [175]:
# Same comparison against Paul McCartney; `elton` is reused from the previous
# cell, so that cell must have been run first.
paul = people[ people['name'] == 'Paul McCartney' ]
graphlab.distances.cosine(elton['tfidf'][0], paul['tfidf'][0])

0.8250310029221779

## 5 Who is closer to 'Elton John', 'Victoria Beckham' or 'Paul McCartney'?

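Paul McCartney. His cosine distance to Elton John (about 0.825) is smaller than Victoria Beckham's (about 0.957), and a smaller cosine distance means the articles are more similar.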

## 6 Who is the nearest neighbor to 'Elton John' using raw word counts?

Build a nearest neighbours model

In [176]:
# Nearest-neighbours model over the raw 'word_count' feature, with results
# labelled by 'name' and articles compared by cosine distance.
knn_model_wordcount = graphlab.nearest_neighbors.create(people, features=['word_count'], label='name', distance='cosine')

PROGRESS: Starting brute force nearest neighbors model training.


In [177]:
# Query with Elton John's row. The article itself comes back at rank 1 with a
# distance of ~0, so the nearest *other* person is the rank-2 row.
knn_model_wordcount.query(elton)

PROGRESS: Starting pairwise querying.
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | 0            | 1       | 0.00169288  | 3.649ms      |
PROGRESS: | Done         |         | 100         | 247.568ms    |
PROGRESS: +--------------+---------+-------------+--------------+


query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


## 7 Who is the nearest neighbor to 'Elton John' using TF-IDF?

In [178]:
# Same model setup as the word-count one, but using the 'tfidf' feature.
knn_model_tfidf = graphlab.nearest_neighbors.create(people, features=['tfidf'], label='name', distance='cosine')

PROGRESS: Starting brute force nearest neighbors model training.


In [179]:
# Rank 1 is Elton John himself (distance ~0); read the answer from rank 2.
knn_model_tfidf.query(elton)

PROGRESS: Starting pairwise querying.
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | 0            | 1       | 0.00169288  | 5.156ms      |
PROGRESS: | Done         |         | 100         | 290.818ms    |
PROGRESS: +--------------+---------+-------------+--------------+


query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


## 8 Who is the nearest neighbor to 'Victoria Beckham' using raw word counts?

In [180]:
# `victoria` was selected in the cosine-distance cell above. Rank 1 is her own
# article (distance ~0); the nearest other person is rank 2.
knn_model_wordcount.query(victoria)

PROGRESS: Starting pairwise querying.
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | 0            | 1       | 0.00169288  | 4.399ms      |
PROGRESS: | Done         |         | 100         | 263.769ms    |
PROGRESS: +--------------+---------+-------------+--------------+


query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


## 9 Who is the nearest neighbor to 'Victoria Beckham' using TF-IDF?

In [181]:
# TF-IDF neighbours for Victoria Beckham; rank 1 is her own article, so the
# answer is the rank-2 row.
knn_model_tfidf.query(victoria)

PROGRESS: Starting pairwise querying.
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | 0            | 1       | 0.00169288  | 4.919ms      |
PROGRESS: | Done         |         | 100         | 288.62ms     |
PROGRESS: +--------------+---------+-------------+--------------+


query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
