In [1]:
## Solution to "Clustering and Similarity: Retrieving Wikipedia Articles"

In [2]:
import graphlab




In [3]:
# Load the people dataset into an SFrame from the relative path 'people_wiki.gl/'.
# The preview further down shows the columns URI, name and text.
people = graphlab.SFrame('people_wiki.gl/')







In [4]:
# Build a bag-of-words representation: count_words turns each article's text
# into a {word: count} dictionary, which is stored in a new 'word_count' column.
article_word_counts = graphlab.text_analytics.count_words(people['text'])
people['word_count'] = article_word_counts
# Preview the first rows, now including the new column.
people.head()

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'since': 1L, 'carltons': 1L, 'being': 1L, '2005': ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'precise': 1L, 'thomas': 1L, 'closely': 1L, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'just': 1L, 'issued': 1L, 'mainly': 1L, ..."
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'all': 1L, 'bauforschung': 1L, ..."
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'legendary': 1L, 'gangstergenka': 1L, ..."
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...,"{'now': 1L, 'currently': 1L, 'less': 1L, 'being': ..."
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...,"{'exclusive': 2L, 'producer': 1L, 'tribe': ..."
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...,"{'taxi': 1L, 'salon': 1L, 'gangs': 1L, 'being': ..."
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...,"{'houston': 1L, 'frankie': 1L, 'labels': ..."
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...,"{'phenomenon': 1L, 'deborash': 1L, ..."


In [5]:
## Compare the top words by raw word count with the top words by TF-IDF.

# Compute TF-IDF weights from the word-count dictionaries and store them
# in a new 'tfidf' column next to 'word_count'.
article_tfidf = graphlab.text_analytics.tf_idf(people['word_count'])
people['tfidf'] = article_tfidf

In [6]:
# Select the row(s) whose 'name' is exactly 'Elton John'.
elton = people[people['name'] == 'Elton John']
# Parenthesized form prints the same table under Python 2 and is also valid
# Python 3 syntax, unlike the bare `print elton` statement.
print(elton)

+-------------------------------+------------+-------------------------------+
|              URI              |    name    |              text             |
+-------------------------------+------------+-------------------------------+
| <http://dbpedia.org/resour... | Elton John | sir elton hercules john cb... |
+-------------------------------+------------+-------------------------------+
+-------------------------------+-------------------------------+
|           word_count          |             tfidf             |
+-------------------------------+-------------------------------+
| {'all': 1L, 'six': 1L, 'pr... | {'all': 1.6431112434912472... |
+-------------------------------+-------------------------------+
[? rows x 5 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use len(sf) to force materialization.


In [7]:
# What are the 3 words in his article with the highest word counts?
# stack() unpacks the 'word_count' dictionary into one row per word, with the
# key in the 'word' column and the value in the 'count' column.
elton_counts_column = elton[['word_count']]
elton_word_count_table = elton_counts_column.stack(
    'word_count',
    new_column_name=['word', 'count'],
)
# Largest counts first; the top rows of the displayed table answer the question.
elton_word_count_table.sort('count', ascending=False)

word,count
the,27
in,18
and,15
of,13
a,10
has,9
he,7
john,7
on,6
since,5


In [8]:
# What are the 3 words in his article with the highest TF-IDF?
# stack() unpacks the 'tfidf' dictionary into one row per word, with the
# key in the 'word' column and the value in the 'weight' column.
elton_tfidf_column = elton[['tfidf']]
elton_word_count_table_tfidf = elton_tfidf_column.stack(
    'tfidf',
    new_column_name=['word', 'weight'],
)
# Largest weights first; the top rows of the displayed table answer the question.
elton_word_count_table_tfidf.sort('weight', ascending=False)

word,weight
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
overallelton,10.9864953892
tonightcandle,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


In [9]:
## Measuring distance
# Pick out the rows for the two people we want to compare against Elton John.
is_victoria = people['name'] == 'Victoria Beckham'
is_paul = people['name'] == 'Paul McCartney'
victoria = people[is_victoria]
paul = people[is_paul]

In [10]:
## What's the cosine distance between the articles on 'Elton John' and 'Victoria Beckham'?
# [0] takes the TF-IDF dictionary of the first row in each selection.
elton_tfidf = elton['tfidf'][0]
victoria_tfidf = victoria['tfidf'][0]
graphlab.distances.cosine(elton_tfidf, victoria_tfidf)

0.9567006376655429

In [11]:
## What's the cosine distance between the articles on 'Elton John' and 'Paul McCartney'?
# [0] takes the TF-IDF dictionary of the first row in each selection.
elton_tfidf = elton['tfidf'][0]
paul_tfidf = paul['tfidf'][0]
graphlab.distances.cosine(elton_tfidf, paul_tfidf)

0.8250310029221779

In [12]:
## Which one of the two is closest to Elton John? Does this result make sense to you?
## Answer: Paul McCartney is closer: his article has the smaller cosine distance to Elton John's (about 0.825 vs. 0.957), which means it is more similar.

In [13]:
## Build two nearest-neighbors models that differ only in their input feature.
## Both use cosine distance and label each article by the person's name.

# Model 1: raw word counts as features.
knn_model1 = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine',
)

# Model 2: TF-IDF weights as features.
knn_model2 = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine',
)

In [14]:
## What's the most similar article, other than itself, to the one on 'Elton John' using word count features?
# Query the word-count model with the Elton John row. In the recorded output,
# rank 1 is the article itself (distance ~0), so the answer is the rank-2 row.
knn_model1.query(elton)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


In [15]:
## What's the most similar article, other than itself, to the one on 'Elton John' using TF-IDF features?
# Query the TF-IDF model with the Elton John row. In the recorded output,
# rank 1 is the article itself (distance ~0), so the answer is the rank-2 row.
knn_model2.query(elton)

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


In [16]:
## What's the most similar article, other than itself, to the one on 'Victoria Beckham' using word count features?
# Query the word-count model with the Victoria Beckham row. In the recorded output,
# rank 1 is the article itself (distance ~0), so the answer is the rank-2 row.
knn_model1.query(victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


In [17]:
## What's the most similar article, other than itself, to the one on 'Victoria Beckham' using TF-IDF features?
# Query the TF-IDF model with the Victoria Beckham row. In the recorded output,
# rank 1 is the article itself (distance ~0), so the answer is the rank-2 row.
knn_model2.query(victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
