# Document retrieval from wikipedia data

## Fire up GraphLab Create
dataset https://d396qusza40orc.cloudfront.net/phoenixassets/people_wiki.csv

In [1]:
import pandas as pd
import numpy as np

# Load some text data - from wikipedia, pages on people

In [2]:
people = pd.read_csv('people_wiki.csv')

Data contains:  link to wikipedia article, name of person, text of article.

In [3]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
len(people)

59071

# Explore the dataset and checkout the text it contains

## Exploring the entry for president Obama

In [5]:
obama = people[people['name'] == 'Barack Obama']

In [6]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [7]:
obama['text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

## Exploring the entry for actor George Clooney

In [8]:
clooney = people[people['name'] == 'George Clooney']
clooney['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

# Get the word counts for Obama article

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
count_vectorizer = CountVectorizer()
count = count_vectorizer.fit_transform(people['text'])


In [22]:
analyzer = count_vectorizer.build_analyzer()

In [12]:
voc = count_vectorizer.vocabulary_
rvoc = dict(zip(voc.itervalues(),voc.iterkeys()))

In [26]:
def count_words(doc):
    dic = {}
    if analyzer(doc):
        M = count_vectorizer.fit_transform([doc]).toarray()[0]
        for word,index in count_vectorizer.vocabulary_.items():
            dic[word] = M[index]
    return dic

In [27]:
obama['word_count'] = obama['text'].apply(count_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [29]:
obama

Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{u'operations': 1, u'represent': 1, u'peace': ..."


## Sort the word counts for the Obama article

### Turning dictonary of word counts into a table

In [30]:
obama_word_count_table = pd.DataFrame(obama['word_count'].values[0].items(),columns=['word','count'])

### Sorting the word counts to show most common words at the top

In [33]:
obama_word_count_table.sort_values(by='count',ascending=False).head()

Unnamed: 0,word,count
266,the,40
220,in,30
138,and,21
126,of,18
25,to,14


Most common words include uninformative words like "the", "in", "and",...

# Compute TF-IDF for the corpus 

To give more weight to informative words, we weigh them by their TF-IDF scores.

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer

In [35]:
#people['word_count'] = people['text'].apply(count_words)
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [36]:
tfidftransformer = TfidfTransformer(norm=None)
tfidf_weight = tfidftransformer.fit_transform(count)

In [37]:
def compute_tfidf(count,tfidf_M,voc):
    ans = []
    num = count.shape[0]
    for i in range(0,num):
        idx = np.where(count[i]!=np.zeros(count[i].shape))[1]
        dic = {}
        dic = dict((voc[index],tfidf_M[i,index]) for index in idx)
        ans.append(dic)
    return ans

In [38]:
def compute(i):
    idx = np.where(count[i]!=np.zeros(count[i].shape))[1]
    dic = {}
    return dict((rvoc[index],tfidf_weight[i,index]) for index in idx)

In [None]:
#t = map(compute,range(0,count.shape[0]))

In [None]:
#tfidf = compute_tfidf(count,tfidf_weight,rvoc)

In [None]:
#people['tfidf'] = tfidf

## Examine the TF-IDF for the Obama article

In [39]:
obama = people[people['name'] == 'Barack Obama']

In [40]:
word = [rvoc[i] for i in tfidf_weight[obama.index[0]].indices]

In [41]:
obama_word_count_table = pd.DataFrame({'word':word,'tfidf':tfidf_weight[obama.index[0]].data})
obama_word_count_table.sort_values(by='tfidf',ascending =False)

Unnamed: 0,tfidf,word
171,52.277114,obama
3,40.004063,the
177,35.674051,act
5,30.028962,in
155,21.741728,iraq
4,21.015648,and
137,20.721856,law
185,18.884330,control
11,18.074810,of
58,17.592044,us


Words with highest TF-IDF are much more informative.

# Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.  

In [42]:
clinton = people[people['name'] == 'Bill Clinton']

In [43]:
beckham = people[people['name'] == 'David Beckham']

## Is Obama closer to Clinton than to Beckham?

We will use cosine distance, which is given by

(1-cosine_similarity) 

and find that the article about president Obama is closer to the one about former president Clinton than that of footballer David Beckham.

In [44]:
from sklearn.metrics.pairwise import cosine_distances

In [45]:
cosine_distances(tfidf_weight[obama.index[0]],tfidf_weight[clinton.index[0]])

array([[ 0.67497775]])

In [46]:
cosine_distances(tfidf_weight[obama.index[0]],tfidf_weight[beckham.index[0]])

array([[ 0.8420454]])

# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [47]:
from sklearn.neighbors import KNeighborsClassifier

In [48]:
knn_model = KNeighborsClassifier(algorithm='brute',metric='cosine')
knn_model.fit(tfidf_weight,people['name'])

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [49]:
dist,ind= knn_model.kneighbors(tfidf_weight[obama.index[0]])

In [50]:
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
35817,-8.881784e-16,Barack Obama
24478,0.5707807,Joe Biden
57108,0.6159341,Hillary Rodham Clinton
38376,0.6249935,Samantha Power
38714,0.6497651,Eric Stern (politician)


As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians.  

## Other examples of document retrieval

In [51]:
swift = people[people['name'] == 'Taylor Swift']

In [52]:
dist,ind = knn_model.kneighbors(tfidf_weight[swift.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
54264,-2.220446e-16,Taylor Swift
317,0.6161387,Carrie Underwood
27793,0.6247446,Adele
29297,0.6375446,Kelly Clarkson
1341,0.6487036,Dolly Parton


In [53]:
jolie = people[people['name'] == 'Angelina Jolie']

In [54]:
dist,ind = knn_model.kneighbors(tfidf_weight[jolie.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
39521,-4.440892e-16,Angelina Jolie
29009,0.627905,Barbara Hershey
57434,0.6337704,Glenn Close
34756,0.6438354,Maggie Smith
44992,0.6499563,Julianne Moore


In [55]:
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [56]:
dist,ind = knn_model.kneighbors(tfidf_weight[arnold.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
16018,-4.440892e-16,Arnold Schwarzenegger
35293,0.7397827,Paul Grant (bodybuilder)
58965,0.7465629,Bonnie Garcia
36682,0.7598034,Abel Maldonado
10499,0.7676966,David Israel


# Question 1

In [57]:
elton = people[people['name']=='Elton John']

In [58]:
word = [rvoc[i] for i in count[elton.index[0]].indices]

In [59]:
elton_word_count_table = pd.DataFrame({'word':word,'count':count[elton.index[0]].data})

In [60]:
elton_word_count_table.sort_values(by='count',ascending=False).head()

Unnamed: 0,count,word
246,27,the
244,18,in
245,15,and
237,13,of
219,9,has


In [61]:
elton_word_tfidf_table = pd.DataFrame({'word':word,'tfidf':tfidf_weight[elton.index[0]].data})

something wrong

In [62]:
elton_word_tfidf_table.sort_values(by='tfidf',ascending=False).head()

Unnamed: 0,count,word
5,27.002743,brits
178,21.29863,1988
92,20.938563,globe
188,20.465179,two
244,20.081204,in


# Question 2

In [63]:
victoria = people[people['name']=='Victoria Beckham']

In [64]:
paul = people[people['name']=='Paul McCartney']

In [65]:
cosine_distances(tfidf_weight[elton.index[0]],tfidf_weight[victoria.index[0]])

array([[ 0.85192118]])

In [67]:
cosine_distances(tfidf_weight[elton.index[0]],tfidf_weight[paul.index[0]])

array([[ 0.69231325]])

# Question 3

In [68]:
knn_model_wordcounts = KNeighborsClassifier(algorithm='brute',metric='cosine')
knn_model_wordcounts.fit(count,people['name'])

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [69]:
knn_model_tfidf = KNeighborsClassifier(algorithm='brute',metric='cosine')
knn_model_tfidf.fit(tfidf_weight,people['name'])

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [70]:
dist,ind = knn_model_wordcounts.kneighbors(tfidf_weight[elton.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
19923,0.237925,Elton John
28825,0.519477,Rod Stewart
29297,0.536914,Kelly Clarkson
17505,0.537179,George Michael
16423,0.5393,Usher (entertainer)


In [71]:
dist,ind = knn_model_tfidf.kneighbors(tfidf_weight[elton.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
19923,2.220446e-16,Elton John
28825,0.5893611,Rod Stewart
31595,0.6336579,Phil Collins
27793,0.6365243,Adele
26049,0.6423975,Sting (musician)


In [72]:
dist,ind = knn_model_wordcounts.kneighbors(tfidf_weight[victoria.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
50411,0.28401,Victoria Beckham
23386,0.659912,David Beckham
56064,0.668681,Yuliya Polishchuk
58438,0.676464,Mona al Mansouri
6635,0.682756,Wal%C3%A9 Adeyemi


In [73]:
dist,ind = knn_model_tfidf.kneighbors(tfidf_weight[victoria.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
50411,2.220446e-16,Victoria Beckham
23386,0.5464767,David Beckham
17264,0.7184218,Mel B
39144,0.7459557,Stephen Dow Beckham
5385,0.7518478,Hilary Alexander
