# Document retrieval from wikipedia data



In [48]:
import sklearn
import pandas as pd
import numpy as np

# Load some text data - from wikipedia, pages on people

In [3]:
people = pd.read_csv('people_wiki.csv')

Data contains:  link to wikipedia article, name of person, text of article.

In [4]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [5]:
len(people)

59071

# Explore the dataset and checkout the text it contains

## Exploring the entry for president Obama

In [6]:
obama = people[people['name'] == 'Barack Obama']

In [7]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [8]:
obama['text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

## Exploring the entry for actor George Clooney

In [9]:
clooney = people[people['name'] == 'George Clooney']
clooney['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

# Get the word counts for Obama article

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(obama['text'])

In [34]:

obama_word_count_df = pd.DataFrame(
    {'word': vectorizer.get_feature_names(), 'count': X.toarray()[0]})
obama_word_count_df = obama_word_count_df.sort_values('count', ascending=False)
obama_word_count_df.head()

Unnamed: 0,count,word
242,40,the
115,30,in
28,21,and
162,18,of
245,14,to


Most common words include uninformative words like "the", "in", "and",...

# Compute TF-IDF for the corpus 

To give more weight to informative words, we weigh them by their TF-IDF scores.

In [91]:
import scipy.sparse as sp
import numpy as np
from sklearn.feature_extraction.text import (TfidfTransformer,
                                             TfidfVectorizer,
                                             _document_frequency)


class PriscillasTfidfTransformer(TfidfTransformer):   
    def fit(self, X, y=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(float(n_samples) / df) # + 1.0
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                        n=n_features, format='csr')

        return self
    

class PriscillasTfidfVectorizer(TfidfVectorizer):  
    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):

        super(PriscillasTfidfVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)

        self._tfidf = PriscillasTfidfTransformer(norm=norm, use_idf=use_idf,
                                       smooth_idf=smooth_idf,
                                       sublinear_tf=sublinear_tf)
        
def top_tfidf_features(row, features, top_n=25):
    df = pd.DataFrame({'feature': features, 'tfidf': row})
    return df.sort_values('tfidf', ascending=False).head(top_n)


In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = people['text']

In [94]:
# tfidf = tf(no norm) * log(N/df)
vectorizer = PriscillasTfidfVectorizer(norm=None, use_idf=True, smooth_idf=False)  
vectorizer.fit(corpus)

PriscillasTfidfVectorizer(analyzer='word', binary=False,
             decode_error='strict', dtype=<type 'numpy.int64'>,
             encoding='utf-8', input='content', lowercase=True, max_df=1.0,
             max_features=None, min_df=1, ngram_range=(1, 1), norm=None,
             preprocessor=None, smooth_idf=False, stop_words=None,
             strip_accents=None, sublinear_tf=False,
             token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
             use_idf=True, vocabulary=None)

In [95]:

features = vectorizer.get_feature_names()
row = np.squeeze(vectorizer.transform(obama['text']).toarray())
top_tfidf_features(row, features, 20)

Unnamed: 0,feature,tfidf
358557,obama,43.295653
45073,act,27.678223
259220,iraq,17.747379
138583,control,14.887061
292511,law,14.722936
365437,ordered,14.533374
328575,military,13.115933
417808,response,12.784385
258665,involvement,12.784385
155936,democratic,12.410689


Words with highest TF-IDF are much more informative.

# Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.  

In [96]:
clinton = people[people['name'] == 'Bill Clinton']

In [97]:
beckham = people[people['name'] == 'David Beckham']

## Is Obama closer to Clinton than to Beckham?

We will use cosine distance, which is given by

(1-cosine_similarity) 

and find that the article about president Obama is closer to the one about former president Clinton than that of footballer David Beckham.

In [122]:

from sklearn.metrics.pairwise import cosine_similarity

def calc_tfidf(vect, text):
    return vect.transform(text).toarray()


clinton_tfidf = calc_tfidf(vectorizer, clinton['text'])
obama_tfidf = calc_tfidf(vectorizer, obama['text'])

1 - cosine_similarity(clinton_tfidf, obama_tfidf)

array([[ 0.83303112]])

In [123]:
clinton_tfidf.shape

(1, 548429)

In [124]:
clinton_tfidf

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [101]:
beckham_tfidf = vectorizer.transform(beckham['text']).toarray()
1- cosine_similarity(obama_tfidf, beckham_tfidf)

array([[ 0.97910435]])


# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [134]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors()

model.fit(tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [135]:
dist, ind = model.kneighbors(obama_tfidf)

In [136]:
dist

array([[ 102.63976054,  103.24219261,  103.299029  ,  103.3131305 ,
         103.31773757]])

In [137]:
ind

array([[35817, 24478, 38376, 38714, 57108]])

In [138]:

people.iloc[np.squeeze(ind), :]['name']

35817               Barack Obama
24478                  Joe Biden
38376             Samantha Power
38714    Eric Stern (politician)
57108     Hillary Rodham Clinton
Name: name, dtype: object

As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians.  

## Other examples of document retrieval

In [27]:
swift = people[people['name'] == 'Taylor Swift']

In [28]:
knn_model.query(swift)

query_label,reference_label,distance,rank
0,Taylor Swift,0.0,1
0,Carrie Underwood,0.76231884058,2
0,Alicia Keys,0.764705882353,3
0,Jordin Sparks,0.769633507853,4
0,Leona Lewis,0.776119402985,5


In [29]:
jolie = people[people['name'] == 'Angelina Jolie']

In [30]:
knn_model.query(jolie)

query_label,reference_label,distance,rank
0,Angelina Jolie,0.0,1
0,Brad Pitt,0.784023668639,2
0,Julianne Moore,0.795857988166,3
0,Billy Bob Thornton,0.803069053708,4
0,George Clooney,0.8046875,5


In [32]:
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [33]:
knn_model.query(arnold)

PROGRESS: Starting pairwise querying...
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | 0            | 1       | 0.00169288  | 23.009ms     |
PROGRESS: | Done         |         | 100         | 151.716ms    |
PROGRESS: +--------------+---------+-------------+--------------+


query_label,reference_label,distance,rank
0,Arnold Schwarzenegger,0.0,1
0,Jesse Ventura,0.818918918919,2
0,John Kitzhaber,0.824615384615,3
0,Lincoln Chafee,0.833876221498,4
0,Anthony Foxx,0.833910034602,5


# Assignments

## Top word count words for Elton John

In [12]:
john = people[people['name'] == 'Elton John']
john

URI,name,text,word_count
<http://dbpedia.org/resou rce/Elton_John> ...,Elton John,sir elton hercules john cbe born reginald ken ...,"{'all': 1, 'least': 1, 'producer': 1, 'heavi ..."

tfidf
"{'all': 1.6431112434912472, ..."


In [16]:
john[['word_count']]

# john[['word_count']]  is a table, SFrame
# john['word_count'] is a column, SArray

word_count
"{'all': 1, 'least': 1, 'producer': 1, 'heavi ..."


In [30]:
# john['word_count'] = graphlab.text_analytics.count_words(john['text'])
john[['word_count']]\
    .stack('word_count', new_column_name = ['word','count'])\
    .sort('count', ascending=False)

word,count
the,27
in,18
and,15
of,13
a,10
has,9
john,7
he,7
on,6
award,5


## Top TF-IDF words for Elton John

In [31]:
john[['tfidf']] \
    .stack('tfidf', new_column_name = ['word','tfidf']) \
    .sort('tfidf', ascending=False)

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
tonightcandle,10.9864953892
overallelton,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


## The cosine distance between 'Elton John's and 'Victoria Beckham's articles (represented with TF-IDF) falls within which range?

In [32]:
victoria = people[people['name'] == 'Victoria Beckham']
victoria

URI,name,text,word_count
<http://dbpedia.org/resou rce/Victoria_Beckham> ...,Victoria Beckham,victoria caroline beckham ne adams born 17 april ...,"{'millionin': 1, 'saying': 1, 'cameo': 1, ..."

tfidf
"{'millionin': 7.728398851203712, ..."


In [21]:
graphlab.distances.cosine(john['tfidf'][0], victoria['tfidf'][0])

0.9567006376655429

## The cosine distance between 'Elton John's and 'Paul McCartney's articles (represented with TF-IDF) falls within which range?

In [23]:
paul = people[people['name'] == 'Paul McCartney']

In [24]:
graphlab.distances.cosine(john['tfidf'][0], paul['tfidf'][0])

0.8250310029221779

## Who is the nearest neighbor to 'Elton John' using raw word counts?

In [38]:
knn_count_model = graphlab.nearest_neighbors.create(
    people, features=['word_count'], label='name', distance='cosine')

In [39]:
knn_count_model.query(john)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


## Who is the nearest neighbor to 'Elton John' using TF-IDF?

In [36]:
knn_tfidf_model = graphlab.nearest_neighbors.create(
    people, features=['tfidf'], label='name', distance='cosine')

In [37]:
knn_tfidf_model.query(john)

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


## Who is the nearest neighbor to 'Victoria Beckham' using raw word counts?

In [40]:
knn_count_model.query(victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


## Who is the nearest neighbor to 'Victoria Beckham' using TF-IDF?

In [42]:
knn_tfidf_model.query(victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
