In [1]:
# imports
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import operator

from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import DistanceMetric

## 1. Load people data from Wiki 

In [2]:
ppl = pd.read_csv('people_wiki.csv')
print ppl.shape

(59071, 3)


In [3]:
ppl.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
swift = ppl[ppl['name'] == 'Taylor Swift']
swift

Unnamed: 0,URI,name,text
54264,<http://dbpedia.org/resource/Taylor_Swift>,Taylor Swift,taylor alison swift born december 13 1989 is a...


In [5]:
print swift.text.values

[ 'taylor alison swift born december 13 1989 is an american singersongwriter raised in wyomissing pennsylvania swift moved to nashville tennessee at the age of 14 to pursue a career in country music she signed with the independent label big machine records and became the youngest songwriter ever hired by the sonyatv music publishing house the release of swifts selftitled debut album in 2006 established her as a country music star her third single our song made her the youngest person to singlehandedly write and perform a numberone song on the hot country songs chart she received a best new artist nomination at the 2008 grammy awards swifts second album fearless was released in 2008 buoyed by the pop crossover success of the singles love story and you belong with me fearless became the bestselling album of 2009 in the us the album won four grammy awards with swift becoming the youngest ever album of the year winner swifts third and fourth albums 2010s speak now and 2012s red both sold o

In [8]:
depp = ppl[ppl['name'] == 'Johnny Depp']
depp

Unnamed: 0,URI,name,text
48436,<http://dbpedia.org/resource/Johnny_Depp>,Johnny Depp,john christopher johnny depp ii born june 9 19...


In [9]:
beckham = ppl[ppl['name'] == 'David Beckham']
beckham

Unnamed: 0,URI,name,text
23386,<http://dbpedia.org/resource/David_Beckham>,David Beckham,david robert joseph beckham obe bkm born 2 may...


In [10]:
obama = ppl[ppl['name'] == 'Barack Obama']
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


## 2. Explore word count

In [11]:
wordcount_swift = Counter(" ".join(swift['text'].values.tolist()).split(" "))

In [12]:
print wordcount_swift

Counter({'the': 28, 'and': 14, 'in': 12, 'of': 8, 'album': 7, 'awards': 7, 'swift': 7, 'music': 7, 'a': 7, 'to': 6, 'swifts': 5, 'million': 5, 'her': 5, 'country': 5, 'she': 4, 'grammy': 4, 'as': 4, 'has': 3, 'us': 3, 'ever': 3, 'sold': 3, 'youngest': 3, 'by': 3, 'one': 3, 'copies': 3, 'albums': 3, 'with': 3, 'week': 3, 'singles': 3, 'over': 2, 'both': 2, '2014': 2, 'song': 2, 'songwriter': 2, 'were': 2, 'now': 2, 'association': 2, 'release': 2, 'billboard': 2, 'for': 2, 'won': 2, 'fearless': 2, 'nashville': 2, 'career': 2, 'first': 2, 'speak': 2, '1989': 2, 'was': 2, 'songwriters': 2, 'more': 2, 'on': 2, 'released': 2, 'than': 2, 'made': 2, 'single': 2, 'seven': 2, 'is': 2, 'it': 2, 'an': 2, 'at': 2, 'hot': 2, 'became': 2, 'you': 2, 'third': 2, '2008': 2, 'songs': 2, 'singersongwriter': 1, 'charities': 1, 'actress': 1, 'years': 1, 'four': 1, 'shake': 1, 'its': 1, 'fifth': 1, 'previous': 1, 'machine': 1, 'personal': 1, 'winner': 1, 'day': 1, 'only': 1, 'other': 1, '2010': 1, '2012': 1,

In [13]:
# top 10 words with the most counts in the swift article 
wordcount_swift.most_common(10)

[('the', 28),
 ('and', 14),
 ('in', 12),
 ('of', 8),
 ('album', 7),
 ('awards', 7),
 ('swift', 7),
 ('music', 7),
 ('a', 7),
 ('to', 6)]

In [14]:
# top 10 words with the most counts in the depp article 
wordcount_depp = Counter(" ".join(depp['text'].values.tolist()).split(" "))
wordcount_depp.most_common(10)

[('the', 31),
 ('in', 19),
 ('and', 14),
 ('of', 10),
 ('for', 9),
 ('has', 7),
 ('he', 7),
 ('actor', 6),
 ('award', 5),
 ('depp', 5)]

In [15]:
# top 10 words with the most counts in the beckham article 
wordcount_beckham = Counter(" ".join(beckham['text'].values.tolist()).split(" "))
wordcount_beckham.most_common(10)

[('the', 21),
 ('in', 16),
 ('he', 12),
 ('and', 12),
 ('beckham', 8),
 ('for', 7),
 ('of', 7),
 ('his', 6),
 ('with', 6),
 ('was', 5)]

In [16]:
# top 10 words with the most counts in the obama article 
wordcount_obama = Counter(" ".join(obama['text'].values.tolist()).split(" "))
wordcount_obama.most_common(10)

[('the', 40),
 ('in', 30),
 ('and', 21),
 ('of', 18),
 ('to', 14),
 ('his', 11),
 ('obama', 9),
 ('act', 8),
 ('he', 7),
 ('a', 7)]

## 3. Compute and explore TF-IDFs

In [17]:
countvect = CountVectorizer(stop_words = 'english')
wordcount_corpus = countvect.fit_transform(ppl['text'])

In [18]:
feature_name = countvect.get_feature_names()

In [19]:
tfidftransformer = TfidfTransformer(norm = None)
tfidf = tfidftransformer.fit_transform(wordcount_corpus)

In [20]:
# examine TF-IDFs on Taylor Swift
tfidf_swift = tfidf[swift.index]
feature_swift = map(lambda x: feature_name[x], tfidf_swift.indices)
tfidf_swift_vec = dict(zip(feature_swift, tfidf_swift.data))

In [32]:
tfidf_swift_sorted = sorted(tfidf_swift_vec.items(), key = operator.itemgetter(1))
tfidf_swift_sorted.reverse()
tfidf_swift_sorted[0:10]

[(u'swift', 52.177389772964872),
 (u'swifts', 44.477349322521036),
 (u'album', 24.15760450729659),
 (u'awards', 22.986851344203274),
 (u'grammy', 20.339100444823799),
 (u'million', 20.08750268378023),
 (u'music', 19.548955401685085),
 (u'country', 18.370658605378864),
 (u'copies', 16.49648489813374),
 (u'fearless', 15.447664881642417)]

In [33]:
# function to find top tdidf 

def top_tdidf(person):
    tfidf_person = tfidf[person.index]
    feature_person = map(lambda x: feature_name[x], tfidf_swift.indices)
    tfidf_person_vec = dict(zip(feature_person, tfidf_person.data))
    tfidf_person_sorted = sorted(tfidf_person_vec.items(), key = operator.itemgetter(1))
    tfidf_person_sorted.reverse()
    return tfidf_person_sorted[0:10]

In [34]:
top_tdidf(depp)

[(u'efforts', 42.926574681001846),
 (u'love', 41.364701255791083),
 (u'star', 24.174851770943292),
 (u'chart', 23.84258537614884),
 (u'age', 23.746386932252264),
 (u'grammy', 22.677390524460961),
 (u'big', 22.480167480524344),
 (u'honored', 15.165007774276788),
 (u'hot', 15.120806368274012),
 (u'disaster', 15.087722122744415)]

In [35]:
top_tdidf(beckham)

[(u'bestselling', 64.757536158015171),
 (u'known', 20.947698035751195),
 (u'hired', 17.12960648556264),
 (u'selftitled', 15.67138241317334),
 (u'new', 15.427772226587306),
 (u'year', 11.676088043889754),
 (u'natural', 11.404322520990725),
 (u'popfocused', 11.360474061829017),
 (u'perform', 11.342308632680378),
 (u'copies', 11.293365137302578)]

In [36]:
top_tdidf(obama)

[(u'supports', 52.277113834307315),
 (u'act', 35.674051187909924),
 (u'publishing', 21.741727931276476),
 (u'released', 20.721855882367674),
 (u'december', 18.884330378434285),
 (u'trouble', 17.526980051210632),
 (u'singles', 17.114203144108135),
 (u'getting', 16.409249536745939),
 (u'previous', 15.780836746511332),
 (u'career', 12.077519128423219)]

## 4. Compute distance between articles 

In [41]:
# manually compute distances
print('Distance btw Swift and Depp: {}' .format(1-cosine_similarity(tfidf[swift.index], tfidf[depp.index])))
print('Distance btw Swift and Beckham: {}' .format(1-cosine_similarity(tfidf[swift.index], tfidf[beckham.index])))
print('Distance btw Swift and Obama: {}' .format(1-cosine_similarity(tfidf[swift.index], tfidf[obama.index])))

Distance btw Swift and Depp: [[ 0.9595084]]
Distance btw Swift and Beckham: [[ 0.97621295]]
Distance btw Swift and Obama: [[ 0.96917793]]


In [42]:
print('Distance btw Depp and Beckham: {}' .format(1-cosine_similarity(tfidf[depp.index], tfidf[beckham.index])))
print('Distance btw Depp and Obama: {}' .format(1-cosine_similarity(tfidf[depp.index], tfidf[obama.index])))

Distance btw Depp and Beckham: [[ 0.97129603]]
Distance btw Depp and Obama: [[ 0.98060606]]


In [43]:
print('Distance btw Beckham and Obama: {}' .format(1-cosine_similarity(tfidf[beckham.index], tfidf[obama.index])))

Distance btw Beckham and Obama: [[ 0.97443419]]


## 5. Build and explore nearest neighbors model

In [52]:
nb1 = NearestNeighbors(n_neighbors = 5, algorithm = 'brute', metric = 'euclidean')

nb2 = NearestNeighbors(n_neighbors = 5, algorithm = 'brute')

nb3 = NearestNeighbors(n_neighbors = 5, algorithm = 'brute', metric = 'manhattan')

In [53]:
nb1.fit(tfidf)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [54]:
nb2.fit(tfidf)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [55]:
nb3.fit(tfidf)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='manhattan',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [61]:
# find 10 nearest neighbors of swift using model 1
dist1, index1 = nb1.kneighbors(X = tfidf[swift.index], n_neighbors = 10, return_distance = True)

In [57]:
zip(map(lambda x: ppl['name'].iloc[x], index1), dist1)

[(54264      Taylor Swift
  6215          Amy Grant
  25612     Rodney Atkins
  30135    Kellie Pickler
  41723     Stacie Orrico
  54802     Darius Rucker
  55170     Alison Krauss
  17505    George Michael
  57446      Richard Marx
  22894     Dido (singer)
  Name: name, dtype: object,
  array([   0.        ,  118.96691879,  119.96919175,  120.98169082,
          122.43973736,  122.4679805 ,  122.5878386 ,  122.6106631 ,
          123.06504989,  123.41357024]))]

In [60]:
# use model 2
dist2, index2 = nb2.kneighbors(X = tfidf[swift.index], n_neighbors = 10, return_distance = True)
zip(map(lambda x: ppl['name'].iloc[x], index2), dist2)

[(54264      Taylor Swift
  6215          Amy Grant
  25612     Rodney Atkins
  30135    Kellie Pickler
  41723     Stacie Orrico
  54802     Darius Rucker
  55170     Alison Krauss
  17505    George Michael
  57446      Richard Marx
  22894     Dido (singer)
  Name: name, dtype: object,
  array([   0.        ,  118.96691879,  119.96919175,  120.98169082,
          122.43973736,  122.4679805 ,  122.5878386 ,  122.6106631 ,
          123.06504989,  123.41357024]))]

In [62]:
# use model 3
dist3, index3 = nb3.kneighbors(X = tfidf[swift.index], n_neighbors = 10, return_distance = True)
zip(map(lambda x: ppl['name'].iloc[x], index3), dist3)

[(54264          Taylor Swift
  25612         Rodney Atkins
  6215              Amy Grant
  41723         Stacie Orrico
  33780    Sam Smith (singer)
  24649        Agnes Carlsson
  22894         Dido (singer)
  54802         Darius Rucker
  56720       Sylvia (singer)
  47585    Corinne Bailey Rae
  Name: name, dtype: object,
  array([    0.        ,  1291.70450375,  1310.05327475,  1310.1304238 ,
          1338.63503958,  1344.92404505,  1344.94369664,  1351.51059666,
          1362.57463175,  1371.98668887]))]