In [1]:
import graphlab
import sframe    # see below for install instruction
import sklearn
import matplotlib.pyplot as plt          # plotting
import numpy as np                       # dense matrices
from scipy.sparse import csr_matrix      # sparse matrices
%matplotlib inline

A newer version of GraphLab Create (v2.0.1) is available! Your current version is v1.10.1.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.


This non-commercial license of GraphLab Create is assigned to tamim_1382@yahoo.com and will expire on September 24, 2016. For commercial licensing options, visit https://turi.com/buy/.


[INFO] graphlab.cython.cy_server: GraphLab Create v1.10.1 started. Logging: /tmp/graphlab_server_1468230590.log
[INFO] sframe.cython.cy_server: SFrame v1.10.1 started. Logging /tmp/sframe_server_1468230594.log


In [3]:
# Load the Wikipedia people dataset and prepend a row-number column in one
# step; the neighbour tables further down are joined on that 'id' column.
wiki = sframe.SFrame('people_wiki.gl/').add_row_number()

In [4]:
def load_sparse_csr(filename):
    """Load a SciPy CSR matrix from a NumPy ``.npz`` archive.

    The archive must contain four arrays named ``data``, ``indices``,
    ``indptr`` and ``shape``. The names of all arrays found in the archive
    are printed as a quick sanity check.

    Parameters
    ----------
    filename : str or file-like
        Passed unchanged to ``numpy.load``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix rebuilt from the stored components.

    Raises
    ------
    KeyError
        If one of the four required arrays is missing from the archive.
    """
    # np.load on an .npz returns a lazily-read archive object that keeps the
    # underlying file open; the context manager guarantees it is closed once
    # the four arrays have been read into memory.
    with np.load(filename) as loader:
        print(loader.files)
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        shape = loader['shape']

    return csr_matrix((data, indices, indptr), shape)

In [5]:
word_count = load_sparse_csr('people_wiki_word_count.npz')

['indices', 'indptr', 'shape', 'data']


In [6]:
map_index_to_word = sframe.SFrame('people_wiki_map_index_to_word.gl/')

In [7]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the raw word-count rows: brute-force search
# with Euclidean distance. The fit() call is left as the cell's last
# expression so the fitted estimator's repr is displayed.
model = NearestNeighbors(metric='euclidean', algorithm='brute')
model.fit(word_count)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [8]:
wiki[wiki['name'] == 'Barack Obama']

id,URI,name,text
35817,<http://dbpedia.org/resou rce/Barack_Obama> ...,Barack Obama,barack hussein obama ii brk husen bm born august ...


In [9]:
distances, indices = model.kneighbors(word_count[35817], n_neighbors=10) # 1st arg: word count vector

In [10]:
neighbors = sframe.SFrame({'distance':distances.flatten(), 'id':indices.flatten()})
print wiki.join(neighbors, on='id').sort('distance')[['id','name','distance']]

+-------+----------------------------+---------------+
|   id  |            name            |    distance   |
+-------+----------------------------+---------------+
| 35817 |        Barack Obama        |      0.0      |
| 24478 |         Joe Biden          | 33.0756708171 |
| 28447 |       George W. Bush       | 34.3947670438 |
| 35357 |      Lawrence Summers      | 36.1524549651 |
| 14754 |        Mitt Romney         | 36.1662826401 |
| 13229 |      Francisco Barrio      | 36.3318042492 |
| 31423 |       Walter Mondale       | 36.4005494464 |
| 22745 | Wynn Normington Hugh-Jones | 36.4965751818 |
| 36364 |         Don Bonker         |  36.633318168 |
|  9210 |        Andy Anstett        | 36.9594372252 |
+-------+----------------------------+---------------+
[10 rows x 3 columns]



In [11]:
def unpack_dict(matrix, map_index_to_word):
    """Convert each row of a CSR matrix into a ``{word: value}`` dictionary.

    Parameters
    ----------
    matrix : scipy.sparse.csr_matrix
        One row per document; only the stored (non-zero) entries of a row
        end up in that row's dictionary.
    map_index_to_word : SFrame-like
        Must support ``.sort('index')['category']`` and yield the words in
        ascending order of their ``index``.
        NOTE(review): assumes the indices are exactly 0..n_columns-1 so that
        position in the sorted list equals the matrix column id — confirm.

    Returns
    -------
    list of dict
        One dictionary per matrix row, in row order.
    """
    # table[j] is the word associated with matrix column j.
    table = list(map_index_to_word.sort('index')['category'])
    data = matrix.data
    indices = matrix.indices
    indptr = matrix.indptr

    unpacked = []
    # `range` (not the Python-2-only `xrange`) keeps this usable on both
    # Python 2 and Python 3.
    for i in range(matrix.shape[0]):
        # indptr[i]:indptr[i+1] delimits row i's slice of indices/data.
        start, end = indptr[i], indptr[i + 1]
        words = [table[word_id] for word_id in indices[start:end]]
        unpacked.append(dict(zip(words, data[start:end].tolist())))
    return unpacked

wiki['word_count'] = unpack_dict(word_count, map_index_to_word)

In [12]:
def top_words(name):
    """
    Return a (word, count) table for the named person's Wikipedia page,
    ordered from the most frequent word to the least frequent.
    """
    person = wiki[wiki['name'] == name]
    # Explode the word_count dictionary into one row per (word, count) pair.
    counts = person[['word_count']].stack('word_count', new_column_name=['word','count'])
    ranked = counts.sort('count', ascending=False)
    return ranked

obama_words = top_words('Barack Obama')
print obama_words

barrio_words = top_words('Francisco Barrio')
print barrio_words

+-------+-------+
|  word | count |
+-------+-------+
|  the  |   40  |
|   in  |   30  |
|  and  |   21  |
|   of  |   18  |
|   to  |   14  |
|  his  |   11  |
| obama |   9   |
|  act  |   8   |
|   he  |   7   |
|   a   |   7   |
+-------+-------+
[273 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
+-----------+-------+
|    word   | count |
+-----------+-------+
|    the    |   36  |
|     of    |   24  |
|    and    |   18  |
|     in    |   17  |
|     he    |   10  |
|     to    |   9   |
| chihuahua |   7   |
|     a     |   6   |
|  governor |   6   |
|     as    |   5   |
+-----------+-------+
[225 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [13]:
combined_words = obama_words.join(barrio_words, on='word')

In [14]:
combined_words = combined_words.rename({'count':'Obama', 'count.1':'Barrio'})

In [15]:
combined_words.sort('Obama', ascending=False)

word,Obama,Barrio
the,40,36
in,30,17
and,21,18
of,18,24
to,14,9
his,11,5
he,7,10
a,7,6
as,6,5
was,5,4


In [16]:
# The five words with the highest count in Obama's article among the words
# shared with Barrio's article. The join result is not ordered by count and
# SFrame.sort returns a new frame rather than sorting in place, so the sort
# has to be applied here, right before slicing.
common_words = set(combined_words.sort('Obama', ascending=False)[0:5]['word'])

def has_top_words(word_count_vector):
    """Return True if every word in ``common_words`` occurs in the article.

    word_count_vector: dict mapping each word of one article to its count.
    """
    # extract the keys of word_count_vector and convert them to a set
    unique_words = set(word_count_vector.keys())
    # True only when common_words is a subset of the article's words
    return common_words.issubset(unique_words)

wiki['has_top_words'] = wiki['word_count'].apply(has_top_words)

# number of articles containing all of the common words (quiz answer)
contains = wiki['has_top_words'].sum()

In [17]:
print 'Output from your function:', has_top_words(wiki[32]['word_count'])
print 'Correct output: True'
print 'Also check the length of unique_words. It should be 167'

print 'Output from your function:', has_top_words(wiki[33]['word_count'])
print 'Correct output: False'
print 'Also check the length of unique_words. It should be 188'

Output from your function: True
Correct output: True
Also check the length of unique_words. It should be 167
Output from your function: False
Correct output: False
Also check the length of unique_words. It should be 188


In [18]:
print contains

56066


In [19]:
threes = wiki.filter_by(['Barack Obama','Joe Biden','George W. Bush'], 'name')

In [48]:
import numpy as np
import pandas as pd
# Word-count dictionaries of the three selected politicians, turned into a
# dense table: one row per article, one column per word, 0 where a word is
# absent from an article.
l = list(threes['word_count'])
mat = pd.DataFrame(l).fillna(0)


# Single-row frames for the three articles (numbering kept as used below:
# row1 = Biden, row2 = Obama, row3 = Bush).
row1 = wiki[wiki['name'] == 'Joe Biden']
row2 = wiki[wiki['name'] == 'Barack Obama']
row3 = wiki[wiki['name'] == 'George W. Bush']

In [50]:
print graphlab.toolkits.distances.euclidean(row1['word_count'][0], row2['tf_idf'][0])
print graphlab.toolkits.distances.euclidean(row1['word_count'][0], row3['tf_idf'][0])
print graphlab.toolkits.distances.euclidean(row2['tf_idf'][0], row3['tf_idf'][0])

123.29745601
134.426722219
128.840419901


In [21]:
l = list(threes['word_count'])

In [22]:
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that `sklearn.metrics.pairwise` has been loaded.
import sklearn.metrics.pairwise

# Pairwise Euclidean distances between the rows of the dense word-count table.
sklearn.metrics.pairwise.euclidean_distances(mat)

array([[  0.        ,  32.75667871,  33.07567082],
       [ 32.75667871,   0.        ,  34.39476704],
       [ 33.07567082,  34.39476704,   0.        ]])

In [23]:
bush_words = top_words('George W. Bush')

In [24]:
bush_words

word,count
the,39
in,22
of,14
and,14
bush,12
to,11
he,8
his,6
as,6
president,6


In [25]:
combined_words = obama_words.join(bush_words, on='word')

In [26]:
combined_words

word,count,count.1
the,40,39
in,30,22
of,18,14
and,21,14
to,14,11
he,7,8
his,11,6
as,6,6
president,4,6
a,7,6


In [27]:
combined_words.sort('count', ascending=False)

word,count,count.1
the,40,39
in,30,22
and,21,14
of,18,14
to,14,11
his,11,6
act,8,3
he,7,8
a,7,6
as,6,6


In [28]:
tf_idf = load_sparse_csr('people_wiki_tf_idf.npz')

['indices', 'indptr', 'shape', 'data']


In [29]:
wiki['tf_idf'] = unpack_dict(tf_idf, map_index_to_word)

In [30]:
wiki.head()

id,URI,name,text,word_count
0,<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'selection': 1, 'carltons': 1, 'being': ..."
1,<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'precise': 1, 'thomas': 1, 'they': 1, ..."
2,<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'just': 1, 'issued': 1, 'mainly': 1, 'nominat ..."
3,<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'englishreading': 1, 'all': 1, 'bauforschu ..."
4,<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'they': 1, 'gangstergenka': 1, ..."
5,<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...,"{'now': 1, 'currently': 1, 'less': 1, 'being' ..."
6,<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...,"{'exclusive': 2, 'producer': 1, 'show' ..."
7,<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...,"{'taxi': 1, 'salon': 1, 'gangs': 1, 'being': 1, ..."
8,<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...,"{'houston': 1, 'frankie': 1, 'labels': 1, ..."
9,<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...,"{'phenomenon': 1, 'deborash': 1, ..."

has_top_words,tf_idf
1,"{'selection': 3.836578553093086, ..."
1,"{'precise': 6.44320060695519, ..."
1,"{'just': 2.7007299687108643, ..."
1,"{'englishreading': 10.293348208665249, ..."
0,"{'they': 1.8993401178193898, ..."
0,"{'now': 1.96695239252401, 'currently': ..."
1,"{'exclusive': 10.455187230695827, ..."
1,"{'taxi': 6.0520214560945025, ..."
1,"{'houston': 3.935505942157149, ..."
1,"{'phenomenon': 5.750053426395245, ..."


In [31]:
# Same brute-force Euclidean nearest-neighbour setup as for the word counts,
# but fitted on the TF-IDF matrix. fit() stays as the last expression so the
# estimator's repr is displayed.
model_tf_idf = NearestNeighbors(metric='euclidean', algorithm='brute')
model_tf_idf.fit(tf_idf)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [32]:
distances, indices = model_tf_idf.kneighbors(tf_idf[35817], n_neighbors=100)

In [33]:
neighbors = sframe.SFrame({'distance':distances.flatten(), 'id':indices.flatten()})
print wiki.join(neighbors, on='id').sort('distance')[['id', 'name', 'distance']]

+-------+-------------------------+---------------+
|   id  |           name          |    distance   |
+-------+-------------------------+---------------+
| 35817 |       Barack Obama      |      0.0      |
|  7914 |      Phil Schiliro      | 106.861013691 |
| 46811 |      Jeff Sessions      | 108.871674216 |
| 44681 |  Jesse Lee (politician) | 109.045697909 |
| 38376 |      Samantha Power     | 109.108106165 |
|  6507 |       Bob Menendez      | 109.781867105 |
| 38714 | Eric Stern (politician) |  109.95778808 |
| 44825 |      James A. Guest     | 110.413888718 |
| 44368 |   Roland Grossenbacher  |  110.4706087  |
| 33417 |      Tulsi Gabbard      | 110.696997999 |
+-------+-------------------------+---------------+
[100 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [34]:
def top_words_tf_idf(name):
    """
    Return a (word, weight) table for the named person's Wikipedia page,
    ordered from the highest TF-IDF weight to the lowest.
    """
    person = wiki[wiki['name'] == name]
    # Explode the tf_idf dictionary into one row per (word, weight) pair.
    weights = person[['tf_idf']].stack('tf_idf', new_column_name=['word','weight'])
    ranked = weights.sort('weight', ascending=False)
    return ranked

obama_tf_idf = top_words_tf_idf('Barack Obama')
print obama_tf_idf

schiliro_tf_idf = top_words_tf_idf('Phil Schiliro')
print schiliro_tf_idf

+-------------+---------------+
|     word    |     weight    |
+-------------+---------------+
|    obama    | 43.2956530721 |
|     act     |  27.678222623 |
|     iraq    |  17.747378588 |
|   control   | 14.8870608452 |
|     law     | 14.7229357618 |
|   ordered   | 14.5333739509 |
|   military  | 13.1159327785 |
|   response  | 12.7843852412 |
| involvement | 12.7843852412 |
|  democratic | 12.4106886973 |
+-------------+---------------+
[273 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
+-----------------+---------------+
|       word      |     weight    |
+-----------------+---------------+
|     schiliro    | 21.9729907785 |
|      staff      | 15.8564416352 |
|  congressional  | 13.5470876563 |
| daschleschiliro | 10.9864953892 |
|      obama      | 9.62125623824 |
|      waxman     | 9.04058524017 |
|    president    | 9.03358661416 |
|     2014from    | 8.68391029623 |
|    

In [35]:
# Five highest-TF-IDF words of Obama's article among the words it shares with
# Schiliro's article. Kept under its own name: binding this set to
# `top_words` would shadow the top_words() helper defined earlier in the
# notebook and break any later call to it.
top_tf_idf_words = set(
    obama_tf_idf.join(schiliro_tf_idf, on='word')
                .rename({'weight': 'obama', 'weight.1': 'schiliro'})
                .sort('obama', ascending=False)['word'][0:5])

def has_top_words(word_count_vector):
    """Return True if every word in ``top_tf_idf_words`` occurs in the article.

    word_count_vector: dict mapping each word of one article to its count.
    """
    # extract the keys of word_count_vector and convert them to a set
    unique_words = set(word_count_vector.keys())
    # True only when top_tf_idf_words is a subset of the article's words
    return top_tf_idf_words.issubset(unique_words)

wiki['has_top_words'] = wiki['word_count'].apply(has_top_words)

# number of articles containing all five words (quiz answer)
wiki['has_top_words'].sum()

14

In [36]:
set(obama_tf_idf.join(schiliro_tf_idf,on='word').rename({'weight':'obama', 'weight.1':'schiliro'}).sort(
    'obama',ascending=False)['word'][0:5])

{'democratic', 'law', 'obama', 'presidential', 'senate'}

In [38]:
row2 = wiki[wiki['name'] == 'Barack Obama']
row1 = wiki[wiki['name'] == 'Joe Biden']

In [39]:
row2['tf_idf'][0]

{'13th': 4.9534091674263925,
 '1961': 3.3207419573634955,
 '1992': 2.278351314316948,
 '1996': 2.135691193468776,
 '1997': 2.1298344522079455,
 '20': 4.88376320446593,
 '2000in': 6.250296940830698,
 '2004': 5.071033082507702,
 '2007': 1.4879730697555795,
 '2008': 1.5093391374786154,
 '2009': 4.693309450812809,
 '2010': 3.185667920243947,
 '2011': 5.107041270312876,
 '2012': 1.7938099524877322,
 '2012obama': 10.986495389225194,
 '2013': 1.9545642372230505,
 '4': 2.437803530749586,
 '44th': 7.0744723837970485,
 '63': 5.22130428644035,
 '8': 2.7572509724892824,
 'a': 0.039334291308082026,
 'act': 27.67822262297991,
 'address': 4.8023464982877115,
 'administration': 3.2952952917023315,
 'affordable': 6.134465125305577,
 'afghanistan': 9.4197037997671,
 'african': 3.582216271187926,
 'after': 3.7773337680052257,
 'against': 2.0079609791418744,
 'american': 3.3821333532750204,
 'americans': 4.761936959949835,
 'and': 0.01564802185902329,
 'arms': 5.030658019760364,
 'as': 0.7630171320744707,

In [40]:
biden_tf_idf = top_words_tf_idf('Joe Biden')

In [41]:
combined = obama_tf_idf.join(biden_tf_idf,on='word')

In [42]:
combined

word,weight,weight.1
obama,43.2956530721,19.2425124765
act,27.678222623,17.2988891394
iraq,17.747378588,4.43684464699
control,14.8870608452,7.44353042259
law,14.7229357618,2.45382262696
military,13.1159327785,3.27898319462
democratic,12.4106886973,9.308016523
us,11.5919426928,9.65995224404
senate,10.1642881797,10.1642881797
nominee,9.43101391473,4.71550695737


In [43]:
# Euclidean norm of the per-word weight difference over the rows of
# `combined` only.
# NOTE(review): `combined` is the result of joining the two tables on 'word',
# so presumably only words present in BOTH articles contribute here; this
# would not equal the full-vector Euclidean distance between the two TF-IDF
# vectors — confirm against graphlab.toolkits.distances.euclidean.
dist = np.linalg.norm(combined['weight']-combined['weight.1'])


In [44]:
dist

37.905330659200246

In [45]:
((combined['weight']-combined['weight.1'])*(combined['weight']-combined['weight.1'])).sum()

1436.814092383306

In [46]:
row1['tf_idf'][0]

{'15': 2.527567105940933,
 '1942': 4.106111307039189,
 '1969': 2.8490995591685433,
 '1970': 2.813766284359722,
 '1972': 2.8070152038663028,
 '1973': 2.808137223619358,
 '1988': 2.4491074905234376,
 '1991': 2.3750835225699753,
 '20': 2.441881602232965,
 '2002': 1.8753125887822302,
 '2008': 3.0186782749572307,
 '2009': 1.5644364836042695,
 '2010': 1.5928339601219734,
 '2011': 1.7023470901042919,
 '2012': 5.381429857463196,
 '3': 2.3430220318986272,
 '47th': 7.248825770941826,
 'a': 0.03371510683549888,
 'ability': 4.213415013569659,
 'about': 3.8530257976751474,
 'act': 17.298889139362444,
 'addressed': 5.425813758209666,
 'advocacy': 4.827400000733261,
 'against': 2.0079609791418744,
 'aimed': 5.1487649420592545,
 'alongside': 3.2975820523603985,
 'also': 0.4627270916162349,
 'alter': 6.453895896071939,
 'american': 1.1273777844250068,
 'an': 0.2982390890818971,
 'and': 0.014157734062925836,
 'as': 0.2543390440248236,
 'assistance': 4.941490075189183,
 'at': 0.43063857330825733,
 'attor

In [47]:
graphlab.toolkits.distances.euclidean(row1['tf_idf'][0], row2['tf_idf'][0])

123.29745600964296

AttributeError: 'module' object has no attribute 'connect'