# In [None]:
import pandas as pd
import numpy as np
import os
os.chdir('D:/Downloads/vivienne/ML/Clustering&Retrieval_UW')
import matplotlib.pyplot as plt
import json
from scipy.sparse import csr_matrix # sparse matrices

# Load the people/Wikipedia dataset from the current working directory.
# Later code looks rows up by their 'name' column.
wiki = pd.read_csv('people_wiki.csv')

#Extract word count vectors
def load_sparse_csr(filename):
    """
    Load a CSR sparse matrix from a NumPy ``.npz`` archive.

    The archive must contain the arrays 'data', 'indices', 'indptr' and
    'shape' (the three CSR component arrays plus the matrix dimensions).

    Parameters
    ----------
    filename : path or file-like object accepted by ``np.load``.

    Returns
    -------
    scipy.sparse.csr_matrix built from the stored components.
    """
    # np.load on an .npz archive returns a lazy NpzFile that keeps the
    # underlying file open; the context manager closes it once every array
    # has been read into memory.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        shape = loader['shape']

    return csr_matrix((data, indices, indptr), shape=shape)

# Load the sparse word-count matrix and the vocabulary mapping.
word_count = load_sparse_csr('people_wiki_word_count.npz')
# JSON text is UTF-8; state the encoding explicitly so the platform's locale
# encoding (e.g. cp1252 on Windows) is never used to decode the vocabulary.
with open('people_wiki_map_index_to_word.json', encoding='utf-8') as file:
    map_index_to_word = json.load(file)

# Find nearest neighbors using word count vectors
from sklearn.neighbors import NearestNeighbors

# Brute-force search under Euclidean distance on the raw word counts.
model = NearestNeighbors(metric='euclidean', algorithm='brute')
model.fit(word_count)

is_obama = wiki['name'] == 'Barack Obama'
print(wiki[is_obama])

# Derive the query row from the lookup above instead of hard-coding its
# position (previously the literal 35817), so the query stays correct if the
# data file changes. Row positions in `wiki` are assumed to line up with the
# rows of `word_count` -- the join below relies on the same assumption.
obama_row = int(np.flatnonzero(is_obama.to_numpy())[0])
# 1st arg: word count vector of the query document
distances, indices = model.kneighbors(word_count[obama_row], n_neighbors=10)

# Both results are flattened to 1-D to build a frame indexed by neighbor row.
neighbors = pd.DataFrame(data={'distance': distances.flatten()},
                         index=indices.flatten())
# Display the names of, and distances to, the 10 nearest neighbors. Rows that
# are not neighbors have no distance after the join and therefore sort last.
print(wiki.join(neighbors).sort_values(by='distance', ascending=True)
      [['name', 'distance']][0:10])

#Interpret the nearest neighbors
def unpack_dict(matrix, map_index_to_word):
    """
    Convert every row of a CSR matrix into a ``{word: value}`` dictionary.

    matrix            -- CSR sparse matrix (uses .data, .indices, .indptr
                         and .shape).
    map_index_to_word -- dict; its keys become the dictionary keys. Keys are
                         ordered by their mapped value and column ``j`` is
                         given the j-th key in that order (assumes the values
                         are the column indices 0..n-1 -- TODO confirm).

    Returns a list with one dict per matrix row, holding only the entries
    stored for that row.
    """
    # vocabulary[j] is the key whose mapped value is the j-th smallest.
    vocabulary = sorted(map_index_to_word, key=map_index_to_word.get)

    values, columns, row_ptr = matrix.data, matrix.indices, matrix.indptr

    rows = []
    for row in range(matrix.shape[0]):
        # Stored entries of this row live in [start, stop) of data/indices.
        start, stop = row_ptr[row], row_ptr[row + 1]
        words = [vocabulary[col] for col in columns[start:stop]]
        rows.append(dict(zip(words, values[start:stop].tolist())))
    return rows

# Store one {word: count} dict per matrix row in a new 'word_count' column
# (assumes the rows of word_count are in the same order as wiki -- TODO confirm).
wiki['word_count'] = unpack_dict(word_count, map_index_to_word)

#A utility function that turns a person's word-count dictionary into a table
def top_words(name):
    """
    Return the word counts of the given person's wikipedia page as a table.

    Looks up the first row of the module-level ``wiki`` frame whose 'name'
    equals ``name`` and returns a DataFrame with columns 'word' and 'count',
    sorted by 'count' in descending order.
    """
    # First matching row's {word: count} dict; raises IndexError if no match.
    counts = wiki.loc[wiki['name'] == name, 'word_count'].iloc[0]
    table = pd.DataFrame(counts.items(), columns=['word', 'count'])
    return table.sort_values(by='count', ascending=False)

obama_words = top_words('Barack Obama')
print(obama_words)

barrio_words = top_words('Francisco Barrio')
print(barrio_words)

# DataFrame.join matches `on` against the *index* of the other frame, so
# Barrio's table is indexed by word first. Both tables have a 'count' column;
# the right-hand one is suffixed to avoid the overlap error. The result has
# columns 'word', 'count' (Obama) and 'count_barrio'. This is a left join:
# words absent from Barrio's page get NaN in 'count_barrio'.
combined_words = obama_words.join(barrio_words.set_index('word'), on='word',
                                  rsuffix='_barrio')