In [2]:
import nltk
import ast
from nltk import wordpunct_tokenize
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds
import pandas as pd
import numpy as np

In [14]:
# construct data
files = ['api_data_usacomp.csv', 'api_data_5+mil.csv', 'api_data_2_5_mil.csv', 'api_data_1_2_mil.csv',
         'api_data_250_500k.csv']#, 'api_data_100_250k.csv']
df = pd.DataFrame()
for file in files:
    data = pd.read_csv(file, names=['City', 'Longitude', 'Latitude', 'Ratings', 
                                         'ObjectNames', 'Description'])
    df = pd.concat([df, data], axis=0)
df = df[df['Description'] != '[]']
df.reset_index(inplace=True)
p = df['Description']

In [15]:
# cosine similarity 
vectorizer = TfidfVectorizer(stop_words = 'english', max_df = .7, min_df = 1)
td_matrix = vectorizer.fit_transform([x for x in df['Description']])
td_matrix_np = td_matrix.toarray()
td_matrix_np = normalize(td_matrix_np)
docs_compressed, s, words_compressed = svds(td_matrix, k=100)
words_compressed = words_compressed.transpose()
docs_compressed_normed = normalize(docs_compressed)
word_to_index = vectorizer.vocabulary_
index_to_word = {i:t for t,i in word_to_index.items()}
print(docs_compressed.shape,s.shape,words_compressed.shape)

(1685, 100) (100,) (227323, 100)


In [27]:
# driver code
# query = input("Type a query: ")
query = "beaches"
query = vectorizer.transform([query]).toarray()
query_vec = normalize(np.dot(query, words_compressed)).squeeze()
def closest_projects_to_query(query_vec_in, k = 5):
    sims = docs_compressed_normed.dot(query_vec_in)
    asort = np.argsort(-sims)[:k+1]
    return [(i, df['City'][i], sims[i]) for i in asort[1:]]

for i, proj, sim in closest_projects_to_query(query_vec):
    objects_str = df['ObjectNames'][i]
    descr_str = df['Description'][i]
    ratings_str = df['Ratings'][i]
    objects = ast.literal_eval(objects_str)
    descriptions = ast.literal_eval(descr_str)
    description_sims = [normalize(np.dot(vectorizer.transform([i]).toarray(), words_compressed)).squeeze() 
                        for i in descriptions]
    description_sims = [i.dot(query_vec) for i in description_sims]
    idx = np.argpartition(description_sims, max(-len(description_sims),-3))[-3:]
    ratings = ast.literal_eval(ratings_str)
    ratings = [int(ratings[i][0]) for i in range(len(ratings))]
    print(ratings)
    print("{} {:.4f},\n {}".format(proj, sim, [objects[i] for i in idx]))

[3, 3]
Calicut 0.7330,
 ['EMS Stadium', 'Kozhikode Beach']
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Quilon 0.6660,
 ['Chinnakada Clock Tower', 'Mahatma Gandhi Park', 'Kollam Beach']
[3, 3, 3, 3, 3]
Hollywood 0.6468,
 ['Greynolds Park', 'John U. Lloyd Beach State Park', 'Fulford-by-the-Sea Monument']
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Miami 0.6406,
 ['New World Center', 'South Pointe Park', 'South Beach']
[3, 3, 3, 3, 3, 3]
Huntington Beach 0.6191,
 ['Bolsa Chica Ecological Reserve', 'Bolsa Chica State Beach', 'Bolsa Chica State Beach']
