In [23]:
import requests
import ast
import nltk
from nltk import wordpunct_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

In [4]:
# if stopwords fails make sure it's been downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ajita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def tokenize_str(string, tok_method, stemmer, stops):
#     print('haha1')
    toks = tok_method(string)
    toks = [w for w in toks if w not in stops]
    toks = [stemmer.stem(w) for w in toks]
    toks = [w.lower() for w in toks if w.isalpha()]
    return toks

In [6]:
def city_word_indices(df, tok_method, stemmer, stops):
    desc = df['Description']
    all_toks = set()
    n_rows = len(desc)
    city_word_dic = {}
    for i in range(n_rows):
        toks = tokenize_str(desc[i], wordpunct_tokenize, stemmer, stops)
        tokset = toks
        all_toks = all_toks.union(tokset)
        city_word_dic[df['City'][i]] = toks
    all_toks = list(all_toks)
    all_toks.sort()
    cities = df['City'].tolist()
    cities.sort()
#     city_index = dict(zip(cities, list(range(n_rows))))
#     print(city_index)
    city_rev_index = dict(zip(list(range(n_rows)), cities))
#     print(city_rev_index)
    city_index = dict([(value, key) for key, value in city_rev_index.items()])
    word_index = dict(zip(all_toks, list(range(len(all_toks)))))
#     print(city_index)
    return city_index, city_rev_index, word_index, city_word_dic

In [44]:
# construct term-doc matrix
def td_matrix(df, city_index, city_rev_index, word_index, city_word_dic):
    num_cities=len(city_rev_index)
    num_words=len(word_index)
    td_matrix = np.zeros(shape=(num_cities, num_words), dtype=np.float16)
    cities = df['City'].tolist()
    for city in cities:
        for word in city_word_dic[city]:
#             print(city, word)
            td_matrix[city_index[city]][word_index[word]] += 1
    td_matrix = ((td_matrix.T+1) / np.linalg.norm(td_matrix+1, axis=1)).T
    return td_matrix

In [45]:
def process_query(query, td_matrix, city_rev_index, tok_method, stemmer, stops, num_results=5):
    query = tokenize_str(query, tok_method, stemmer, stops)
    qvec = np.zeros(td_matrix.shape[1], dtype=np.float16)
    for word in query:
        if word in word_index:
            qvec[word_index[word]] += 1
    qvec = (qvec)/ np.linalg.norm(qvec+1)
    sim = td_mat @ qvec
    top_k = (-sim).argsort()[:num_results]
    top_k = [city_rev_index[k] for k in top_k]
    return top_k 

In [58]:
# construct data
# df = pd.read_csv('api_data_1_2_mil.csv', names=['City', 'Longitude', 'Latitude', 'Ratings', 
#                                          'ObjectNames', 'Description'])
files = ['api_data_usacomp.csv', 'api_data_5+mil.csv']#, 'api_data_2_5_mil.csv', 'api_data_1_2_mil.csv']
#         'api_data_250_500k.csv', 'api_data_100_250k.csv']
df = pd.DataFrame()
for file in files:
    data = pd.read_csv(file, names=['City', 'Longitude', 'Latitude', 'Ratings', 
                                         'ObjectNames', 'Description'])
    df = pd.concat([df, data], axis=0)

df = df[df['Description'] != '[]']
df.reset_index(inplace=True)
stops = set(stopwords.words('english'))
ps = PorterStemmer()

city_index, city_rev_index, word_index, city_word_dic = city_word_indices(df, wordpunct_tokenize, ps, stops)
td_mat = td_matrix(df, city_index, city_rev_index, word_index, city_word_dic)

In [59]:
# driver code
query = input("Type a query: ")
top_5 = process_query(query, td_mat, city_rev_index, wordpunct_tokenize, ps, stops)
print("Your ranked destinations:")
for i in top_5:
    objects_str = df[df['City'] == i].reset_index()['ObjectNames'][0]
    descr_str = df[df['City'] == i].reset_index()['Description'][0]
    ratings_str = df[df['City'] == i].reset_index()['Ratings'][0]
    objects = ast.literal_eval(objects_str)
    descriptions = ast.literal_eval(descr_str)
    ratings = ast.literal_eval(ratings_str)
    print(i, '- Top Attractions:', objects[np.argmax(ratings)])

Type a query: temples
Your ranked destinations:
Pune - Top Attractions: Shaniwar Wada Amphitheatre
Suzhou - Top Attractions: Temple of Confucius
Guangyuan - Top Attractions: Huangze Temple
Foshan - Top Attractions: Foshan Ancestral Temple
Amman - Top Attractions: The Duke's Diwan
