In [58]:
import requests
import json
import nltk
from nltk import wordpunct_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

In [59]:
# Make sure the NLTK stopwords corpus is available. Look for it locally first
# so that re-running this cell does not contact the download server (or print
# the download log) when the corpus is already installed.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ajita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [60]:
def tokenize_str(string, tok_method, stemmer, stops):
    """Turn raw text into a list of lower-cased, stemmed, alphabetic tokens.

    Parameters
    ----------
    string : str
        Text to tokenize.
    tok_method : callable
        Called as ``tok_method(string)``; must return an iterable of tokens.
    stemmer : object
        Anything exposing ``stem(word) -> str``.
    stops : container of str
        Words to discard. Membership is tested against the lower-cased token.

    Returns
    -------
    list of str
        Tokens in their original order, with stopwords and any token that is
        not purely alphabetic after stemming removed.
    """
    # Lower-case *before* the stopword test: otherwise capitalised stopwords
    # such as "The" slip past a lower-case stopword list.
    lowered = (tok.lower() for tok in tok_method(string))
    kept = (tok for tok in lowered if tok not in stops)
    stems = (stemmer.stem(tok) for tok in kept)
    # The final lower() keeps the output lower-case even for stemmers that do
    # not normalise case themselves.
    return [stem.lower() for stem in stems if stem.isalpha()]

In [61]:
def city_word_indices(df, tok_method, stemmer, stops):
    """Build the city and vocabulary lookup tables for a corpus of descriptions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'City'`` and a ``'Description'`` column.
    tok_method, stemmer, stops
        Forwarded unchanged to ``tokenize_str`` for every description.

    Returns
    -------
    city_index : dict
        Maps each distinct city to its position in the sorted city list.
    city_rev_index : dict
        Inverse of ``city_index`` (position -> city).
    word_index : dict
        Maps each distinct token to its position in the sorted vocabulary.
    """
    # Iterate over the column values rather than indexing by label, so frames
    # whose index is not 0..n-1 (e.g. after filtering) are handled correctly.
    vocabulary = set()
    for description in df['Description']:
        vocabulary.update(tokenize_str(description, tok_method, stemmer, stops))

    # Distinct cities only: a repeated city must not get two positions,
    # otherwise the forward and reverse maps disagree.
    cities = sorted(set(df['City']))

    city_index = {city: pos for pos, city in enumerate(cities)}
    city_rev_index = dict(enumerate(cities))
    word_index = {word: pos for pos, word in enumerate(sorted(vocabulary))}

    return city_index, city_rev_index, word_index

In [62]:
# construct term-doc matrix
def td_matrix(df, city_index, city_rev_index, word_index,
              num_cities=None, num_words=None, city_word_dic=None,
              tok_method=None, stemmer=None, stops=None):
    """Build a row-normalised city-by-term count matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'City'`` and ``'Description'`` columns. Only read when
        ``city_word_dic`` is not supplied.
    city_index : dict
        City -> row number.
    city_rev_index : dict
        Not used here; kept so existing calls keep working.
    word_index : dict
        Token -> column number.
    num_cities, num_words : int, optional
        Matrix dimensions. Default to ``len(city_index)`` and
        ``len(word_index)``, resolved at call time.
    city_word_dic : dict, optional
        City -> list of tokens. When omitted it is built by running
        ``tokenize_str`` over every description in ``df``.
    tok_method, stemmer, stops : optional
        Tokenizer settings for that fallback. Default to
        ``wordpunct_tokenize``, ``PorterStemmer()`` and the NLTK English
        stopword list.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_cities, num_words)``; every non-empty row has
        unit Euclidean length, rows without any known token stay all-zero.
    """
    # Resolve sizes here, not in the signature: default expressions in a
    # signature are evaluated once at definition time against whatever
    # globals happen to exist then.
    if num_cities is None:
        num_cities = len(city_index)
    if num_words is None:
        num_words = len(word_index)

    if city_word_dic is None:
        if tok_method is None:
            tok_method = wordpunct_tokenize
        if stemmer is None:
            stemmer = PorterStemmer()
        if stops is None:
            stops = set(stopwords.words('english'))
        city_word_dic = {
            city: tokenize_str(text, tok_method, stemmer, stops)
            for city, text in zip(df['City'], df['Description'])
        }

    counts = np.zeros(shape=(num_cities, num_words))
    for city, words in city_word_dic.items():
        row = city_index.get(city)
        if row is None:
            continue
        for word in words:
            col = word_index.get(word)
            # Tokens outside the vocabulary have no column; skip them.
            if col is not None:
                counts[row, col] += 1

    # Guard against empty rows: dividing by a zero norm would fill the row
    # with NaN and poison every later similarity score.
    norms = np.linalg.norm(counts, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return counts / norms

In [63]:
def process_query(query, td_matrix, city_rev_index, tok_method, stemmer, stops,
                  num_results=2, word_index=None):
    """Rank cities by cosine similarity between a query and their descriptions.

    Parameters
    ----------
    query : str
        Free-text query.
    td_matrix : array-like
        Row-normalised city-by-term matrix, shape (n_cities, n_words).
    city_rev_index : dict
        Row number -> city name.
    tok_method, stemmer, stops
        Forwarded unchanged to ``tokenize_str`` for the query text.
    num_results : int, optional
        Maximum number of cities to return.
    word_index : dict, optional
        Token -> column number. When omitted, a module-level ``word_index``
        is looked up so that older call sites keep working.

    Returns
    -------
    list
        Up to ``num_results`` city names, best match first. Empty when no
        query token is in the vocabulary.

    Raises
    ------
    ValueError
        If ``word_index`` is neither passed nor defined at module level.
    """
    if word_index is None:
        word_index = globals().get('word_index')
        if word_index is None:
            raise ValueError(
                "word_index was not supplied and no module-level word_index exists"
            )

    # Use the matrix that was passed in, and take the vocabulary size from it,
    # instead of reaching for notebook globals.
    td_matrix = np.asarray(td_matrix)
    qvec = np.zeros(td_matrix.shape[1])
    for term in tokenize_str(query, tok_method, stemmer, stops):
        col = word_index.get(term)
        if col is not None:
            qvec[col] += 1

    norm = np.linalg.norm(qvec)
    if norm == 0:
        # No known term in the query: normalising would divide by zero and
        # the NaN similarities would produce an arbitrary ranking.
        return []

    sim = td_matrix @ (qvec / norm)
    # Stable sort so that equally-scored cities come back in row order.
    best_rows = np.argsort(-sim, kind='stable')[:num_results]
    return [city_rev_index[int(row)] for row in best_rows]

In [73]:
# Inputs: the user's query, the city descriptions, and the NLP resources
# (English stopword set and Porter stemmer) shared by every step below.
query = input("Type a query: ")
df = pd.read_csv('api_data.csv', names=['City', 'Description'])
stops = set(stopwords.words('english'))
ps = PorterStemmer()

# Pipeline: lookup tables -> term-document matrix -> ranked cities.
city_index, city_rev_index, word_index = city_word_indices(
    df, tok_method=wordpunct_tokenize, stemmer=ps, stops=stops
)
td_mat = td_matrix(df, city_index, city_rev_index, word_index)
top_2 = process_query(
    query, td_mat, city_rev_index,
    tok_method=wordpunct_tokenize, stemmer=ps, stops=stops,
)

print("Your ranked destinations:")
for destination in top_2:
    print(destination)

Type a query: coast
Your ranked destinations:
San Jose
Miami
