# Format data for app

In [1]:
%matplotlib inline
import rauth
import time
import os 
import os.path 
import pickle as pkl

import seaborn as sns
from seaborn import heatmap
sns.set(style="white", color_codes=True)
import matplotlib.pyplot as plt

import bokeh.plotting as plotting
from bokeh.plotting import figure
from bokeh.charts import Bar, Scatter
from bokeh.charts.attributes import cat,CatAttr
from bokeh.io import gridplot

from bokeh.io import output_file, show,output_notebook,save
from bokeh.models import GMapPlot, GMapOptions, ColumnDataSource, Circle,Patch,Text, Range1d,DataRange1d, PanTool, WheelZoomTool, BoxSelectTool,HoverTool,ResetTool

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from scipy.spatial import ConvexHull

from IPython.display import clear_output

In [2]:
def get_search_parameters(search_term,lat,lng,offset,num_search,search_mode):
    #See the Yelp API for more details
    params = {}
    params["term"] = search_term
    params["sort"]=str(search_mode)
    params["offset"] = str(offset)
    params["ll"] = "{},{}".format(str(lat),str(lng))
    params["radius_filter"] = "20000"
    params["limit"] = str(num_search)

    return params

def get_results(params):

    #Obtain these from Yelp's manage access page
    consumer_key = "YOUR_CONSUMER_KEY"
    consumer_secret = "YOUR_CONSUMER_SECRET"
    token = "YOUR_TOKEN"
    token_secret = "YOUR_TOKEN_SECRET"

    session = rauth.OAuth1Session(
    consumer_key = consumer_key
    ,consumer_secret = consumer_secret
    ,access_token = token
    ,access_token_secret = token_secret)

    request = session.get("http://api.yelp.com/v2/search",params=params)

    #Transforms the JSON API response into a Python dictionary
    data = request.json()
    session.close()

    return data

def run_yelp_search(search_term,fname,locations,max_search,search_mode):
    
    try:
        os.stat(fname)
    except:
        os.mkdir(fname) 
    
    api_calls = []
    offset=0
    num_search=20

    for lat,lng in locations:
        loc_fold=os.path.join(fname,'loc'+str((int(lat*100),int(lng*100))))
        try:
            os.stat(loc_fold)
        except:
            os.mkdir(loc_fold) 
            
        for its in range(max_search/num_search):
            file_name=os.path.join(loc_fold,fname+str(its)+'.pkl')
            if not os.path.exists(file_name):
                params = get_search_parameters(search_term,lat,lng,offset,num_search,search_mode)
                results=get_results(params)
                api_calls.append(results)
                offset=offset+num_search
                #Be a good internet citizen and rate-limit yourself
                print file_name
                pkl.dump(results,open(file_name,'wb'))  
                time.sleep(2.0)
                clear_output()
            else:
                results=pkl.load(open(file_name,'rb'))
                api_calls.append(results)
    return api_calls

In [3]:
# San Diego: Downtown, La Jolla, National City, Miramesa, Kearny Mesa, North Park, Pacific Beach
# Philadelphia: UPenn, City Hall, South Philadelphia, Temple University, Old City, West Philadelphia, Northern Liberties
# New York: Holland Tunnel (not sure), Hell's Kitchen, Queens, Brooklyn, Flatiron, Chelsea, Gramercy, Time Square, Murray Hill
# Chicago: Loop, UChicago, Archer Heights, Belmont, Pilsen, uptown, west loop
# San Francisco: Nob Hill, Noe Valley, Inner Sunset, Inner Richmond, SOMA, Pacific Heights, The Mission

# Lists of lattitudes and longitudes. I've only includes 2 for each city here
city_locs={'San Diego':[(32.7157,-117.1611),(32.8328,-117.2713)],
          'Philadelphia':[(39.9522,-75.1954),(39.9526,-75.1652)],
          'New York':[(40.72512,-74.0113),(40.7612,-73.9887)],
          'Chicago': [(41.8781,-87.6298),(41.7878,-87.6473)],
          'San Francisco':[(37.7823,-122.4219),(37.7504,-122.4304)]}

city_abbreb={'San Diego':'sd',
             'Philadelphia':'ph',
             'New York':'ny',
             'Chicago':'ch',
             'San Francisco':'sf'}

city_zoom={'San Diego':(32.7157,-117.1611),
           'Philadelphia':(39.9526,-75.1652),
           'New York':(40.7612,-73.9887),
          'Chicago':(41.8781,-87.6298),
           'San Francisco':(37.7823,-122.4219)}

cities=city_zoom.keys()

In [4]:
foldname='processed_data'
if not os.path.isdir(foldname):
    os.mkdir(foldname)

# Load data

After I scraped all the data, I stored it all into a dataframe. But the category labels needed some cleaning. I think there's room for improvement here. Apart from pizza, I only used information from the first category label Yelp provided.

In [5]:
all_calls=[]
all_city=[]
for city in city_locs.keys():
    search_term='food'
    max_search=1000
    locations=city_locs[city]
    
    # search be best match
    fname=city_abbreb[city]+'_food_search'
    
    curr_calls=run_yelp_search(search_term,fname,locations,max_search,0)

    # search by distance
    search_term='food'
    fname=city_abbreb[city]+'_food_search_dist'
    max_search=1000
    curr_calls_dist=run_yelp_search(search_term,fname,locations,max_search,1)
    all_calls=all_calls+curr_calls+curr_calls_dist
    all_city=all_city+([city]*len(curr_calls+curr_calls_dist))

In [6]:
def api_call_to_df(food_calls,all_city):
    all_name=[]
    all_rating=[]
    all_lat=[]
    all_lng=[]
    all_cat=[]
    all_cities=[]    
    all_num_rev=[]
    all_price=[]
    for call,city in zip(food_calls,all_city):

        try:
            business_data=call['businesses']
            all_lat=all_lat+[business_data[i]['location']['coordinate']['latitude'] for i in range(len(business_data))]
            all_lng=all_lng+[business_data[i]['location']['coordinate']['longitude'] for i in range(len(business_data))]
            all_name=all_name+[business_data[i]['name'] for i in range(len(business_data))]
            
            # trying to split up pizza
            curr_cat=[]
            for i in range(len(business_data)):
                if (business_data[i]['categories'][0][1]=='pizza'):
                    if len(business_data[i]['categories'])>1:
                        curr_cat.append('pizza_plus')
                    else:
                        curr_cat.append('pizza')
                else:
                    curr_cat.append(business_data[i]['categories'][0][1])
                    
            all_cat=all_cat+curr_cat
                
#            all_cat=all_cat+[business_data[i]['categories'][0][1] for i in range(len(business_data))]
            all_rating=all_rating+[business_data[i]['rating'] for i in range(len(business_data))]
            all_num_rev=all_num_rev+[business_data[i]['review_count'] for i in range(len(business_data))]  
            all_price=all_price+[business_data[i]['review_count'] for i in range(len(business_data))]              
            all_cities=all_cities+([city]*len(business_data))
            
            
        except:
            pass
        
    return pd.DataFrame({'Name':all_name,'Cat':all_cat, 'Rating':all_rating,
                         'Lat':all_lat,'Lng':all_lng,'City':all_cities,'Num_Rev':all_num_rev})

In [7]:
def clean_dataframe(rest_df):

    rest_df.loc[rest_df.Cat=='beer_and_wine','Cat']='beer'
    rest_df.loc[rest_df.Cat=='bars','Cat']='beer'
    rest_df.loc[rest_df.Cat=='beergardens','Cat']='beer'
    rest_df.loc[rest_df.Cat=='breweries','Cat']='beer'
    rest_df.loc[rest_df.Cat=='gastropubs','Cat']='beer'
    rest_df.loc[rest_df.Cat=='divebars','Cat']='beer'  
    rest_df.loc[rest_df.Cat=='pubs','Cat']='beer'
    rest_df.loc[rest_df.Cat=='sportsbars','Cat']='beer' 
    rest_df.loc[rest_df.Cat=='beerbar','Cat']='beer'       
    rest_df.loc[rest_df.Cat=='wine','Cat']='wine'
    rest_df.loc[rest_df.Cat=='wine_bars','Cat']='wine'
    rest_df.loc[rest_df.Cat=='champagne_bars','Cat']='wine'
    rest_df.loc[rest_df.Cat=='cocktailbars','Cat']='cocktails'
    rest_df.loc[rest_df.Cat=='lounges','Cat']='cocktails'

    # Coffee sort of places
    rest_df.loc[rest_df.Cat=='tea','Cat']='cafes'
    rest_df.loc[rest_df.Cat=='coffee','Cat']='cafes'
    rest_df.loc[rest_df.Cat=='bubbletea','Cat']='cafes'    

    # hotdog
    rest_df.loc[rest_df.Cat=='hotdogs','Cat']='hotdog'

    # healthy things...don't yell at me...
    rest_df.loc[rest_df.Cat=='vegan','Cat']='vegetarian'
    rest_df.loc[rest_df.Cat=='juicebars','Cat']='vegetarian'
    rest_df.loc[rest_df.Cat=='organic_stores','Cat']='vegetarian'
    rest_df.loc[rest_df.Cat=='gluten_free','Cat']='vegetarian'  
    rest_df.loc[rest_df.Cat=='raw_food','Cat']='vegetarian'  
    rest_df.loc[rest_df.Cat=='healthmarkets','Cat']='vegetarian'  
    rest_df.loc[rest_df.Cat=='diyfood','Cat']='vegetarian'      

    # cupcakes->bakeries
    rest_df.loc[rest_df.Cat=='cupcakes','Cat']='bakeries'

    # group tapas
    rest_df.loc[rest_df.Cat=='tapasmallplates','Cat']='tapas'

    # ice cream->desserts
    rest_df.loc[rest_df.Cat=='icecream','Cat']='desserts'
    rest_df.loc[rest_df.Cat=='creperies','Cat']='desserts'
    rest_df.loc[rest_df.Cat=='donuts','Cat']='desserts'
    rest_df.loc[rest_df.Cat=='gelato','Cat']='desserts'
    rest_df.loc[rest_df.Cat=='chocolate','Cat']='desserts'    

    # fishnchips->british
    rest_df.loc[rest_df.Cat=='fishnchips','Cat']='british'
    rest_df.loc[rest_df.Cat=='cantonese','Cat']='chinese'  
    rest_df.loc[rest_df.Cat=='shanghainese','Cat']='chinese' 
    rest_df.loc[rest_df.Cat=='dimsum','Cat']='chinese'  
    
    rest_df.loc[rest_df.Cat=='falafel','Cat']='mideastern'
    rest_df.loc[rest_df.Cat=='halal','Cat']='mideastern'   
    rest_df.loc[rest_df.Cat=='chickenshop','Cat']='soulfood' 
    rest_df.loc[rest_df.Cat=='steak','Cat']='newamerican' 
    rest_df.loc[rest_df.Cat=='southern','Cat']='comfortfood'
    rest_df.loc[rest_df.Cat=='fondue','Cat']='newamerican' 
    rest_df.loc[rest_df.Cat=='empanadas','Cat']='argentine' 
    rest_df.loc[rest_df.Cat=='brasseries','Cat']='tradamerican'   
    rest_df.loc[rest_df.Cat=='teppanyaki','Cat']='japanese'    
    rest_df.loc[rest_df.Cat=='sushi','Cat']='japanese' 
    rest_df.loc[rest_df.Cat=='hotpot','Cat']='chinese'           

    # Non-descriptive labels
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='farmersmarket'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='venues'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='convenience'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='restaurants'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='fooddeliveryservices'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='food_court'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='seafoodmarkets'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='catering'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='soup'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='foodtrucks'])    
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='gourmet'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='meats'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='grocery'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='cheese'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='streetvendors'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='movietheaters'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='foodstands'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='buffets'])   
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='banks']) 
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='galleries'])  
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='butcher']) 
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='musicvenues'])  
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='cafeteria'])      
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='tours'])   
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='jazzandblues'])       
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='ethnicmarkets'])   
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='arcades'])       
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='food']) 
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='hotels'])    
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='delis'])
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='hookah_bars'])  
    rest_df=rest_df.drop(rest_df.index[rest_df.Cat=='sandwiches'])  # only used in sf for some reason
    
    return rest_df

In [8]:
all_rest_df=api_call_to_df(all_calls,all_city)
all_rest_df=clean_dataframe(all_rest_df)
all_rest_df=all_rest_df.drop_duplicates(subset=['Name','Lng','Lat']).reset_index()

# Save all data

In [9]:
all_rest_df.Name=all_rest_df.Name.apply(lambda x:x.encode("utf-8")) # accent encoding issue
all_rest_df.to_csv(os.path.join(foldname,'city_all.csv'))

# KL divergence Data

First I'm going to count how many restaurants of each type there are in each city

In [10]:
def get_cat_distr(rest_df):
    rest_df_count=rest_df.groupby('Cat').size().sort_values(ascending=False).reset_index()
    rest_df_rating_mean=rest_df.loc[:,['Cat','Rating']].groupby('Cat').mean().reset_index()
    
    rest_df2=pd.merge(rest_df_count,rest_df_rating_mean,on='Cat')
    rest_df2.columns=['Category','Count','Rating']
    return rest_df2

In [11]:
all_cat_count=all_rest_df.groupby(['City','Cat']).size().reset_index()
all_cat_count.columns=['City','Category','Count']
all_cat_rating=all_rest_df.loc[:,['City','Cat','Rating']].groupby(['City','Cat']).mean().reset_index()
all_cat_rating.columns=['City','Category','Rating']
all_cat_count=pd.merge(all_cat_count,all_cat_rating,on=['City','Category'])

And now I'm going to add entries for restaurant categories that aren't in certain cities

In [12]:
categories=np.unique(all_rest_df.Cat)
all_city_count_df = pd.DataFrame(columns=['City','Category','Count','Rating'])
for key in cities:
    temp=all_cat_count.loc[all_cat_count.City==key,:]
    for cat in categories:
        if sum(np.logical_and(temp.Category==cat,temp.City==key))==0:
            temp=temp.append(pd.DataFrame({'City':[key],'Category':[cat],'Count':[0]}))
    all_city_count_df=all_city_count_df.append(temp)

all_city_count_df.Count=all_city_count_df.Count+1 # Prevent divide by zeros

And finally we can turn this into KL divergence

In [13]:
city_pair_kl = pd.DataFrame(columns=['City_a','City_b','Category','KL Divergence','Rating Difference'])
for a_c in cities:
    for b_c in cities:
        if a_c!=b_c:
            # Calculate proportion of restaurant distribution
            a_subset=all_city_count_df.loc[all_city_count_df.City==a_c,:]
            a_subset.loc[:,'a_Prop']=a_subset.loc[:,'Count']/sum(a_subset.loc[:,'Count'])
            b_subset=all_city_count_df.loc[all_city_count_df.City==b_c,:]            
            b_subset.loc[:,'b_Prop']=b_subset.loc[:,'Count']/sum(b_subset.loc[:,'Count'])

            ab_merge_df=pd.merge(a_subset,b_subset,on=['Category'],suffixes=('_a','_b'))
            ab_merge_df.loc[:,'KL Divergence']=ab_merge_df.a_Prop*np.log10(ab_merge_df.loc[:,'a_Prop']/ab_merge_df.loc[:,'b_Prop'])
            ab_merge_df.loc[:,'Rating Difference']=ab_merge_df.Rating_a-ab_merge_df.Rating_b

            city_pair_kl=city_pair_kl.append(ab_merge_df.loc[:,['City_a','City_b','Category', 'KL Divergence','Rating Difference']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [14]:
city_pair_kl.to_csv(os.path.join(foldname,'city_kl.csv'))

# Making figures!

And now let's do some analyses!

In [15]:
foldname='example_figs'
if not os.path.isdir(foldname):
    os.mkdir(foldname)

## City maps

Where were our restaurants located?

In [16]:
def plot_map_distr(rest_df,city,coords,zoom):
	map_options = GMapOptions(lat=coords[0], lng=coords[1], map_type="roadmap", zoom=zoom)
	plot = GMapPlot(x_range=DataRange1d(), y_range=DataRange1d(), map_options=map_options, title="San Diego")


	rest_source = ColumnDataSource(data=dict(lat=rest_df.Lat,lon=rest_df.Lng,Name=rest_df.Name,Rating=rest_df.Rating))
	rest_circle = Circle(x="lon", y="lat", size=5, fill_color='dodgerblue', fill_alpha=0.8, line_color='lightslategrey')
	plot.add_glyph(rest_source, rest_circle)
	hover = HoverTool( tooltips=[("Name", "@Name"),('Rating','@Rating')])
		
	plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool(),ResetTool(),hover)
	return plot

In [17]:
city='San Francisco'
plot=plot_map_distr(all_rest_df,city,city_zoom[city],zoom=11)
fname=city+'_map'+'.html'
full_name=os.path.join(foldname,fname)
output_file(full_name)
show(plot)

## City distributions

What was the distribution of restaurant categories within each city?

In [18]:
def plot_category_distr(rest_df,city):
    rest_df=rest_df.loc[rest_df.City==city,:]
    rest_df=get_cat_distr(rest_df)

    plot=Bar(rest_df,label=CatAttr(columns=['Category'], sort=False),values='Count',plot_width=1200, plot_height=500)
    return plot

In [19]:
city='San Francisco'
plot=plot_category_distr(all_rest_df,city)
fname=city+'_distribution'+'.html'
full_name=os.path.join(foldname,fname)
output_file(full_name)
show(plot)

# Kullback-Leibler convergence-Aggregate

How different are the other cities?

In [20]:
def plot_similar_city(kl_df,visitor):
	mean_visited=kl_df.loc[kl_df.loc[:,'City_b']==visitor,['City_a','KL Divergence']].groupby('City_a').sum().reset_index()
	mean_visited=mean_visited.sort_values(by='KL Divergence',ascending=False)
	mean_visited.columns=['City','KL Divergence']
	plot=Bar(mean_visited,label=CatAttr(columns=['City'], sort=False),values='KL Divergence',plot_width=800, plot_height=300)
	
	return plot

In [21]:
visitor='San Diego'
plot=plot_similar_city(city_pair_kl,visitor)
fname=city+'_kl_aggr'+'.html'
full_name=os.path.join(foldname,fname)
output_file(full_name)
show(plot)

# Kullback-Leibler convergence-By category

What categories do restaurants vary in?

In [22]:
def plot_frequent_city(kl_df,visitor,visited):
    sel_city_pair=np.logical_and(kl_df.City_a==visited, kl_df.City_b==visitor)
    curr_city=kl_df.loc[sel_city_pair,:]
    curr_city=curr_city.sort_values(by='KL Divergence',ascending=False)
    plot=Bar(curr_city,label=CatAttr(columns=['Category'], sort=False),values='KL Divergence',plot_width=1200, plot_height=600,title='From '+visitor+' To '+visited)
    return plot

In [23]:
visitor='San Diego'
visited='New York'
plot=plot_frequent_city(city_pair_kl,visitor,visited)
fname=visitor+'_'+visited+'_kl_ind'+'.html'
full_name=os.path.join(foldname,fname)
output_file(full_name)
show(plot)

# Novel restaurant map

Where should you go if you want novel foods?

In [24]:
two_pt_five_in_degrees=(float(1)/70)*2.5
two_in_degrees=(float(1)/70)*2
one_pt_five_in_degrees=(float(1)/70)*1.5
mile_in_degrees=(float(1)/70)
threequarter_in_degrees=(float(1)/70)/(float(4)/3)
twothird_in_degrees=(float(1)/70)/1.5
half_mile_in_degrees=(float(1)/70)/2
quarter_mile_in_degrees=(float(1)/70)/4

eps_range={'San Francisco':{'gen':quarter_mile_in_degrees,'clus':threequarter_in_degrees},'San Diego':{'gen':twothird_in_degrees,'clus':mile_in_degrees},'Philadelphia':{'gen':twothird_in_degrees,'clus':two_pt_five_in_degrees},'New York':{'gen':half_mile_in_degrees,'clus':mile_in_degrees},'Chicago':{'gen':quarter_mile_in_degrees,'clus':mile_in_degrees}}

In [25]:
def cluster_locations(df,eps,min_s):
    km_clus=DBSCAN(eps=eps,min_samples=min_s)
    labels=km_clus.fit_predict(df.loc[:,['Lat','Lng']])
    return labels
    
def rest_rank(df,cat_name):
    df=df.sort_values(by=['Num_Rev','Rating'],ascending=False)
    p = figure(toolbar_location=None)
    p.set(x_range=Range1d(-.2, 4), y_range=Range1d(-11,2))

    y=0

    # Category name
    p.text(0, y, text=[cat_name],text_color="darkred", text_align="left", text_font_size=str(32)+"pt",text_font='impact')

    t_size=15
    count=0
    for i,row in df.iterrows():
        y=y-2
        name=row.loc['Name']+', '+str(row.loc['Rating'])
        p.text(0, y, text=[name],text_color="indianred", text_align="left", text_font_size=str(t_size)+"pt",text_font='Arial Black')
        name2='# Reviews: '+str(row.loc['Num_Rev'])
        p.text(0, y-.5, text=[name2],text_color="indianred", text_align="left", text_font_size="14pt",text_font='Arial Black')

    # 		t_size=t_size-2
        count=count+1
        if count>4:
            break
    p.axis.visible=None
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    return p


def plot_map_clusters(city_visited,city_visitor,city_pair_kl,all_rest_df,eps_gen,eps_clus,num_freq,coords):
    curr_city=all_rest_df.loc[all_rest_df.loc[:,'City']==city_visited,:]

    def match_cat(row):
        if row.Cat in novel_cats2:
            return True
        else:
            return False

    # Center map on restaurants in category
    i=num_freq
    sel_ind_cat=np.logical_and(city_pair_kl.loc[:,'City_a']==city_visited, city_pair_kl.loc[:,'City_b']==city_visitor)

    novel_cats=city_pair_kl.loc[sel_ind_cat,:].sort_values(by='KL Divergence',ascending=False).iloc[i,:]
    novel_cats2=novel_cats.loc[:,'Category'].tolist()#[novel_cats.Category]
    cat_match=curr_city.apply(match_cat,axis=1)
    subset_rest_df=curr_city.loc[cat_match,:]

    clus_lat=subset_rest_df.loc[:,'Lat'].mean()
    clus_lng=subset_rest_df.loc[:,'Lng'].mean()

    # actual map
    map_options = GMapOptions(lat=clus_lat, lng=clus_lng, map_type="roadmap", zoom=11)
    plot = GMapPlot(x_range=DataRange1d(), y_range=DataRange1d(), map_options=map_options)

    # Identify novel restaurants so we can remove them from the general clustering
    sel_ind_cat=np.logical_and(city_pair_kl.loc[:,'City_a']==city_visited, city_pair_kl.loc[:,'City_b']==city_visitor)

    novel_cats=city_pair_kl.loc[sel_ind_cat,:].sort_values(by='KL Divergence',ascending=False).iloc[i,:]
    novel_cats2=novel_cats.loc[:,'Category'].tolist()

    cat_match=curr_city.apply(match_cat,axis=1)
    subset_rest_df=curr_city.loc[np.logical_not(cat_match),:]

    # Identify clusters
    subset_rest_df.loc[:,'Clus']=cluster_locations(subset_rest_df,eps=eps_gen,min_s=10)


    # Turn general clusters into convex hull and add patches to map
    curr_clus=np.unique(subset_rest_df.loc[:,'Clus'])
    curr_clus=curr_clus[curr_clus>=0]
    for cp in curr_clus:
        subset_rest_df2=subset_rest_df.loc[subset_rest_df.loc[:,'Clus']==cp,:]
        try:
            hull_points=ConvexHull(subset_rest_df2.loc[:,['Lat','Lng']]).vertices
            hull_locs=subset_rest_df2.iloc[hull_points,:].loc[:,['Lat','Lng']]
            cp_source=ColumnDataSource(data=dict(lat=hull_locs.loc[:,'Lat'],lon=hull_locs.loc[:,'Lng']))
            cp_patch=Patch(x="lon",y="lat", fill_color='dodgerblue', line_color='lightslategrey',fill_alpha=.2)
            plot.add_glyph(cp_source, cp_patch)      
        except:
            pass

    # For given categories, cluster and plot
    colors=['indianred','mediumseagreen','gold']
    i=num_freq
    sel_ind_cat=np.logical_and(city_pair_kl.loc[:,'City_a']==city_visited, city_pair_kl.loc[:,'City_b']==city_visitor)
    novel_cats=city_pair_kl.loc[sel_ind_cat,:].sort_values(by='KL Divergence',ascending=False).iloc[i,:]
    novel_cats2=novel_cats.loc[:,'Category'].tolist()#[novel_cats.Category]
    cat_match=curr_city.apply(match_cat,axis=1)
    subset_rest_df=curr_city.loc[cat_match,:]

    # Identify clusters
    subset_rest_df.loc[:,'Clus']=cluster_locations(subset_rest_df,eps=eps_clus,min_s=3)

    # Turn novel clusters into convex hull and add patches to map
    curr_clus=np.unique(subset_rest_df.loc[:,'Clus'])
    curr_clus=curr_clus[curr_clus>=0]

    for cp in curr_clus:
        subset_rest_df2=subset_rest_df.loc[subset_rest_df.loc[:,'Clus']==cp,:]

        cat_props=subset_rest_df2.groupby('Cat').size().reset_index()
        cat_props.columns=['Category','Number']
        try:
            hull_points=ConvexHull(subset_rest_df2.loc[:,['Lat','Lng']]).vertices
            hull_locs=subset_rest_df2.iloc[hull_points,:].loc[:,['Lat','Lng']]
            cp_source=ColumnDataSource(data=dict(lat=hull_locs.loc[:,'Lat'],lon=hull_locs.loc[:,'Lng']))
            cp_patch=Patch(x="lon",y="lat", fill_color='indianred', line_color='lightslategrey',fill_alpha=.5)
            plot.add_glyph(cp_source, cp_patch)
        except:
            pass  

    subset_rest_df_rank=subset_rest_df.sort_values(by=['Num_Rev','Rating'],ascending=False).iloc[0:5,:]
    rest_source = ColumnDataSource(data=dict(lat=subset_rest_df_rank.loc[:,'Lat'],lon=subset_rest_df_rank.loc[:,'Lng'],Name=subset_rest_df_rank.loc[:,'Name'],Rating=subset_rest_df_rank.loc[:,'Rating']))
    rest_circle = Circle(x="lon", y="lat", size=10, fill_color=colors[0], fill_alpha=0.8, line_color='lightslategrey')
    plot.add_glyph(rest_source, rest_circle)
    hover = HoverTool( tooltips=[("Name", "@Name"),('Rating','@Rating')])

    plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool(),ResetTool(),hover)

    # Plot text rank
    text_plot=rest_rank(subset_rest_df,novel_cats2[0].upper())

    return plot,text_plot 

In [26]:
visitor='San Diego'
visited='New York'
vis_plot=[]
for freq in range(0,5):
    curr_plot,curr_text=plot_map_clusters(visited,visitor,city_pair_kl,all_rest_df,eps_gen=eps_range[visited]['gen'],eps_clus=eps_range[visited]['clus'],num_freq=[freq],coords=city_locs[visited])
    vis_plot.append([curr_plot,curr_text])
vis_plot=gridplot(vis_plot)

fname=visitor+'_'+visited+'_diff_map'+'.html'
full_name=os.path.join(foldname,fname)
output_file(full_name)
show(vis_plot)

# Similar restaurant maps

Where should you go if you want food that's similar to home?

In [27]:
def plot_same_map_clusters(city_visited,city_visitor,city_pair_kl,all_rest_df,eps_gen,eps_clus,num_freq,coords):
    curr_city=all_rest_df.loc[all_rest_df.loc[:,'City']==city_visited,:]

    def match_cat(row):
        if row.Cat in novel_cats2:
            return True
        else:
            return False

    # Center map on restaurants in category
    i=num_freq
    sel_ind_cat=np.logical_and(city_pair_kl.loc[:,'City_a']==city_visited, city_pair_kl.loc[:,'City_b']==city_visitor)

    novel_cats=city_pair_kl.loc[sel_ind_cat,:].sort_values(by='KL Divergence',ascending=True).iloc[i,:]
    novel_cats2=novel_cats.loc[:,'Category'].tolist()#[novel_cats.Category]
    cat_match=curr_city.apply(match_cat,axis=1)
    subset_rest_df=curr_city.loc[cat_match,:]

    clus_lat=subset_rest_df.loc[:,'Lat'].mean()
    clus_lng=subset_rest_df.loc[:,'Lng'].mean()

    # actual map
    map_options = GMapOptions(lat=clus_lat, lng=clus_lng, map_type="roadmap", zoom=11)
    plot = GMapPlot(x_range=DataRange1d(), y_range=DataRange1d(), map_options=map_options)

    # Identify novel restaurants so we can remove them from the general clustering
    sel_ind_cat=np.logical_and(city_pair_kl.loc[:,'City_a']==city_visited, city_pair_kl.loc[:,'City_b']==city_visitor)

    novel_cats=city_pair_kl.loc[sel_ind_cat,:].sort_values(by='KL Divergence',ascending=True).iloc[i,:]
    novel_cats2=novel_cats.loc[:,'Category'].tolist()

    cat_match=curr_city.apply(match_cat,axis=1)
    subset_rest_df=curr_city.loc[np.logical_not(cat_match),:]

    # Identify clusters
    subset_rest_df.loc[:,'Clus']=cluster_locations(subset_rest_df,eps=eps_gen,min_s=10)


    # Turn general clusters into convex hull and add patches to map
    curr_clus=np.unique(subset_rest_df.loc[:,'Clus'])
    curr_clus=curr_clus[curr_clus>=0]
    for cp in curr_clus:
        subset_rest_df2=subset_rest_df.loc[subset_rest_df.loc[:,'Clus']==cp,:]
        try:
            hull_points=ConvexHull(subset_rest_df2.loc[:,['Lat','Lng']]).vertices
            hull_locs=subset_rest_df2.iloc[hull_points,:].loc[:,['Lat','Lng']]
            cp_source=ColumnDataSource(data=dict(lat=hull_locs.loc[:,'Lat'],lon=hull_locs.loc[:,'Lng']))
            cp_patch=Patch(x="lon",y="lat", fill_color='dodgerblue', line_color='lightslategrey',fill_alpha=.2)
            plot.add_glyph(cp_source, cp_patch)      
        except:
            pass

    # For given categories, cluster and plot
    colors=['indianred','mediumseagreen','gold']
    i=num_freq
    sel_ind_cat=np.logical_and(city_pair_kl.loc[:,'City_a']==city_visited, city_pair_kl.loc[:,'City_b']==city_visitor)
    novel_cats=city_pair_kl.loc[sel_ind_cat,:].sort_values(by='KL Divergence',ascending=True).iloc[i,:]
    novel_cats2=novel_cats.loc[:,'Category'].tolist()#[novel_cats.Category]
    cat_match=curr_city.apply(match_cat,axis=1)
    subset_rest_df=curr_city.loc[cat_match,:]  

    subset_rest_df_rank=subset_rest_df.sort_values(by=['Num_Rev','Rating'],ascending=False).iloc[0:20,:]
    rest_source = ColumnDataSource(data=dict(lat=subset_rest_df_rank.loc[:,'Lat'],lon=subset_rest_df_rank.loc[:,'Lng'],Name=subset_rest_df_rank.loc[:,'Name'],Rating=subset_rest_df_rank.loc[:,'Rating']))
    rest_circle = Circle(x="lon", y="lat", size=5, fill_color=colors[0], fill_alpha=0.8, line_color='lightslategrey')
    plot.add_glyph(rest_source, rest_circle)
    hover = HoverTool( tooltips=[("Name", "@Name"),('Rating','@Rating')])

    plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool(),ResetTool(),hover)


    # Plot text rank
    text_plot=rest_rank(subset_rest_df,novel_cats2[0].upper())

    return plot,text_plot 

In [28]:
visitor='San Diego'
visited='New York'
vis_plot=[]
for freq in range(0,5):
    curr_plot,curr_text=plot_same_map_clusters(visited,visitor,city_pair_kl,all_rest_df,eps_gen=eps_range[visited]['gen'],eps_clus=eps_range[visited]['clus'],num_freq=[freq],coords=city_locs[visited])
    vis_plot.append([curr_plot,curr_text])
vis_plot=gridplot(vis_plot)

fname=visitor+'_'+visited+'_same_map'+'.html'
full_name=os.path.join(foldname,fname)
output_file(full_name)
show(vis_plot)