In [1]:
!conda install -c conda-forge folium=0.5.0 --yes

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge


In [2]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
import numpy as np
import json 
import folium
from geopy.geocoders import Nominatim #
from pandas.io.json import json_normalize
from sklearn.cluster import DBSCAN 
from sklearn.datasets.samples_generator import make_blobs 
%matplotlib inline

# read and process data

## neighbourhood postcode and locations

In [3]:
# pass the HTML file 
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r  = requests.get(url)
data = r.text
soup = BeautifulSoup(data, 'lxml')
# get table 
postcodetable = soup.table.text.split('\n')
# read table
postcode = postcodetable[2::5]
borough = postcodetable[3::5]
neighbourhood = postcodetable[4::5]
# create pandas structure 
# define the dataframe columns
column_names = [postcode[0], borough[0], neighbourhood[0]] 

# instantiate the dataframe
postcodes = pd.DataFrame(columns=column_names)

# instantiate the data 
for i in range(len(postcode)-1):
    
# ignore 'not assigned' data
    if borough[i+1] != 'Not assigned':
        postcodes = postcodes.append({'Postcode':postcode[i+1],'Borough': borough[i+1],
                                          'Neighbourhood': neighbourhood[i+1]}, ignore_index=True)
# process data 

# concatenate same postcodes
postcodes = postcodes.groupby(['Postcode','Borough'], sort=False)['Neighbourhood'].apply(lambda x: ','.join(x)).reset_index()

# process 'not assigned' data
i_nan = np.where(postcodes['Neighbourhood']=='Not assigned')[0]
postcodes['Neighbourhood'][i_nan] = postcodes['Borough'][i_nan]

In [4]:
!wget -q -O 'toronto_data.json' http://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [5]:
# readdata and build up structure 
with open('toronto_data.json') as json_data:
    a = json_data.read()
    
# create pandas structure 
# define the dataframe columns
column_names = a.split('\n')[0].split(',')

# instantiate the dataframe
positions = pd.DataFrame(columns=column_names)
positions = positions.rename(columns={"Postal Code": "Postcode"})

# instantiate the data 
for i in range(len(a.split('\n'))-1):
    positions = positions.append({'Postcode':a.split('\n')[i+1].split(',')[0],'Latitude': float(a.split('\n')[i+1].split(',')[1]),
                                          'Longitude':float(a.split('\n')[i+1].split(',')[2])}, ignore_index=True)

In [6]:
# merge position information and postcodes
toronto = pd.merge(postcodes, positions)

In [7]:
# only choose the toronto region
#toronto = toronto.iloc[np.where(toronto['Borough'].str.contains('Toronto'))[0]]
#toronto = toronto.reset_index()

## get venue information in foursquare.com

In [9]:
# function getting nearby venues
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [10]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [11]:
# create a new data frame toronto_venues 
toronto_data = toronto
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

In [12]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighbourhood').sum().reset_index()



In [13]:
toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(toronto_grouped.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged['Chinese Restaurant']  = toronto_merged['Chinese Restaurant'].fillna(0).astype(int)

# analysis the relationship of occurance of Chinese restaurant and others

## a1: the occruance of Chinese restaurants

In [14]:
toronto_merged['Chinese Restaurant'].max()

4

In [15]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [16]:
# create map
import matplotlib.cm as cm
import matplotlib.colors as colors
map_chineserestaurant = folium.Map(location=[latitude, longitude], zoom_start=11)

xmax = int(toronto_merged['Chinese Restaurant'].max())
# set color scheme for the clusters
x = np.arange(xmax)
ys = [i + x + (i*x)**2 for i in range(xmax)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, nums in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Chinese Restaurant']):
    label = folium.Popup(str(nums)+ 'Chinese Restaurants in ' +str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[nums-1],
        fill=True,
        fill_color=rainbow[nums-1],
        fill_opacity=0.7).add_to(map_chineserestaurant)
       
map_chineserestaurant

We don't want our restaurant to be too close to another chinese restaurant. <br> Here we define the first index: nearby chinese restaurant numbers. The higher index means more chinese restaurant nearby. 

In [36]:
# calculate weighted amount of nearby restaurants
nearby_restaurant = np.zeros(toronto_merged['Latitude'].shape)

for i in range(len(nearby_restaurant)):
    dist = np.sqrt((toronto_merged['Latitude'][i]-toronto_merged['Latitude'])**2+\
                                         (toronto_merged['Longitude'][i]-toronto_merged['Longitude'])**2)
    num_restaurant = toronto_merged['Chinese Restaurant'][np.argsort(dist)]
# divide the neighbourhoods into 10 groups according to their distance to a certain distance, 
# calculate the total amount of Chinese restaurant in this group, multiplies an exponential weight according to the group number 
    weighted_num = np.zeros(10)
    for j in range(10):
        weighted_num[j] = (np.sum(num_restaurant[j*10:(j+1)*10])*(1/(2**j)))
    nearby_restaurant[i] = weighted_num.sum()

In [37]:
toronto_indexs = toronto_data
toronto_indexs['Nearby Restaurant'] = (np.round(nearby_restaurant))
toronto_indexs['Nearby Restaurant'] = toronto_indexs['Nearby Restaurant'].astype(int)

In [38]:
# create map
import matplotlib.cm as cm
import matplotlib.colors as colors
map_chineserestaurant = folium.Map(location=[latitude, longitude], zoom_start=11)

xmax = int(toronto_indexs['Nearby Restaurant'].max())
# set color scheme for the clusters
x = np.arange(xmax)
ys = [i + x + (i*x)**2 for i in range(xmax)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, nums in zip(toronto_indexs['Latitude'], toronto_indexs['Longitude'], toronto_indexs['Neighbourhood'], toronto_indexs['Nearby Restaurant']):
    label = folium.Popup(str(nums)+ 'Chinese Restaurants in ' +str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[nums-1],
        fill=True,
        fill_color=rainbow[nums-1],
        fill_opacity=0.7).add_to(map_chineserestaurant)
       
map_chineserestaurant

In [39]:
# normalize it 
toronto_indexs['Nearby Restaurant'] = 1 - toronto_indexs['Nearby Restaurant']/toronto_indexs['Nearby Restaurant'].max()/1.2

## a2: reachable by more people

More venues near this region means that this restaurant is reachable by more people

In [21]:
toronto_indexs['Venue number'] = toronto_merged.ix[:,'Accessories Store':'Yoga Studio'].sum(axis=1).fillna(0).astype(int)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


In [22]:
# create map
import matplotlib.cm as cm
import matplotlib.colors as colors
map_chineserestaurant = folium.Map(location=[latitude, longitude], zoom_start=11)

xmax = int(toronto_indexs['Venue number'].max())
# set color scheme for the clusters
x = np.arange(xmax)
ys = [i + x + (i*x)**2 for i in range(xmax)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, nums in zip(toronto_indexs['Latitude'], toronto_indexs['Longitude'], toronto_indexs['Neighbourhood'], toronto_indexs['Venue number']):
    label = folium.Popup(str(nums)+ 'venues in ' +str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[nums-1],
        fill=True,
        fill_color=rainbow[nums-1],
        fill_opacity=0.7).add_to(map_chineserestaurant)
       
map_chineserestaurant

In [23]:
# normalize it 
toronto_indexs['Venue number'] = toronto_indexs['Venue number']/toronto_indexs['Venue number'].max()

## a3: not in an isolated neibourhood 

Here we use the DBSCAN method to cluster the toronto region according to their position, for those outliers in the DBSCAN points, it means that they are too isolated and we will not recommend to choose these neighbourhoods. 

In [24]:

epsilon = 0.02
minimumSamples = 3
db = DBSCAN(eps=epsilon, min_samples=minimumSamples).fit(toronto_indexs[['Latitude','Longitude']])

n_clusters_ = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)
n_clusters_

4

In [25]:
toronto_indexs['Cluster label'] = db.labels_
#toronto_indexs['Cluster label'] = toronto_indexs['Cluster label'].where(toronto_indexs['Cluster label']!= -1,0)

In [26]:
# create map
import matplotlib.cm as cm
import matplotlib.colors as colors
map_chineserestaurant = folium.Map(location=[latitude, longitude], zoom_start=11)

xmax = 5
# set color scheme for the clusters
x = np.arange(xmax)
ys = [i + x + (i*x)**2 for i in range(xmax)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, nums in zip(toronto_indexs['Latitude'], toronto_indexs['Longitude'], toronto_indexs['Neighbourhood'], toronto_indexs['Cluster label']):
    label = folium.Popup(str(nums)+ 'venues in ' +str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[nums],
        fill=True,
        fill_color=rainbow[nums],
        fill_opacity=0.7).add_to(map_chineserestaurant)
       
map_chineserestaurant

In [27]:
toronto_indexs['Cluster label'] = toronto_indexs['Cluster label'].where(toronto_indexs['Cluster label']<0, 1)
toronto_indexs['Cluster label'] = toronto_indexs['Cluster label'].where(toronto_indexs['Cluster label']>=0, 0.6,)

## a4: number of Chinese immigrate in each borough 

in https://en.wikipedia.org/wiki/Demographics_of_Toronto#Ethnic_diversity, we get to know that the percentage of Chinese Canadians are: 
<br>
Toronto & East York: 9.7%
<br>North York: 13.3%
<br>Scarborough: 19.0%
<br>Mississauga: 7.6%
<br>Etobicoke, York: less than 5.2%
<br>Queen's park: no information

In [28]:
toronto_indexs['Chinese fraction'] = 0

In [29]:
toronto_indexs['Chinese fraction'].iloc[np.where(toronto['Borough'].str.contains('Toronto'))[0]] = 9.7
toronto_indexs['Chinese fraction'].iloc[np.where(toronto['Borough']=='East York')[0]] = 9.7
toronto_indexs['Chinese fraction'].iloc[np.where(toronto['Borough']=='North York')[0]] = 13.3
toronto_indexs['Chinese fraction'].iloc[np.where(toronto['Borough']=='Scarborough')[0]] = 19.0
toronto_indexs['Chinese fraction'].iloc[np.where(toronto['Borough']=='Mississauga')[0]] = 7.6
# we assume the following number
toronto_indexs['Chinese fraction'].iloc[np.where(toronto['Borough']=='Etobicoke')[0]] = 5
toronto_indexs['Chinese fraction'].iloc[np.where(toronto['Borough']=='York')[0]] = 5
toronto_indexs['Chinese fraction'].iloc[np.where(toronto['Borough']=="Queen's Park")[0]] = 3 

toronto_indexs['Chinese fraction'] = toronto_indexs['Chinese fraction'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [30]:
# create map
import matplotlib.cm as cm
import matplotlib.colors as colors
map_chineserestaurant = folium.Map(location=[latitude, longitude], zoom_start=11)

xmax = toronto_indexs['Chinese fraction'].max()
# set color scheme for the clusters
x = np.arange(xmax)
ys = [i + x + (i*x)**2 for i in range(xmax)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, nums in zip(toronto_indexs['Latitude'], toronto_indexs['Longitude'], toronto_indexs['Neighbourhood'], toronto_indexs['Chinese fraction']):
    label = folium.Popup(str(nums)+ 'venues in ' +str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[nums-1],
        fill=True,
        fill_color=rainbow[nums-1],
        fill_opacity=0.7).add_to(map_chineserestaurant)
       
map_chineserestaurant

In [31]:
# normalize it 
toronto_indexs['Chinese fraction'] = toronto_indexs['Chinese fraction']/toronto_indexs['Chinese fraction'].max()

# Build the model 

let's look at our indexs

In [40]:
toronto_indexs.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Nearby Restaurant,Venue number,Cluster label,Chinese fraction,Final index
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.513889,0.03,0.6,0.684211,0.008895
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.652778,0.04,0.6,0.684211,0.01338
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636,0.375,0.46,1.0,0.473684,0.137193
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763,0.861111,0.1,0.6,0.684211,0.041053
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,0.166667,0.4,1.0,0.157895,0.016374


In [41]:
toronto_indexs['Final index']=toronto_indexs['Nearby Restaurant']*toronto_indexs['Cluster label']*toronto_indexs['Venue number']*toronto_indexs['Chinese fraction']

The following is the top ten neighbourhood that we recommand to open a Chinese restaurant. These neighborhood are almost all in downtown Toronto. In this region, Chinese Canadian fraction is very high, ensuring that the potential costumer is in a large number. Also, venue number in these neighbourhoods is high, and they are close to other neighbourhoods, so enough number of people can live or visit these neighbourhoods. Finally, the number of nearby Chiese restaurant is low is these neighborhoods, so the competition is not that high, and people opening restaurant here can earn more. 

In [47]:
recommendations = toronto_indexs.sort_values('Final index',ascending = False).head(10)
recommendations

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Nearby Restaurant,Venue number,Cluster label,Chinese fraction,Final index
36,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752,0.444444,1.0,1.0,0.473684,0.210526
92,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.646435,-79.374846,0.444444,0.94,1.0,0.473684,0.197895
42,M5K,Downtown Toronto,"Design Exchange,Toronto Dominion Centre",43.647177,-79.381576,0.375,1.0,1.0,0.473684,0.177632
30,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568,0.375,1.0,1.0,0.473684,0.177632
97,M5X,Downtown Toronto,"First Canadian Place,Underground city",43.648429,-79.38228,0.375,1.0,1.0,0.473684,0.177632
48,M5L,Downtown Toronto,"Commerce Court,Victoria Hotel",43.648198,-79.379817,0.375,1.0,1.0,0.473684,0.177632
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0.375,1.0,1.0,0.473684,0.177632
59,M2N,North York,Willowdale South,43.77012,-79.408493,0.722222,0.35,1.0,0.684211,0.172953
84,M5T,Downtown Toronto,"Chinatown,Grange Park,Kensington Market",43.653206,-79.400049,0.305556,1.0,1.0,0.473684,0.144737
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937,0.305556,1.0,1.0,0.473684,0.144737


In [43]:
import matplotlib.cm as cm
import matplotlib.colors as colors
map_chineserestaurant = folium.Map(location=[latitude, longitude], zoom_start=11)


# add markers to the map
markers_colors = []
for lat, lon, poi in zip(recommendations['Latitude'], recommendations['Longitude'], recommendations['Neighbourhood']):
    label = folium.Popup('recommandations', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='Blue',
        fill=True,
        fill_color='Blue',
        fill_opacity=0.7).add_to(map_chineserestaurant)
       
map_chineserestaurant

In [44]:
recommendations['Neighbourhood']

36    Harbourfront East,Toronto Islands,Union Station
92                    Stn A PO Boxes 25 The Esplanade
42            Design Exchange,Toronto Dominion Centre
30                             Adelaide,King,Richmond
97              First Canadian Place,Underground city
48                      Commerce Court,Victoria Hotel
15                                     St. James Town
59                                   Willowdale South
84            Chinatown,Grange Park,Kensington Market
9                             Ryerson,Garden District
Name: Neighbourhood, dtype: object