# Part 1 - Neighbourhoods per Postal Code

Import libraries

In [112]:
! pip install lxml
import lxml
import requests
import pandas as pd



Get Toronto postal code page from Wikipedia and make a dataframe

In [113]:
from io import StringIO

# Download the Wikipedia page listing Toronto ("M") postal codes and parse
# its first HTML table into a dataframe.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
res.raise_for_status()
# read_html expects a path, URL or file-like object; wrapping the markup in
# StringIO avoids the deprecated "literal HTML string" code path.
df = pd.read_html(StringIO(res.text))[0]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Remove the 'Not assigned' rows

In [114]:
# Locate every row whose neighbourhood is the placeholder text, then keep
# everything else.
is_unassigned = df['Neighbourhood'].eq('Not assigned')
not_assigned_rows = df.index[is_unassigned]
df = df.drop(index=not_assigned_rows)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Check if there are still 'Not assigned' values anywhere in the dataframe.

In [115]:
# True would mean at least one cell of df still equals 'Not assigned' exactly.
'Not assigned' in df.values

False

Sort by Postal Code

In [116]:
# Order the rows by postal code and renumber them from 0 so the index
# follows the sorted order.
df = df.sort_values(by='Postal Code').reset_index(drop=True)
df.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
102,M9W,Etobicoke,"Northwest, West Humber - Clairville"


The size of the cleaned up dataframe is:

In [117]:
# (rows, columns) of the cleaned, sorted dataframe.
df.shape

(103, 3)




____________________________________________________________________________
# Part 2 - Neighbourhoods with Geo Coordinates


Import and process geospatial coordinates for Toronto postal code areas 

In [118]:
# Load the per-postal-code coordinates, order them by postal code and renumber
# the rows from 0.
# The original cell called reset_index on `df` (and discarded the result), so
# df_coor kept its pre-sort index and the sort had no effect on any later
# index-aligned operation.
df_coor = (
    pd.read_csv('../data/coordinates_Toronto.csv')
    .sort_values(by=['Postal Code'])
    .reset_index(drop=True)
)
df_coor.shape

(103, 3)

Update neighbourhood dataframe with the coordinates

In [119]:
# Look the coordinates up by postal code instead of relying on both frames
# happening to share the same row order: plain column assignment aligns on the
# index, which silently pairs the wrong rows if the orders ever differ.
coords_by_postal_code = df_coor.set_index('Postal Code')
df['Latitude'] = df['Postal Code'].map(coords_by_postal_code['Latitude'])
df['Longitude'] = df['Postal Code'].map(coords_by_postal_code['Longitude'])

# A postal code absent from the coordinates file would leave NaN behind.
assert df[['Latitude', 'Longitude']].notna().all().all(), 'postal code without coordinates'
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Let's concentrate only on the boroughs that comprise the original Toronto, excluding areas that were added as a result of the 1998 amalgamation (e.g. Scarborough, North York, etc.).<br/>Such boroughs all have 'Toronto' in their names.  

In [120]:
# Keep the rows whose borough name mentions 'Toronto' and renumber them from 0.
in_old_toronto = df['Borough'].str.contains('Toronto')
df_tor = df[in_old_toronto].reset_index(drop=True)
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [121]:
# (rows, columns) of the Toronto-only subset.
df_tor.shape

(39, 5)

In [122]:
# Each row of df_tor is one postal code area, not one borough (several rows
# share the same Borough value), so the message names what is actually counted.
print('The final total number of postal code areas is {}.'.format(df_tor.shape[0]))

The final total number of boroughs is 39.


____________________________________________________________________________
# Part 3 - Clustering and Geographic Visualisation

Combine coordinates into a single column

In [123]:
# Pair each row's latitude and longitude into a single tuple, store the pairs in
# a new 'Coordinates' column and drop the two source columns. assign() returns a
# new frame, so nothing is written into a slice of another dataframe.
lat_lon_pairs = list(zip(df_tor['Latitude'], df_tor['Longitude']))
df_tor = df_tor.assign(Coordinates=lat_lon_pairs).drop(columns=['Latitude', 'Longitude'])
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Coordinates
0,M4E,East Toronto,The Beaches,"(43.67635739999999, -79.2930312)"
1,M4K,East Toronto,"The Danforth West, Riverdale","(43.6795571, -79.352188)"
2,M4L,East Toronto,"India Bazaar, The Beaches West","(43.6689985, -79.31557159999998)"
3,M4M,East Toronto,Studio District,"(43.6595255, -79.340923)"
4,M4N,Central Toronto,Lawrence Park,"(43.7280205, -79.3887901)"


Let's now visualize the neighborhoods on the map. 

In [124]:
import folium

# Toronto coordinates centered on the City Hall
coords_toronto = (43.653200, -79.383200)
map_toronto = folium.Map(location=coords_toronto, zoom_start=10)

# One blue circle per row of df_tor, with a popup reading
# "<neighbourhood>, <postal code>".
marker_rows = df_tor[['Coordinates', 'Neighbourhood', 'Postal Code']].itertuples(index=False, name=None)
for lat_lon, area_name, postal_code in marker_rows:
    popup = folium.Popup('{}, {}'.format(area_name, postal_code), parse_html=True)
    marker = folium.CircleMarker(
        lat_lon,
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

Now we are going to retrieve data from Foursquare and save it in a dataframe

In [125]:
# Read the Foursquare credentials from a local JSON file so that no secret is
# written into the notebook itself.
import json

with open('../data/creds.json') as creds_file:
    creds = json.load(creds_file)

In [126]:
# This function polls Foursquare and returns types of venues in each location
def get_venues(creds, coords, radius=500, limit=100, timeout=10):
    """Return the category short names of the venues Foursquare lists near a point.

    Parameters
    ----------
    creds : mapping
        Must provide the 'client_id' and 'client_secret' keys.
    coords : sequence
        Latitude at index 0 and longitude at index 1.
    radius : number, optional
        Sent as the API's `radius` parameter (default 500).
    limit : int, optional
        Sent as the API's `limit` parameter (default 100, the value that was
        previously hard-coded).
    timeout : number, optional
        Seconds to wait for the HTTP request before giving up (default 10).

    Returns
    -------
    list of str
        The `shortName` of the first category of every returned venue, in
        response order, duplicates kept. Venues without any category are
        skipped; a response without groups yields an empty list.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # Let requests build and escape the query string rather than formatting
    # it by hand.
    params = {
        'client_id': creds['client_id'],
        'client_secret': creds['client_secret'],
        'v': '20180605',  # sent as the API's `v` parameter
        'll': '{},{}'.format(coords[0], coords[1]),
        'radius': radius,
        'limit': limit,
    }
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params=params,
        timeout=timeout)
    # Surface authentication/quota failures here instead of as a confusing
    # KeyError while digging through the payload.
    response.raise_for_status()

    groups = response.json().get('response', {}).get('groups', [])
    if not groups:
        return []

    placetypes = []
    for item in groups[0].get('items', []):
        categories = item['venue'].get('categories') or []
        # A venue with an empty category list used to raise IndexError.
        if categories:
            placetypes.append(categories[0]['shortName'])
    return placetypes

In [127]:
# Preview the frame whose 'Neighbourhood' and 'Coordinates' columns are used below.
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Coordinates
0,M4E,East Toronto,The Beaches,"(43.67635739999999, -79.2930312)"
1,M4K,East Toronto,"The Danforth West, Riverdale","(43.6795571, -79.352188)"
2,M4L,East Toronto,"India Bazaar, The Beaches West","(43.6689985, -79.31557159999998)"
3,M4M,East Toronto,Studio District,"(43.6595255, -79.340923)"
4,M4N,Central Toronto,Lawrence Park,"(43.7280205, -79.3887901)"


In [128]:
# Query Foursquare once per row of df_tor and key the venue types by
# neighbourhood name.
# Each name is paired with the coordinates of its own row. The previous version
# re-filtered the whole frame for every name (quadratic) and, had a name
# occurred twice, would have sent the first match's coordinates both times.
# With duplicated names the last row now wins.
neighbourhoods = list(df_tor['Neighbourhood'])
listings = {
    neighbourhood: get_venues(creds, coordinates)
    for neighbourhood, coordinates in zip(neighbourhoods, df_tor['Coordinates'])
}
listings

{'The Beaches': ['Trail', 'Health Food Store', 'Pub', 'Neighborhood'],
 'The Danforth West, Riverdale': ['Cosmetics',
  'Greek',
  'Italian',
  'Ice Cream',
  'Ice Cream',
  'Brewery',
  'Greek',
  'Yoga Studio',
  'Fruit & Vegetable Store',
  'Greek',
  'Italian',
  'Greek',
  'Pub',
  'Restaurant',
  'Pizza',
  'Juice Bar',
  'Bookstore',
  'Greek',
  'Trail',
  'Furniture / Home',
  'Greek',
  'Desserts',
  'Greek',
  'Bubble Tea',
  'Spa',
  'Grocery Store',
  'Restaurant',
  'Coffee Shop',
  'Tibetan',
  'Bakery',
  'Indian',
  'Coffee Shop',
  'Caribbean',
  'Café',
  'Coffee Shop',
  'Lounge',
  'Yogurt',
  'Italian',
  'Liquor Store',
  'American',
  'Furniture / Home',
  'Coffee Shop'],
 'India Bazaar, The Beaches West': ['Gym',
  'Fish & Chips',
  'Fast Food',
  'Ice Cream',
  'Sushi',
  'Park',
  'Brewery',
  'Liquor Store',
  'Italian',
  'Pet Store',
  'Pub',
  'Steakhouse',
  'Movie Theater',
  'Restaurant',
  'Sandwiches',
  'Fast Food',
  'Board Shop',
  'Pizza'],
 'Stu

In [129]:
# Alphabetically sorted list of every distinct venue type seen in any neighbourhood.
venuetypes = sorted({venue for venues in listings.values() for venue in venues})

In [190]:
# Work on a copy so the venue-count columns added below do not alter df_tor.
df_counts = df_tor.copy()
df_counts.shape

(39, 4)

In [191]:
# Append one zero-filled column per venue type, starting at column position 4
# so the new columns follow one another in venuetypes order.
for position, venuetype in enumerate(venuetypes, start=4):
    df_counts.insert(position, venuetype, 0)
df_counts.shape

(39, 240)

In [192]:
# Every venue-type column starts at 0 before the counts are filled in.
df_counts.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Coordinates,Adult Boutique,Airport,Airport Service,American,Antiques,Apparel,...,Trail,Train Station,Travel,Vegetarian / Vegan,Video Games,Vietnamese,Wine Bar,Wine Shop,Yoga Studio,Yogurt
0,M4E,East Toronto,The Beaches,"(43.67635739999999, -79.2930312)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4K,East Toronto,"The Danforth West, Riverdale","(43.6795571, -79.352188)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4L,East Toronto,"India Bazaar, The Beaches West","(43.6689985, -79.31557159999998)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4M,East Toronto,Studio District,"(43.6595255, -79.340923)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4N,Central Toronto,Lawrence Park,"(43.7280205, -79.3887901)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We will now populate the dataframe with counts of each type of venue in each neighborhood  

In [193]:
from collections import Counter

# Write, for every neighbourhood, how many venues of each type were returned.
# The counts are assigned rather than incremented so that re-running this cell
# yields the same table; the earlier `+= 1` loop doubled every count on a
# second run.
for area, venues in listings.items():
    in_area = df_counts['Neighbourhood'] == area
    for venue, n_venues in Counter(venues).items():
        df_counts.loc[in_area, venue] = n_venues

In [194]:
# Display the populated count table.
df_counts

Unnamed: 0,Postal Code,Borough,Neighbourhood,Coordinates,Adult Boutique,Airport,Airport Service,American,Antiques,Apparel,...,Trail,Train Station,Travel,Vegetarian / Vegan,Video Games,Vietnamese,Wine Bar,Wine Shop,Yoga Studio,Yogurt
0,M4E,East Toronto,The Beaches,"(43.67635739999999, -79.2930312)",0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,M4K,East Toronto,"The Danforth West, Riverdale","(43.6795571, -79.352188)",0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,1
2,M4L,East Toronto,"India Bazaar, The Beaches West","(43.6689985, -79.31557159999998)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4M,East Toronto,Studio District,"(43.6595255, -79.340923)",0,0,0,2,0,1,...,0,0,0,0,0,0,0,0,1,0
4,M4N,Central Toronto,Lawrence Park,"(43.7280205, -79.3887901)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,M4P,Central Toronto,Davisville North,"(43.7127511, -79.3901975)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,M4R,Central Toronto,"North Toronto West, Lawrence Park","(43.7153834, -79.40567840000001)",0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,1,0
7,M4S,Central Toronto,Davisville,"(43.7043244, -79.3887901)",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,M4T,Central Toronto,"Moore Park, Summerhill East","(43.6895743, -79.38315990000001)",0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...","(43.68641229999999, -79.4000493)",0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


Let's double-check the counts by running the following test 

In [276]:
# Cross-check: each stored count must equal the number of times that venue type
# occurs in the raw listing of the row's neighbourhood.
for row_label in range(len(df_counts.index)):
    raw_listing = listings[df_counts.loc[row_label, 'Neighbourhood']]
    for venue in raw_listing:
        expected = raw_listing.count(venue)
        assert df_counts.loc[row_label, venue] == expected

In [323]:
# Venue type examined by the cells below; it is used as a df_counts column label.
# NOTE(review): the stored outputs of the following cells show the 'American'
# column rather than 'Airport' — confirm which value is intended and re-run.
vtype = 'Airport'


In [324]:
# Row(s) holding the highest count of the selected venue type.
top_count = df_counts[vtype].max()
df_counts[df_counts[vtype] == top_count]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Coordinates,Adult Boutique,Airport,Airport Service,American,Antiques,Apparel,...,Trail,Train Station,Travel,Vegetarian / Vegan,Video Games,Vietnamese,Wine Bar,Wine Shop,Yoga Studio,Yogurt
21,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel","(43.6481985, -79.37981690000001)",0,0,0,4,0,0,...,0,0,0,2,0,0,1,0,0,0


In [325]:
# Neighbourhoods with at least one venue of the selected type, reduced to the
# columns the map needs.
has_vtype = df_counts[vtype].gt(0)
df_venue = df_counts.loc[has_vtype, ['Neighbourhood', 'Coordinates', vtype]]
df_venue

Unnamed: 0,Neighbourhood,Coordinates,American
1,"The Danforth West, Riverdale","(43.6795571, -79.352188)",1
3,Studio District,"(43.6595255, -79.340923)",2
9,"Summerhill West, Rathnelly, South Hill, Forest...","(43.68641229999999, -79.4000493)",1
12,Church and Wellesley,"(43.6658599, -79.38315990000001)",1
15,St. James Town,"(43.6514939, -79.3754179)",3
18,"Richmond, Adelaide, King","(43.65057120000001, -79.3845675)",2
20,"Toronto Dominion Centre, Design Exchange","(43.6471768, -79.38157640000001)",3
21,"Commerce Court, Victoria Hotel","(43.6481985, -79.37981690000001)",4
28,Stn A PO Boxes,"(43.6464352, -79.37484599999999)",1
29,"First Canadian Place, Underground city","(43.6484292, -79.3822802)",3


In [326]:
def _add_circle_markers(target_map, locations, labels, color):
    """Add one circle marker per (location, label) pair to `target_map`.

    locations : iterable of (latitude, longitude) pairs.
    labels    : iterable of popup texts, matched to `locations` by position.
    color     : outline colour of the circles.
    """
    for location, label in zip(locations, labels):
        folium.CircleMarker(
            location,
            radius=5,
            popup=folium.Popup(label, parse_html=True),
            color=color,
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7).add_to(target_map)


map_tor = folium.Map(location=coords_toronto, zoom_start=10)

# all locations, labelled "<neighbourhood>, <postal code>"
_add_circle_markers(
    map_tor,
    df_tor['Coordinates'],
    ['{}, {}'.format(neighborhood, postal)
     for neighborhood, postal in zip(df_tor['Neighbourhood'], df_tor['Postal Code'])],
    'blue')

# locations with specified venue type, labelled "<neighbourhood>, <count>"
# (the second value is the venue count, not a postal code)
_add_circle_markers(
    map_tor,
    df_venue['Coordinates'],
    ['{}, {}'.format(neighborhood, count)
     for neighborhood, count in zip(df_venue['Neighbourhood'], df_venue[vtype])],
    'green')

# location(s) with largest number of specified venues; every tied neighbourhood
# is marked, and nothing is drawn (instead of raising) when df_venue is empty
if not df_venue.empty:
    df_best = df_venue[df_venue[vtype] == df_venue[vtype].max()]
    _add_circle_markers(map_tor, df_best['Coordinates'], ['BEST!'] * len(df_best), 'red')

map_tor

In [310]:
# Column labels of df_counts minus the four descriptive columns, i.e. the venue types.
vtypes = df_counts.columns.tolist()
for descriptive_column in ('Postal Code', 'Borough', 'Coordinates', 'Neighbourhood'):
    vtypes.remove(descriptive_column)
vtypes

['Adult Boutique',
 'Airport',
 'Airport Service',
 'American',
 'Antiques',
 'Apparel',
 'Aquarium',
 'Art Gallery',
 'Art Museum',
 'Arts',
 'Arts & Crafts',
 'Asian',
 'Athletics & Sports',
 'Auditorium',
 'Auto Workshop',
 'B & B',
 'BBQ',
 'Baby Store',
 'Bagels',
 'Bakery',
 'Bank',
 'Bar',
 'Baseball',
 'Basketball',
 'Beach',
 'Beer Bar',
 'Beer Store',
 'Belgian',
 'Bistro',
 'Board Shop',
 'Boat / Ferry',
 'Bookstore',
 'Boutique',
 'Brazilian',
 'Breakfast',
 'Brewery',
 'Bubble Tea',
 'Building',
 'Burgers',
 'Burritos',
 'Bus',
 'Business Services',
 'Butcher',
 'Cable Car',
 'Cafeteria',
 'Café',
 'Cajun / Creole',
 'Candy Store',
 'Caribbean',
 'Cheese Shop',
 'Chinese',
 'Chocolate Shop',
 'Church',
 'Climbing Gym',
 'Cocktail',
 'Coffee Shop',
 'Colombian',
 'Comfort Food',
 'Comic Shop',
 'Concert Hall',
 'Convenience Store',
 'Cosmetics',
 'Coworking Space',
 'Creperie',
 'Cuban',
 'Cupcakes',
 'Dance Studio',
 'Deli / Bodega',
 'Department Store',
 'Desserts',
 'Din