# **Recreating Part 1 Dataframe**

## Importing necessary libraries

In [210]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

## Extracting the html file from the URL

In [211]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url).text

## Using BeautifulSoup to find the table rows containing the postcode info

In [212]:
soup = BeautifulSoup(r,'lxml')
btable = soup.find('table')
tab_rows = btable.find_all('tr')

## Creating the pandas dataframe and populating it with info from the html 
### (with processing of _Not assigned_ and duplicate postal code rows)

In [213]:
column_names = ['PostalCode', 'Borough', 'Neighborhood']  # Setting the columns for the new pandas dataframe
postcodes = pd.DataFrame(columns=column_names)

for n in range(1,len(tab_rows)):
    row_t = tab_rows[n].text.split('\n')
    if row_t[2] != 'Not assigned':  # Skip rows with 'Not assigned' Boroughs
        if row_t[3] == 'Not assigned':  # Copying Borough name to 'Not assigned' Neighborhoods
            row_t[3] = row_t[2]
        if row_t[1] in postcodes.loc[:,'PostalCode'].unique():  # Appending the Neighborhood names to duplicate PostalCode areas
            postcodes.loc[tag,'Neighborhood'] = postcodes.loc[tag,'Neighborhood'] + ', ' + row_t[3]
        else: 
            postcodes.loc[n,:] = row_t[1:4]
            tag = n

postcodes.reset_index(drop=True,inplace=True)
postcodes

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


## Print the number of rows of the dataframe

In [214]:
print('The shape of the dataframe: ',postcodes.shape)

The shape of the dataframe:  (103, 3)


# **Recreating Part 2**

## Downloading csv file that has the geographical coordinates of each postal code 
## *(Geocoder package failed to work)*

In [215]:
import csv

!wget -q -O Geospatial_Coordinates.csv http://cocl.us/Geospatial_data
print('Data downloaded!')

# read csv file
coord = pd.read_csv('Geospatial_Coordinates.csv')
coord.head()

Data downloaded!


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Match the Postal Codes to the coordinate values to populate the dataframe

In [216]:
for i,n in enumerate(postcodes['PostalCode']):
    res = coord[coord['Postal Code']==n]
    postcodes.loc[i,'Latitude'] = res.Latitude.values
    postcodes.loc[i,'Longitude'] = res.Longitude.values

postcodes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


# **Part 3**

## Import relevant libraries

In [217]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
import numpy as np
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Libraries imported.


## Define Foursquare credentials and version

In [218]:
CLIENT_ID = 'QSWCVB3IUNTUBZTDNQE00EYS44ZNHES3IDLNPJIQ2PY5FX4U' # your Foursquare ID
CLIENT_SECRET = 'MGVRCCQSNGLQUIEAS45KHJVWKOW2TIOK14RBYCP20GHHNC24' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: QSWCVB3IUNTUBZTDNQE00EYS44ZNHES3IDLNPJIQ2PY5FX4U
CLIENT_SECRET:MGVRCCQSNGLQUIEAS45KHJVWKOW2TIOK14RBYCP20GHHNC24


## Define getNearbyVenues function for extracting venues

In [219]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Select a subset of neighborhoods and explore the nearby venues for each

In [220]:
toronto_data = postcodes[6:91].reset_index(drop=True)
toronto_venues = getNearbyVenues(toronto_data['Neighborhood'],toronto_data["Latitude"],toronto_data['Longitude'])

## Create a one hot encoding dataframe for the neighborhood's venues

In [221]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
#toronto_onehot.columns = ['Neighborhood'].extend(toronto_onehot.columns)
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

toronto_onehot.head()

Unnamed: 0,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Group the one hot encoded dataframe by Neighborhood to obtain frequencies for each venue

In [222]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Define the function for extracting the most common venues

In [223]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

## Build the top 5 most common venues for each neighborhood dataframe

In [224]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Thai Restaurant,Steakhouse,American Restaurant
1,Agincourt,Lounge,Skating Rink,Breakfast Spot,Sandwich Place,Yoga Studio
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Yoga Studio,Donut Shop,Dim Sum Restaurant
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Fried Chicken Joint,Coffee Shop,Pharmacy,Pizza Place
4,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Shopping Mall,Pizza Place,Sushi Restaurant,Middle Eastern Restaurant


## Cluster the neighborhoods based on the common venues (_by using k-means clustering where k=5_)

In [225]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 4, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

## Create the dataframe for displaying the cluster which each neighborhood belongs to

In [226]:
toronto_merged = toronto_data

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() 

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,3,Fast Food Restaurant,Yoga Studio,Falafel Restaurant,Event Space,Ethiopian Restaurant
1,M3B,North York,Don Mills North,43.745906,-79.352188,3,Basketball Court,Gym / Fitness Center,Café,Caribbean Restaurant,Baseball Field
2,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,4,Fast Food Restaurant,Pizza Place,Gastropub,Pharmacy,Rock Climbing Spot
3,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,3,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant
4,M6B,North York,Glencairn,43.709577,-79.445073,3,Italian Restaurant,Japanese Restaurant,Pub,Bakery,Doner Restaurant


## Generate Folium map (assuming Toronto city hall as the centre of Toronto)

In [227]:
# Assuming Toronto coordinates as 43.653170, -79.383541
tor_latitude = 43.653170
tor_longitude = -79.383541

# create map
map_clusters = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster 1
### There are many restaurants and eateries in this region

In [228]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
49,North York,0,Coffee Shop,Italian Restaurant,Fast Food Restaurant,Hardware Store,Restaurant


### Cluster 2 
### These regions are good for exercising (Yoga studios, parks) and grabbing a quick bite (coffee shops, sandwich place, restaurants)

In [229]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
11,Etobicoke,1,Park,Café,Pharmacy,Beer Store,Pizza Place
13,East Toronto,1,Coffee Shop,Other Great Outdoors,Pub,Yoga Studio,Donut Shop
34,North York,1,Bus Stop,Airport,Park,Electronics Store,Donut Shop
36,Downtown Toronto,1,Coffee Shop,Café,Hotel,Restaurant,American Restaurant
40,North York,1,Park,Grocery Store,Bank,Shopping Mall,Donut Shop
50,York,1,Check Cashing Service,Discount Store,Sandwich Place,Turkish Restaurant,Department Store
51,North York,1,Baseball Field,Yoga Studio,Dim Sum Restaurant,Farmers Market,Falafel Restaurant
53,North York,1,Coffee Shop,Restaurant,Ramen Restaurant,Café,Sandwich Place
56,Central Toronto,1,Garden,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner
58,York,1,Convenience Store,Yoga Studio,Dessert Shop,Falafel Restaurant,Event Space


### Cluster 3 
### Exotic cuisines can be found here (Mexican and Thai food)

In [230]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
63,West Toronto,2,Café,Mexican Restaurant,Bar,Grocery Store,Thai Restaurant


### Cluster 4 
### Coffee shops and cafes are frequented in this regions

In [231]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Scarborough,3,Fast Food Restaurant,Yoga Studio,Falafel Restaurant,Event Space,Ethiopian Restaurant
1,North York,3,Basketball Court,Gym / Fitness Center,Café,Caribbean Restaurant,Baseball Field
3,Downtown Toronto,3,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant
4,North York,3,Italian Restaurant,Japanese Restaurant,Pub,Bakery,Doner Restaurant
5,Etobicoke,3,Golf Course,Bank,Dumpling Restaurant,Diner,Discount Store
6,Scarborough,3,Bar,Moving Target,Yoga Studio,Diner,Discount Store
7,North York,3,Gym,Coffee Shop,Asian Restaurant,Beer Store,Storage Facility
8,East York,3,Park,Cosmetics Shop,Skating Rink,Beer Store,Spa
9,Downtown Toronto,3,Coffee Shop,Café,Restaurant,Hotel,Gastropub
10,York,3,Park,Field,Hockey Arena,Trail,Yoga Studio


### Cluster 5
### Sporting needs can be met here (Sporting goods shop, rock climbing spot, yoga studio)

In [232]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,East York,4,Fast Food Restaurant,Pizza Place,Gastropub,Pharmacy,Rock Climbing Spot
67,Central Toronto,4,Clothing Store,Sporting Goods Shop,Coffee Shop,Yoga Studio,Grocery Store
