In [1]:
pip install beautifulsoup4


The following command must be run outside of the IPython shell:

    $ pip install beautifulsoup4

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [2]:
pip install lxml


The following command must be run outside of the IPython shell:

    $ pip install lxml

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


## Question 1

In [31]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
# Scrape the web page list
Canada_List = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
s_canada = requests.get(Canada_List).text
bsoup = BeautifulSoup(s_canada, 'lxml')
canada_table = bsoup.find('table')

In [3]:
# Creating the new columns
columns_names = ['Postalcode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns = columns_names)

In [4]:
# Search all the postcode, borough, neighborhood 
for tr in canada_table.find_all('tr'):
    r_data=[]
    for td in tr.find_all('td'):
        r_data.append(td.text.strip())
    if len(r_data)==3:
        df.loc[len(df)] = r_data

In [5]:
#  Ignore cells with a borough that is Not assigned
df=df[df['Borough']!='Not assigned']

# If a cell has a borough but a Not assigned neighborhood, 
# then the neighborhood will be the same as the borough. 
df.loc[df['Neighborhood'] =='Not assigned' , 'Neighborhood'] = df['Borough']

df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [6]:
# More than one neighborhood can exist in one postal code area.
# These two rows will be combined into one row with the neighborhoods 
# separated with a comma
groupb_table = df.groupby(['Postalcode','Borough'], sort=False).agg( ', '.join)
df_final=groupb_table.reset_index()
df_final.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [7]:
df_final.shape

(103, 3)

## Question 2
Get the latitude and the longitude coordinates of each neighborhood

In [8]:
df_geospatial=pd.read_csv('http://cocl.us/Geospatial_data')

In [9]:
df_geospatial.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
df_geospatial.rename(columns={'Postal Code':'Postalcode'},inplace=True)

# Merge the df of the question 1 to the geospatial data of each neighborhood
df_geospatial_merged = pd.merge(df_final,df_geospatial, on='Postalcode')

In [11]:
toronto_geodata = df_geospatial_merged[['Postalcode','Borough','Neighborhood','Latitude','Longitude']]

In [12]:
toronto_geodata.head(15)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


## Question 3
Explore and cluster the neighborhoods in Toronto

In [13]:
from geopy.geocoders import Nominatim 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [14]:
toronto_adress = 'Toronto, ON'

tor_geolocator = Nominatim(user_agent="Toronto")
location = tor_geolocator.geocode(toronto_adress)

# Latitude and Longitude of Toronto
latitude_toronto = location.latitude
longitude_toronto = location.longitude
print('Latitude and Longitude of Toronto is, respectively, {}, {}.'.format(latitude_toronto, longitude_toronto))

Latitude and Longitude of Toronto is, respectively, 43.653963, -79.387207.


In [16]:
map_toronto = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=10)

# add markers to map
for lat, lng, borough, Neighbourhood in zip(toronto_geodata['Latitude'],
                                            toronto_geodata['Longitude'], 
                                            toronto_geodata['Borough'], 
                                            toronto_geodata['Neighborhood']):
    label = '{}, {}'.format(Neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3154cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [17]:
# Radius and Limit of venues
radius=500
LIMIT=100

CLIENT_ID = 'S2W4RLHIQ10GTEZ5IKEKWWAH5VSQD1WFNLNRUPFJUHCAA1GY' # your Foursquare ID
CLIENT_SECRET = '42MERQXTX2OOFRJ0MK1QTFJBWDHF2CNNTU4K10QSS1K53V5V' # your Foursquare Secret
VERSION = '20200119'

In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [19]:
venues_tor = getNearbyVenues(names=toronto_geodata['Neighborhood'],
                                   latitudes=toronto_geodata['Latitude'],
                                   longitudes=toronto_geodata['Longitude']
                                  )

Parkwoods
Victoria Village
Harbourfront
Lawrence Heights, Lawrence Manor
Queen's Park
Queen's Park
Rouge, Malvern
Don Mills North
Woodbine Gardens, Parkview Hill
Ryerson, Garden District
Glencairn
Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
Highland Creek, Rouge Hill, Port Union
Flemingdon Park, Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Downsview North, Wilson Heights
Thorncliffe Park
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
East Birchmount Park, Ionview, Kennedy Park
Bayview Village
CFB Toronto, Downsview East
The Danforth West, Riv

In [20]:
venues_tor.head(4)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [21]:
venues_tor.shape

(2221, 7)

In [22]:
# Quantity of venues of each Neighborhood
venues_tor.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",2,2,2,2,2,2
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",11,11,11,11,11,11
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Downsview North, Wilson Heights",22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",26,26,26,26,26,26
Berczy Park,56,56,56,56,56,56
"Birch Cliff, Cliffside West",4,4,4,4,4,4


In [23]:
# one hot encoding technique
toronto_onehotencoding = pd.get_dummies(venues_tor[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood again
toronto_onehotencoding['Neighborhood'] = venues_tor['Neighborhood'] 

# Neighborhood in the first column
fixed_columns = [toronto_onehotencoding.columns[-1]] + list(toronto_onehotencoding.columns[:-1])
toronto_onehotencoding.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
toronto_onehotencoding.shape

(2221, 271)

In [28]:
#  group rows by neighborhood and the mean of the frequency of occurrence 
group_toronto = toronto_onehotencoding.groupby('Neighborhood').mean().reset_index()
group_toronto

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.020000,...,0.020000,0.00,0.000000,0.000000,0.000000,0.010000,0.000000,0.0,0.010000,0.000000
1,Agincourt,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4,"Alderwood, Long Branch",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.045455,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
6,Bayview Village,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
7,"Bedford Park, Lawrence Manor East",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.038462,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.038462,0.000000
8,Berczy Park,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.017857,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
9,"Birch Cliff, Cliffside West",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000


In [29]:
# print the top 3 most common venus to each neighborhood
top_venues = 3

for y in group_toronto['Neighborhood']:
    print("----"+y+"----")
    nei = group_toronto[group_toronto['Neighborhood'] == y].T.reset_index()
    nei.columns = ['venue','freq']
    nei = nei.iloc[1:]
    nei['freq'] = nei['freq'].astype(float)
    nei = nei.round({'freq': 2})
    print(nei.sort_values('freq', ascending=False).reset_index(drop=True).head(top_venues))
    print('\n')

----Adelaide, King, Richmond----
         venue  freq
0  Coffee Shop  0.07
1         Café  0.04
2          Bar  0.04
3        Hotel  0.03
4   Steakhouse  0.03


----Agincourt----
                       venue  freq
0                     Lounge  0.25
1             Breakfast Spot  0.25
2  Latin American Restaurant  0.25
3               Skating Rink  0.25
4          Accessories Store  0.00


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                 venue  freq
0           Playground   0.5
1                 Park   0.5
2    Accessories Store   0.0
3   Mexican Restaurant   0.0
4  Monument / Landmark   0.0


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                  venue  freq
0         Grocery Store  0.18
1           Pizza Place  0.09
2            Beer Store  0.09
3  Fast Food Restaurant  0.09
4        Sandwich Place  0.09


----Alderwood, Long Branch----
            venue  freq
0     Pizza P

In [33]:
#  Sort the venues in descending order

def common_venues(row, top_venues):
    categories = row.iloc[1:]
    categories_sorted = categories.sort_values(ascending=False)
    
    return categories_sorted.index.values[0:top_venues]

In [34]:
# 8 venues for each neighborhood 

top_venues = 8

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for i in np.arange(top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(i+1, indicators[i]))
    except:
        columns.append('{}th Most Common Venue'.format(i+1))

# New dataframe
nei_venues_sorted = pd.DataFrame(columns=columns)
nei_venues_sorted['Neighborhood'] = group_toronto['Neighborhood']

for i in np.arange(group_toronto.shape[0]):
    nei_venues_sorted.iloc[i, 1:] = common_venues(group_toronto.iloc[i, :], top_venues)

nei_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Bar,Café,Cosmetics Shop,Steakhouse,Asian Restaurant,Thai Restaurant,Burger Joint
1,Agincourt,Lounge,Latin American Restaurant,Breakfast Spot,Skating Rink,Donut Shop,Dim Sum Restaurant,Diner,Discount Store
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Playground,Park,Yoga Studio,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Beer Store,Pharmacy,Pizza Place,Liquor Store,Fried Chicken Joint,Fast Food Restaurant,Sandwich Place
4,"Alderwood, Long Branch",Pizza Place,Pharmacy,Skating Rink,Coffee Shop,Pool,Pub,Sandwich Place,Gym


In [38]:
# Run k-means to cluster the neighborhood into 5 clusters

kclusters = 5

toronto_cluster = group_toronto.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_cluster)

In [44]:
# new dataframe with the cluster and the 8 venues 

# clustering labels
nei_venues_sorted.insert(0, 'Cluster_Labels', kmeans.labels_)

toronto_merged = toronto_geodata

# merge the dataframes to add latitude/longitude 
toronto_merged = toronto_merged.join(nei_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Cluster_Labels,Cluster_Labels0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4.0,4.0,Bus Stop,Park,Food & Drink Shop,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,0.0,Hockey Arena,Coffee Shop,Portuguese Restaurant,French Restaurant,Pizza Place,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0.0,0.0,Coffee Shop,Pub,Park,Bakery,Mexican Restaurant,Breakfast Spot,Café,Beer Store
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,0.0,0.0,Clothing Store,Accessories Store,Arts & Crafts Store,Furniture / Home Store,Event Space,Miscellaneous Shop,Coffee Shop,Boutique
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,0.0,0.0,Coffee Shop,Gym,Park,College Cafeteria,Sculpture Garden,Burger Joint,Sandwich Place,Burrito Place


In [45]:
# drop rows with midding data
toronto_merged=toronto_merged.dropna()

toronto_merged['Cluster_Labels'] = toronto_merged.Cluster_Labels.astype(int)

In [48]:
# create map
map_clusters = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

# color scheme 
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow_color = [colors.rgb2hex(i) for i in colors_array]

# add markers
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow_color[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters