### Import Libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import requests
import numpy as np

### Use url to get a string of web content
### Use Beautifulsoup to find the table
### Read the html and replace "Not assigned" with NaN
### Drop rows having NaN in col "Borough"
### Fill NaN in col "Neighbourhood" with its "Borough"

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
content = requests.get(url).content
web_content = BeautifulSoup(content)
web_content = web_content.find("table")
df = pd.read_html(str(web_content).replace("Not assigned", "NaN"), header=0)[0]
df = df.dropna(subset=["Borough"]).reset_index(drop=True)
df.fillna(method="ffill",axis=1, inplace=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### Group Neighbourhood with Postcode and Brough and join them together
### Format the dataframe

In [3]:
df_modified = pd.DataFrame(df.groupby(["Postcode", "Borough"]).apply(lambda x:[', '.join(x["Neighbourhood"])][0])).reset_index()
df_modified.columns = df.columns
df_modified

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [4]:
df_modified.shape

(103, 3)

### Read Coordinate Information

In [5]:
geo_data = pd.read_csv("Geospatial_Coordinates.csv")
geo_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


### Add two col Latitude and Longitude
### Then fill them with the coordinates inside the geo_data we've created before

In [6]:
df_geo = df_modified
df_geo["Latitude"] = np.nan
df_geo["Longitude"] = np.nan
for i, code in enumerate(df_modified["Postcode"]):
    df_geo.loc[i, ["Latitude", "Longitude"]] = geo_data[geo_data["Postal Code"]==code][["Latitude", "Longitude"]].values[0]
    
df_geo = df_geo[df_geo["Borough"].str.contains("Toronto")]
df_geo.to_csv("Toronto_data.csv")
toronto_data = df_geo
df_geo

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [7]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [8]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_geo['Latitude'], df_geo['Longitude'], df_geo['Borough'], df_geo['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

In [9]:
toronto_grouped = pd.read_csv("toronto_grouped.csv")
neighborhoods_venues_sorted = pd.read_csv("neighborhoods_venues_sorted.csv")

# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [10]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged # check the last columns!

Unnamed: 0.1,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,35,Health Food Store,Neighborhood,Pub,Coffee Shop,Other Great Outdoors,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,37,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Yoga Studio,Cosmetics Shop,Brewery,Bubble Tea Shop,Caribbean Restaurant
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,36,Gym,Italian Restaurant,Coffee Shop,Pub,Movie Theater,Sandwich Place,Burrito Place,Burger Joint,Brewery,Fast Food Restaurant
43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,33,Café,Coffee Shop,Gastropub,Italian Restaurant,American Restaurant,Bakery,Yoga Studio,Convenience Store,Brewery,Seafood Restaurant
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,22,Park,Swim School,Bus Line,Yoga Studio,Dog Run,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,12,Sandwich Place,Breakfast Spot,Gym,Park,Hotel,Food & Drink Shop,Burger Joint,Asian Restaurant,Eastern European Restaurant,Doner Restaurant
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,2,25,Clothing Store,Coffee Shop,Sporting Goods Shop,Yoga Studio,Park,Sandwich Place,Salon / Barbershop,Rental Car Location,Chinese Restaurant,Gym / Fitness Center
47,M4S,Central Toronto,Davisville,43.704324,-79.38879,1,11,Dessert Shop,Sandwich Place,Pizza Place,Sushi Restaurant,Italian Restaurant,Café,Coffee Shop,Thai Restaurant,American Restaurant,Pharmacy
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2,24,Restaurant,Playground,Tennis Court,Summer Camp,Yoga Studio,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Electronics Store
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,2,13,Pub,Convenience Store,Coffee Shop,American Restaurant,Bagel Shop,Sushi Restaurant,Supermarket,Sports Bar,Light Rail Station,Medical Center


In [11]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters