# Coursera - Applied Data Science Capstone

## Week 3 - Peer Graded Assignment: Neighborhoods in Toronto

## Part 1 - Creating our Dataframe

First import the necessary libraries to prepare our Dataframe

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift pandas' column/row display limits and widen the console so displayed frames are not truncated
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

from bs4 import BeautifulSoup #import BeautifulSoup for screen scraping

In [2]:
#Scrape the Postal Code from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

Download the file locally and read into BeautifulSoup

In [3]:
#Download file from Wikipedia
!wget -q -O 'TorontoPostalCodeSoup.html' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
        
with open("TorontoPostalCodeSoup.html") as fp:
    mysoup = BeautifulSoup(fp)

Looping through beautifulsoup, we extract the values into 2 lists.
headerrow : This is the list of the column headers
body : This is the list of our extracted postal code, borough, neighborhood data

**Assumptions - The table on the Wikipedia page lists postal code, borough, and neighborhood as a single three-column table. Any table row that does not contain exactly three cells is ignored.**

In [4]:
# headerrow: column titles taken from the first <tr> on the page
# body: one [postal code, borough, neighborhood] list per kept table row
headerrow = []
body = []

for row_number, table_row in enumerate(mysoup.find_all('tr')):

    if row_number == 0:  # the very first <tr> is the table header
        # get_text().strip() removes surrounding whitespace/newlines without blindly chopping
        # the last character, and also works when a cell wraps its text in child tags
        # (where .string would be None).
        headerrow = [cell.get_text().strip() for cell in table_row.find_all('th')]
        continue

    # At this point we read one data row of the table
    newrow = [cell.get_text().strip() for cell in table_row.find_all('td')]

    # Only interested in the table of postal codes. Rows from other tables are ignored
    if len(newrow) != 3:
        continue

    postal_code, borough, neighborhood = newrow

    # Ignore rows with a borough that is Not assigned
    if borough == "Not assigned":
        continue

    neighborhood = neighborhood.replace(" / ", ",")  # comma separate neighborhoods

    # If a row has a borough but a Not assigned neighborhood,
    # then the neighborhood will be the same as the borough
    if neighborhood == "Not assigned" and borough != "":
        neighborhood = borough

    body.append([postal_code, borough, neighborhood])
    

Create our Dataframe "mydf" based on the Body and Headerrow lists

In [5]:
# Assemble the scraped rows into a dataframe, using the scraped header cells as column labels
mydf = pd.DataFrame(data=body, columns=headerrow)
mydf.head(n=10)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park,Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern,Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill,Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## Part 2 - Adding Geo-coordinates of each Postal Code to our Dataframe

Next we want to get the geo coordinates of each neighborhood

In [6]:
# Order the rows by postal code and renumber them from 0, discarding the old index
mydf = mydf.sort_values(by='Postal code').reset_index(drop=True)

In [7]:
# Show the dataframe's dimensions as (number of rows, number of columns)
mydf.shape

(103, 3)

We attempt to use Geocoder to extract the postal code latitude and longitude from Google

In [8]:
# This whole cell is one triple-quoted string literal, so none of the geocoder code inside it
# runs; the notebook just echoes the string back as the cell's output.
# NOTE(review): if this is ever re-enabled, `.format(mydf['Postal code'])` passes the entire
# column to the query string instead of a single postal code — confirm and loop per code.
'''import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(mydf['Postal code']))
  print(g)
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
'''

"import geocoder # import geocoder\n\n# initialize your variable to None\nlat_lng_coords = None\n\n# loop until you get the coordinates\nwhile(lat_lng_coords is None):\n  g = geocoder.google('{}, Toronto, Ontario'.format(mydf['Postal code']))\n  print(g)\n  lat_lng_coords = g.latlng\n\nlatitude = lat_lng_coords[0]\nlongitude = lat_lng_coords[1]\n"

As the Geocoder was unreliable and returning "Request Denied", we will proceed with downloading a predefined Location list

In [9]:
# Download the prepared coordinate list to TorontoGeoData.csv (-q: quiet, -O: output file name)
!wget -q -O 'TorontoGeoData.csv' http://cocl.us/Geospatial_data
# Load the downloaded CSV into a dataframe
lat_lng_coords = pd.read_csv("TorontoGeoData.csv")

Next we add two new columns to our dataframe "Latitude" and "Longitude"

In [10]:
# Append empty-string placeholder columns for the coordinates, then preview the result
mydf = mydf.assign(Latitude="", Longitude="")
mydf.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",,
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",,
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",,
3,M1G,Scarborough,Woburn,,
4,M1H,Scarborough,Cedarbrae,,


We examine the imported lat_lng_coords dataframe to better understand its structure

In [11]:
# Preview the first five rows of the downloaded coordinate table
lat_lng_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Next we match the latitude/longitude for each postal code into our Dataframe (mydf)

In [12]:
# Look up the latitude/longitude of each Postal Code and store it into mydf.
#
# A single indexed lookup replaces the nested row-by-row loop. The old loop wrote through
# chained indexing (mydf['Latitude'][x] = ...), which triggers SettingWithCopyWarning and does
# not update the frame once pandas' Copy-on-Write mode is active.
# keep='last' mirrors the old loop, where a later duplicate postal code overwrote an earlier one.
coords_by_code = (
    lat_lng_coords
    .drop_duplicates(subset='Postal Code', keep='last')
    .set_index('Postal Code')
)

# Whole-column assignment creates or overwrites the columns as floats;
# postal codes with no match in the coordinate table end up as NaN.
mydf['Latitude'] = mydf['Postal code'].map(coords_by_code['Latitude'])
mydf['Longitude'] = mydf['Postal code'].map(coords_by_code['Longitude'])

In [13]:
# Preview the first rows only: display.max_rows is set to None at the top of the notebook,
# so a bare `mydf` would print every row of the frame.
mydf.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.8067,-79.1944
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395
5,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
6,M1K,Scarborough,"Kennedy Park,Ionview,East Birchmount Park",43.7279,-79.262
7,M1L,Scarborough,"Golden Mile,Clairlea,Oakridge",43.7111,-79.2846
8,M1M,Scarborough,"Cliffside,Cliffcrest,Scarborough Village West",43.7163,-79.2395
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.6927,-79.2648


## Part 3 - Clustering and Analysing our Dataframe

### First we try Clustering by Borough

To achieve this, we take our dataframe (mydf) and create a new copy with only the Borough, Latitude, and Longitude. Checking the datatypes of Latitude/Longitude, we need to convert these into numeric values in order to group by Borough. During the grouping process, we will average the corresponding Lat/Lon values to get a representative GPS coordinate for each Borough.

In [14]:
# Build the per-borough frame in one chain:
#   1. keep only the Borough and coordinate columns,
#   2. convert both coordinate columns to numeric dtypes so they can be averaged,
#   3. take the mean Latitude/Longitude of each Borough as that Borough's location,
#   4. turn the Borough group keys back into an ordinary column.
mydfgroup = (
    mydf[['Borough', 'Latitude', 'Longitude']]
    .assign(
        Latitude=lambda frame: pd.to_numeric(frame['Latitude']),
        Longitude=lambda frame: pd.to_numeric(frame['Longitude']),
    )
    .groupby('Borough')
    .mean()
    .reset_index()
)

mydfgroup

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654597,-79.383972
2,East Toronto,43.669436,-79.324654
3,East York,43.700303,-79.335851
4,Etobicoke,43.660043,-79.542074
5,Mississauga,43.636966,-79.615819
6,North York,43.750727,-79.429338
7,Scarborough,43.766229,-79.249085
8,West Toronto,43.652653,-79.44929
9,York,43.690797,-79.472633


Next we import our libraries

In [15]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

We will set up 5 clusters and see how they map onto our Boroughs

In [16]:
# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the two feature columns by name replaces the
# positional-axis call drop('Borough', 1), which pandas 2.0 no longer accepts, and keeps the
# 'Cluster Labels' column out of the features if this cell is re-run after labels were added.
mycluster = mydfgroup[['Latitude', 'Longitude']]

# run k-means clustering
# n_init is pinned to 10 (the long-standing scikit-learn default) so the result does not
# depend on which scikit-learn release is installed.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(mycluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 1, 1, 1, 0, 0, 3, 4, 2, 2], dtype=int32)

We add the determined cluster labels into our dataframe (mydfgroup)

In [17]:
# add clustering labels
# Plain column assignment appends the column on the first run and overwrites it on later runs;
# DataFrame.insert raises ValueError when the cell is re-run and the column already exists.
mydfgroup['Cluster Labels'] = kmeans.labels_

mydfgroup.head() # check the last column!

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels
0,Central Toronto,43.70198,-79.398954,3
1,Downtown Toronto,43.654597,-79.383972,1
2,East Toronto,43.669436,-79.324654,1
3,East York,43.700303,-79.335851,1
4,Etobicoke,43.660043,-79.542074,0


Next we try to plot our results in a Folium map with coloring based on the Cluster Labels

In [18]:
# create map centred on downtown Toronto
map_clusters = folium.Map(location=[43.654, -79.384], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, evenly spaced along
# matplotlib's rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per borough to the map
for lat, lon, poi, cluster in zip(mydfgroup['Latitude'], mydfgroup['Longitude'], mydfgroup['Borough'], mydfgroup['Cluster Labels']):
    # Labels start at 0, so label 0 indexes rainbow[-1] (the last colour); every cluster
    # still receives its own distinct colour.
    marker_color = rainbow[cluster-1]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### We next try clustering based on Postal Code instead of Borough

We make a new dataframe (mydfgrouppc) based on the Postal Code this time instead of Borough

In [19]:

# Pull the postal code and its coordinates out of mydf into an independent frame
mydfgrouppc = mydf.loc[:, ['Postal code', 'Latitude', 'Longitude']].copy()
mydfgrouppc.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.8067,-79.1944
1,M1C,43.7845,-79.1605
2,M1E,43.7636,-79.1887
3,M1G,43.771,-79.2169
4,M1H,43.7731,-79.2395


We run the k-means clustering algorithm to determine the labels of each postal code

In [20]:
# Cluster on the coordinates only. Selecting the two feature columns by name replaces the
# positional-axis call drop('Postal code', 1), which pandas 2.0 no longer accepts, and keeps
# the 'Cluster Labels' column out of the features if this cell is re-run after labels were added.
# The explicit float cast is needed because these columns were created as empty strings
# and may still carry object dtype.
myclusterpc = mydfgrouppc[['Latitude', 'Longitude']].astype(float)

# run k-means clustering
# n_init is pinned to 10 (the long-standing scikit-learn default) so the result does not
# depend on which scikit-learn release is installed.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(myclusterpc)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 4, 1, 4, 4, 4,
       4, 4, 4, 1, 1, 1, 4, 4, 4, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3,
       3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Next we add the Cluster Labels onto our Dataframe (mydfgrouppc) and plot the results on Folium map

In [21]:
# add clustering labels
mydfgrouppc.insert(3, 'Cluster Labels', kmeans.labels_)

mydfgrouppc.head() # check the last columns!

Unnamed: 0,Postal code,Latitude,Longitude,Cluster Labels
0,M1B,43.8067,-79.1944,2
1,M1C,43.7845,-79.1605,2
2,M1E,43.7636,-79.1887,2
3,M1G,43.771,-79.2169,2
4,M1H,43.7731,-79.2395,2


In [22]:
# create map centred on downtown Toronto
map_clusterspc = folium.Map(location=[43.654, -79.384], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, evenly spaced along
# matplotlib's rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per postal code to the map
for lat, lon, poi, cluster in zip(mydfgrouppc['Latitude'], mydfgrouppc['Longitude'], mydfgrouppc['Postal code'], mydfgrouppc['Cluster Labels']):
    # Labels start at 0, so label 0 indexes rainbow[-1] (the last colour); every cluster
    # still receives its own distinct colour.
    marker_color = rainbow[cluster-1]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusterspc)

map_clusterspc