# Capstone Project
This notebook will serve as the coding grounds for my capstone project as part of the requirement for the IBM professional certificate

In [9]:
# Libraries
import numpy as np
import pandas as pd

In [2]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


# Part I: Segmenting and Clustering Neighborhoods in Toronto

In [68]:
# Libraries
import requests
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
!pip install folium
import folium

print("Folium installed")

Folium installed


In [69]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
dataframe = pd.read_html(url)
df = dataframe[0]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [70]:
# Removing boroughs that are not assigned
index = df[df['Borough'] == 'Not assigned'].index
df.drop(index, inplace=True)

In [91]:
# Neighbourhood to be the same as Borough if not assigned
for row in df.index:
    if df['Neighbourhood'][row] == 'Not assigned':
        df['Neighbourhood'][row] = df['Borough'][row]

In [95]:
df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [113]:
# Grouping and aggregrating data into a single postcode
dfG = df.groupby(['Postcode', 'Borough'], as_index=False, sort=False).agg(','.join)

In [114]:
dfG.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


# Part II: Segmenting and Clustering Neighborhoods in Toronto

In [111]:
post = pd.read_csv("https://cocl.us/Geospatial_data")
post.groupby(['Postal Code'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f243d352c88>

In [122]:
dffinal = pd.merge(dfG, post, left_on = 'Postcode', right_on = 'Postal Code')

In [126]:
dffinal.drop(['Postal Code'], axis = 1, inplace=True)

In [127]:
dffinal.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


# Part III: Segmenting and Clustering Neighborhoods in Toronto

In [128]:
# Getting coordinates for Toronto
latitude = 43.6532
longitude = -79.3832

In [138]:
map_t = folium.Map(location=[latitude,longitude], zoom_start = 10)

for lat, lng, borough, neighbourhood in zip(dffinal['Latitude'], dffinal['Longitude'], dffinal['Borough'], dffinal['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_t)  

In [156]:
clusters = 5
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

clustered_df = dffinal.drop(['Neighbourhood', 'Borough', 'Postcode'], 1)
kmeans = KMeans(n_clusters=clusters, random_state=0).fit(clustered_df)

In [157]:
dffinal.insert(0, 'Cluster Labels', kmeans.labels_)

ValueError: cannot insert Cluster Labels, already exists

In [159]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dffinal['Latitude'], dffinal['Longitude'], dffinal['Neighbourhood'], dffinal['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

In [160]:
map_cluster

NameError: name 'map_cluster' is not defined