# The Capstone-Project

### This notebook is used for creating database of Toronto city Neighborhoods

In [100]:
#!pip install beautifulsoup4
#!pip install lxml
from bs4 import BeautifulSoup
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
import requests
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 


from IPython.display import display_html
import pandas as pd
import numpy as np
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## Scraping the data using Beautiful Soup

In [101]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(source)
print(soup.title)
from IPython.display import display_html
tab = str(soup.table)
display_html(tab,raw=True)

<title>List of postal codes of Canada: M - Wikipedia</title>


Postal Code,Borough,Neighborhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,Not assigned
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"


# Creating a DataFrame

In [102]:

dfs=pd.read_html(tab)
df=dfs[0]

In [103]:
df_a=df.copy()
df_a=df_a[df_a['Borough']!='Not assigned'] #Remove Not Assigned values in Borough
df_a.loc[df_a['Neighborhood']=='Not Assigned','Neighborhood']=df_a['Borough']
df_a.groupby(['Postal Code','Borough'],sort=False).agg(",".join)
df_a

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [104]:
#Examining the shape of the data
df_a.shape

(103, 3)

In [105]:
latlon=pd.read_csv('Geospatial_Coordinates.csv')
df_can=pd.merge(df_a,latlon,on='Postal Code')
df_can

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [106]:
#Separating boroughs which contain 'Toronto'
df_tor=df_can[df_can['Borough'].str.contains('Toronto')]
df_tor.reset_index(drop=True,inplace=True)
df_tor

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Visualizing Toronto Neighborhoods

In [107]:
map_toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

for lat,lng,borough,neighborhood in zip(df_tor['Latitude'],df_tor['Longitude'],df_tor['Borough'],df_tor['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
map_toronto

## Modeling the clusters using K-Means

In [108]:
n=4
k_means = KMeans(init = "k-means++", n_clusters = n, n_init = 12)
k_means.fit(df_tor[['Latitude','Longitude']])
labels = k_means.labels_
print(len(labels),labels)

39 [1 1 1 1 3 1 1 2 1 2 1 2 3 1 2 3 1 3 0 0 0 0 2 0 1 2 0 1 2 0 1 0 1 1 1 1 1
 1 3]


In [109]:
df_tor.insert(0, 'Cluster_no',labels)
df_tor

Unnamed: 0,Cluster_no,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,1,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,3,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,1,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,1,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,2,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,1,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,2,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Visualizing the Clusters on Toronto Map

In [110]:
map_clusters = folium.Map(location=[43.651070,-79.347015], zoom_start=11)

# set color scheme for the clusters
x = np.arange(n)
ys = [i + x + (i*x)**2 for i in range(n)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_tor['Latitude'],df_tor['Longitude'],df_tor['Neighborhood'], df_tor['Cluster_no']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters