# Segmenting and Clustering Neighborhoods in Toronto

## 1. Scraping the geographic data from a web page

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Download the Wikipedia list of Canadian postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# requests applies no timeout by default; set one so a stalled connection cannot hang the notebook
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')
# Only the first <table> element on the page is used
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at ' + url)
table_contents = []

In [3]:
# Collect postal code, borough and neighborhood from every assigned table cell
for td in table.findAll('td'):
    span_text = td.span.text
    # Cells marked 'Not assigned' are left out of the result
    if span_text == 'Not assigned':
        continue
    postal_code = td.p.text[:3]
    # Text before the first '(' is the borough; the parenthesised part lists the neighborhoods
    pieces = span_text.split('(')
    neighborhoods = pieces[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    table_contents.append({
        'PostalCode': postal_code,
        'Borough': pieces[0],
        'Neighborhood': neighborhoods,
    })

df = pd.DataFrame(table_contents)
# Some scraped borough strings have extra text run into the name; map them to clean labels
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

In [4]:
# Check the (rows, columns) dimensions of the scraped table
df.shape

(103, 3)

## 2. Read in the csv to get the latitude and the longitude coordinates of each neighborhood. 

In [None]:
# Load the geospatial coordinates CSV (joined onto the scraped table by postal code below)
df_l = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv')
# head is a method: without the call parentheses the cell shows the bound method, not the first rows
df_l.head()

In [None]:
# Give the key column the same name in both tables, then join the coordinates onto the scraped data
df_l.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df = df.merge(df_l, on=['PostalCode'])

In [None]:
# Display the table after merging in the coordinates
df

## 3. Analysis
### This part is the clustering analysis of neighborhoods in Toronto. K-Means clustering is used to define the clusters, and a map is generated.

In [None]:
# Keep only the rows whose borough name contains the literal text 'Toronto'
in_toronto = df['Borough'].str.contains('Toronto', regex=False)
df_toronto = df[in_toronto]
df_toronto

In [None]:
# Cluster the Toronto rows with K-Means (an unsupervised method; this is not KNN)
from sklearn.cluster import KMeans
k = 5
# Drop labels left over from a previous run of this cell: otherwise they would be
# used as a clustering feature below and insert() would raise on the duplicate column.
if 'Cluster Labels' in df_toronto.columns:
    df_toronto = df_toronto.drop('Cluster Labels', axis=1)
# Cluster on whatever columns remain once the identifying text columns are removed
df_clustering = df_toronto.drop(['PostalCode', 'Borough', 'Neighborhood'], axis=1)
kmeans = KMeans(n_clusters=k, random_state=0).fit(df_clustering)
# Put the label of each row in the first column of the Toronto table
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:
# Preview the cluster labels assigned to the first 10 rows
kmeans.labels_[0:10] 

In [None]:
# Display the Toronto table, which now carries the 'Cluster Labels' column
df_toronto

In [None]:
# Look up the coordinates of Toronto; they are used as the centre of the map
# Run the next line once (uncommented) if geopy is not installed yet:
# !conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear message
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

### Make a map to show the clusters of neighborhoods in Toronto

In [None]:
# Create the base map centred on the geocoded Toronto coordinates
# !conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Build the palette: k evenly spaced colours from the rainbow colormap, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle marker per row, coloured by its cluster label
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels are 0-based, so index the palette directly; `cluster - 1` only
    # worked for label 0 through negative-index wraparound.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: renders the map in the notebook
map_clusters