# Segmenting and Clustering Neighborhoods in Toronto

Data on neighborhoods in Toronto is scraped from the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

## Libraries Used

In [143]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import folium
from sklearn.cluster import KMeans
from matplotlib import cm
from matplotlib import colors

## Scrape data from Wikipedia with BeautifulSoup

In [94]:
# Download the raw HTML of the Wikipedia postal-code page.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of silently handing an
# error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

In [95]:
# Parse the downloaded page as HTML. The 'xml' feature selects an XML parser,
# which is meant for XML documents rather than web pages; 'html.parser' is the
# HTML parser that ships with Python, so it needs no extra dependency.
soup = BeautifulSoup(html_content, 'html.parser')

In [96]:
# Take the first <table> element in the parsed document
# (attribute access on the soup is shorthand for find('table')).
table = soup.table

In [97]:
# Start from an empty frame with the three target columns:
# Postalcode, Borough and Neighborhood.
col = [
    'Postalcode',
    'Borough',
    'Neighborhood',
]
df = pd.DataFrame(columns=col)

In [98]:
# Read the Postalcode, Borough and Neighborhood text out of every table row.
# The rows are gathered in a plain list and the frame is built once at the end:
# enlarging df with df.loc[len(df)] on every iteration is slow, and because it
# extends the existing frame, re-running the cell would add every row again.
rows = []
for tr in table.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    # Rows without exactly three <td> cells (e.g. a header row made of <th>
    # cells) are skipped.
    if len(cells) == 3:
        rows.append(cells)
df = pd.DataFrame(rows, columns=col)

In [99]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data Processing and Data Cleaning

In [100]:
# Keep only the rows whose Borough is not the placeholder 'Not assigned',
# then renumber the remaining rows from 0.
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)

In [101]:
# Preview the first five rows after the 'Not assigned' boroughs were removed.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [102]:
# Combine the neighbourhoods that share a Postalcode/Borough into one
# comma-separated string. sort=False keeps the groups in order of first
# appearance.
#
# The second statement used to read an undefined name (`temp`), which raises
# NameError on a fresh kernel; the grouped result is now used directly.
# Selecting 'Neighborhood' explicitly also lets the cell be re-run safely.
df = (
    df.groupby(['Postalcode', 'Borough'], sort=False)['Neighborhood']
      .agg(', '.join)
      .reset_index()   # turn the group keys back into regular columns
      .reset_index()   # expose the row number as an 'index' column: the merge
                       # cell further down drops a column of that name, and the
                       # recorded shape of this frame is (103, 4)
)

In [103]:
# Display the size of the combined frame as (rows, columns).
df.shape

(103, 4)

### Adding Geospatial Data to the Dataframe

In [105]:
# Load the postal-code coordinates (latitude/longitude) from the remote CSV.
geo_url = 'http://cocl.us/Geospatial_data'
geo_data = pd.read_csv(geo_url)

In [106]:
# Preview the first five rows of the coordinates table.
geo_data.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [107]:
# Give the key column the same name in both frames so they can be joined on it.
geo_data = geo_data.rename(columns={'Postal Code': 'Postalcode'})
# Inner join (the pd.merge default): a postal code missing from either frame
# does not appear in the result.
df = pd.merge(df, geo_data, on='Postalcode')
# Discard the helper 'index' column when it is present. errors='ignore' keeps
# the cell from raising KeyError when df carries no such column.
df = df.drop(columns=['index'], errors='ignore')

In [111]:
# Preview the first five rows now that the coordinates have been attached.
df.head(n=5)

Unnamed: 0,Postalcode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


## Plot the neighborhoods in Toronto with Folium

In [116]:
# Keep only the rows whose Borough name contains 'Toronto', renumbered from 0.
in_toronto = df['Borough'].str.contains('Toronto')
toronto = df.loc[in_toronto].reset_index(drop=True)

In [117]:
# Preview the first five rows of the Toronto-only subset.
toronto.head(n=5)

Unnamed: 0,Postalcode,Latitude,Longitude,Borough,Neighborhood
0,M4E,43.676357,-79.293031,East Toronto,The Beaches
1,M4K,43.679557,-79.352188,East Toronto,"The Danforth West, Riverdale"
2,M4L,43.668999,-79.315572,East Toronto,"India Bazaar, The Beaches West"
3,M4M,43.659526,-79.340923,East Toronto,Studio District
4,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park


In [149]:
# Plot every row of the `toronto` frame as a circle marker on a folium map
# centred on [43.6532, -79.3832].
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=12)

for lat, lon, bor, neigh in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    # The popup text is "<neighborhoods>, <borough>".
    label = '{}, {}'.format(neigh, bor)
    label = folium.Popup(label, parse_html=True)
    # parse_html is an option of folium.Popup, not of CircleMarker, so it is
    # no longer passed to the marker itself.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
    ).add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto

### * The interactive map may not be rendered here. Please see **Map 1.PNG** in the **Images** folder.

## Cluster the neighborhoods in Toronto with K-Means Clustering

In [135]:
# Cluster the Toronto postal codes into k groups by geographic position.
k = 5

# Select the two coordinate columns explicitly. Dropping "everything else"
# would feed the 'Cluster Labels' column back into the model when the cell is
# run a second time.
clustering = toronto[['Latitude', 'Longitude']]

# random_state fixes the initialisation. n_init is set explicitly because its
# default differs between scikit-learn releases, which would otherwise let the
# labels change with the installed version.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(clustering)

# Remove labels left by a previous run so that insert() does not raise
# "already exists", then put the fresh labels in the first column.
toronto = toronto.drop(columns=['Cluster Labels'], errors='ignore')
toronto.insert(0, 'Cluster Labels', kmeans.labels_)

In [136]:
# Preview the first five rows, now carrying the cluster label of each postal code.
toronto.head(n=5)

Unnamed: 0,Cluster Labels,Postalcode,Latitude,Longitude,Borough,Neighborhood
0,4,M4E,43.676357,-79.293031,East Toronto,The Beaches
1,4,M4K,43.679557,-79.352188,East Toronto,"The Danforth West, Riverdale"
2,4,M4L,43.668999,-79.315572,East Toronto,"India Bazaar, The Beaches West"
3,4,M4M,43.659526,-79.340923,East Toronto,Studio District
4,2,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park


## Plot the neighborhoods in Toronto after clustering with Folium

In [147]:
# Plot the clustered postal codes, one colour per cluster, on a folium map
# centred on [43.6532, -79.3832].
map_cluster = folium.Map(location=[43.6532, -79.3832], zoom_start=12)

# One hex colour per cluster, sampled at k evenly spaced points of the magma
# colormap. (The unused x / ys helper arrays were removed; only their length,
# which equals k, was ever used.)
colors_array = cm.magma(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, cluster in zip(toronto['Latitude'], toronto['Longitude'], toronto['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # The labels are 0-based (0 .. k-1), so they index the palette directly.
    # The previous `cluster - 1` sent cluster 0 to the last colour through
    # negative indexing.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.5,
    ).add_to(map_cluster)

# Last expression of the cell: renders the map inline.
map_cluster

### * The interactive map may not be rendered here. Please see **Map 2.PNG** in the **Images** folder.