# Segmenting and Clustering Neighborhoods in Toronto 

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

# Install BeautifulSoup into the kernel's conda environment before importing it.
# NOTE(review): the install is unpinned and its package plan also lists numpy, openssl
# and certifi builds, so running this cell can change the environment after pandas and
# numpy were already imported above — consider pinning the version or installing
# outside the notebook.
!conda install -c anaconda beautifulsoup4 --yes
from bs4 import BeautifulSoup

# Simple marker that the cell ran to completion.
print('libraries imported')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.0

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    numpy-base-1.15.4          |   py36h81de0dd_0         4.2 MB  anaconda
    numpy-1.15.4               |   py36h1d66e8a_0          35 KB  anaconda
    beautifulsoup4-4.8.1       |           py36_0         153 KB  anaconda
    openssl-1.1.1              |       h7b6447c_0         5.0 MB  anaconda
    soupsieve-1.9.5            |           py36_0          61 KB  anaconda
    mkl_fft-1.0.6              |   py36h7dd41cf_0         150 KB  anaconda
    certifi-2019.11.28         |           py36_0         156 KB  anaconda
    blas-1.0                   |           

In [2]:
import json 
import requests

import matplotlib.pyplot as plt
%matplotlib inline

### 1. Fetching html from Wikipedia

In [35]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error page be
# parsed as if it were the postal-code table.
website_url=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
website_url.raise_for_status()

### 2. Parsing html using BeautifulSoup

In [36]:
# Parse the raw response bytes with Python's built-in 'html.parser' backend.
soup=BeautifulSoup(website_url.content,'html.parser')
# Uncomment to inspect the whole parsed document (very long output):
#print(soup.prettify())

In [37]:
# Locate the postal-code table by its class attribute.
postal_table=soup.find('table',attrs= {'class':'wikitable sortable'})

# find() returns None when nothing matches; fail here with a clear message rather than
# with an AttributeError in a later cell that calls postal_table.find_all(...).
if postal_table is None:
    raise ValueError("No <table class='wikitable sortable'> found in the downloaded page")

postal_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_

In [38]:
# Column names come from the table's <th> cells: newlines are turned into spaces
# and surrounding whitespace is trimmed.
t_headers = [
    header_cell.text.replace('\n', ' ').strip()
    for header_cell in postal_table.find_all("th")
]

### 3. Extracting the table rows into a DataFrame

In [39]:
# Build one dict per <tr>, pairing each <td> with the header in the same position.
# The header row has no <td> cells, so it produces an empty dict, which becomes an
# all-NaN first row in the DataFrame.
table_data = [
    {
        header: cell.text.replace("\n", "").strip()
        for cell, header in zip(row.find_all("td"), t_headers)
    }
    for row in postal_table.find_all("tr")
]

print(table_data)
postal_canada = pd.DataFrame(table_data)

[{}, {'Postcode': 'M1A', 'Borough': 'Not assigned', 'Neighborhood': 'Not assigned'}, {'Postcode': 'M2A', 'Borough': 'Not assigned', 'Neighborhood': 'Not assigned'}, {'Postcode': 'M3A', 'Borough': 'North York', 'Neighborhood': 'Parkwoods'}, {'Postcode': 'M4A', 'Borough': 'North York', 'Neighborhood': 'Victoria Village'}, {'Postcode': 'M5A', 'Borough': 'Downtown Toronto', 'Neighborhood': 'Harbourfront'}, {'Postcode': 'M6A', 'Borough': 'North York', 'Neighborhood': 'Lawrence Heights'}, {'Postcode': 'M6A', 'Borough': 'North York', 'Neighborhood': 'Lawrence Manor'}, {'Postcode': 'M7A', 'Borough': "Queen's Park", 'Neighborhood': 'Not assigned'}, {'Postcode': 'M8A', 'Borough': 'Not assigned', 'Neighborhood': 'Not assigned'}, {'Postcode': 'M9A', 'Borough': 'Downtown Toronto', 'Neighborhood': "Queen's Park"}, {'Postcode': 'M1B', 'Borough': 'Scarborough', 'Neighborhood': 'Rouge'}, {'Postcode': 'M1B', 'Borough': 'Scarborough', 'Neighborhood': 'Malvern'}, {'Postcode': 'M2B', 'Borough': 'Not assign

### 4. Deleting data where Borough = 'Not assigned'

In [40]:
# Keep only rows that carry a real borough.
# Rows with a missing Borough (the empty record produced by the table's header row) are
# excluded explicitly instead of dropping "whatever row is first" by position: that
# positional drop removed a genuine data row each time the cell was re-run.
# .copy() makes the result an independent frame rather than a slice of the original.
has_borough = postal_canada['Borough'].notna() & (postal_canada['Borough'] != 'Not assigned')
postal_canada = postal_canada[has_borough].copy()

In [41]:
# (rows, columns) remaining after the unassigned boroughs were removed.
postal_canada.shape

(210, 3)

In [42]:
# Preview the first rows of the filtered table.
postal_canada.head()

Unnamed: 0,Postcode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor


### 5. Counting unique Postcode, Borough, and Neighborhood values

In [43]:
# Report how many distinct values each column holds.
for label, column in (
    ('Unique Postcodes', 'Postcode'),
    ('Unique Borough', 'Borough'),
    ('Unique Neighbourhood', 'Neighborhood'),
):
    print(label, str(postal_canada[column].nunique()))

Unique Postcodes 103
Unique Borough 11
Unique Neighbourhood 208


### 6. Sort the DataFrame by Postcode and combine the Neighborhoods of rows that share a Postcode

In [44]:
# Sort by Postcode so that rows sharing a postcode are adjacent before they are
# merged in a later cell.
postal_canada=postal_canada.sort_values(by='Postcode')

In [45]:
# Sorting only reorders rows, so the shape should be unchanged.
postal_canada.shape

(210, 3)

In [47]:
# NOTE(review): the GroupBy object built here is never assigned or aggregated, so this
# cell only displays its repr and leaves postal_canada unchanged.
postal_canada.groupby(['Postcode','Borough','Neighborhood'],as_index=False)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fa359640320>

In [48]:
# Work on an independent copy: a plain assignment would only alias postal_canada, so
# in-place edits made to canada_data afterwards would also modify postal_canada.
canada_data = postal_canada.copy()

In [49]:
# Position of the last row, i.e. the row count minus one.
no_of_rows = canada_data.shape[0] - 1
no_of_rows

209

In [50]:
# Collapse all rows that share a Postcode into a single row:
#   * Borough      -> the value from the first row of the group
#   * Neighborhood -> the group's neighborhoods joined with ',' in their current row order
# A single groupby/agg replaces the row-by-row merge, which dropped a row and rebuilt
# the index on every match and relied on positional column numbers and on a counter
# defined in a different cell. This form selects columns by name and gives the same
# result when the cell is re-run.
canada_data = (
    canada_data
    .groupby('Postcode', as_index=False)
    .agg({'Borough': 'first', 'Neighborhood': ','.join})
    [['Postcode', 'Borough', 'Neighborhood']]  # pin the column order explicitly
)

# Invariant for the following cells: every postcode appears exactly once.
assert canada_data['Postcode'].is_unique, "duplicate postcodes remain after merging"


### 7. If Neighborhood = 'Not assigned' then Neighborhood = Borough

In [51]:
# Where no neighborhood was assigned, fall back to the row's own borough name.
unassigned = canada_data['Neighborhood'] == 'Not assigned'
canada_data.loc[unassigned, 'Neighborhood'] = canada_data.loc[unassigned, 'Borough']

In [53]:
# Report how many distinct values each column holds in the merged table.
for label, column in (
    ('Unique Postcodes', 'Postcode'),
    ('Unique Borough', 'Borough'),
    ('Unique Neighbourhood', 'Neighborhood'),
):
    print(label, str(canada_data[column].nunique()))

Unique Postcodes 103
Unique Borough 11
Unique Neighbourhood 102


In [55]:
# After merging, one row per unique postcode is expected.
canada_data.shape

(103, 3)

In [56]:
# Preview the first ten rows of the merged table.
canada_data.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Golden Mile,Oakridge,Clairlea"
8,M1M,Scarborough,"Cliffcrest,Scarborough Village West,Cliffside"
9,M1N,Scarborough,"Cliffside West,Birch Cliff"
