In [1]:
#import necessary modules

import csv
import requests
import pandas as pd
from bs4 import BeautifulSoup as BS

### Data Acquisition

The data is available in the form of a table on the Wikipedia page <a href = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'>here</a>.
The code below scrapes the data from that web page and transforms the table into a dataframe that is ready for segmentation.

The table tag we need has a class attribute associated with it, <code>class = wikitable sortable</code>, which is used below to locate it.

In [2]:
# get the response and the page data

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# requests applies no timeout by default, so a stalled connection would hang
# this cell indefinitely; fail after 30 seconds instead.
response = requests.get(url, timeout=30)

# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page in
# the cells further down.
response.raise_for_status()

In [3]:
# Parse the downloaded HTML using Python's built-in 'html.parser' backend
page = BS(markup=response.text, features='html.parser')

In [4]:
# get table by class

# find() returns the first match or None, so a missing table can be reported
# with a clear message instead of an opaque IndexError from indexing an empty
# result list.
table = page.find('table', class_="wikitable sortable")
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

# find all the table rows (find_all is the supported name; findAll is a
# deprecated alias)
table_row = table.find_all('tr')

In [5]:
# save the table as a csv file
#
# Rows whose borough is 'Not assigned' are dropped; a row whose neighbourhood
# is 'Not assigned' gets the borough name as its neighbourhood.

# Explicit UTF-8 so the file is written the same way on every platform and
# matches the default encoding pandas uses when reading it back;
# newline='' lets the csv module control the line endings.
with open('postal_code.csv', 'w', newline='', encoding='utf-8') as postal:
    writer = csv.writer(postal)

    # find all the table headers
    headers = [head.text.strip('\n') for head in table.find_all('th')]
    writer.writerow(headers)

    # the first <tr> is skipped (presumably the header row, already written above)
    for cell in table_row[1:]:

        rows = [val.text.replace('\n', '') for val in cell.find_all('td')]

        # Skip rows that do not carry all three data cells instead of
        # failing with an IndexError on rows[1] / rows[2]
        if len(rows) < 3:
            continue

        # Ignore the Not assigned Borough
        if rows[1] == 'Not assigned':
            continue

        # An unassigned neighbourhood falls back to the borough name
        if rows[2] == 'Not assigned':
            rows[2] = rows[1]

        print(rows)
        writer.writerow(rows)

['M3A', 'North York', 'Parkwoods']
['M4A', 'North York', 'Victoria Village']
['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront']
['M6A', 'North York', 'Lawrence Manor, Lawrence Heights']
['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"]
['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village']
['M1B', 'Scarborough', 'Malvern, Rouge']
['M3B', 'North York', 'Don Mills']
['M4B', 'East York', 'Parkview Hill, Woodbine Gardens']
['M5B', 'Downtown Toronto', 'Garden District, Ryerson']
['M6B', 'North York', 'Glencairn']
['M9B', 'Etobicoke', 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale']
['M1C', 'Scarborough', 'Rouge Hill, Port Union, Highland Creek']
['M3C', 'North York', 'Don Mills']
['M4C', 'East York', 'Woodbine Heights']
['M5C', 'Downtown Toronto', 'St. James Town']
['M6C', 'York', 'Humewood-Cedarvale']
['M9C', 'Etobicoke', 'Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood']
['M1E', 'Scarborough', 'Guildwood, Morni

In [6]:
# Load the CSV written by the scraping step into a dataframe
df = pd.read_csv(filepath_or_buffer='postal_code.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# (number of rows, number of columns) of the dataframe read from postal_code.csv
df.shape

(103, 3)

In [8]:
# Read the latitude / longitude recorded for each postal code
# from the local Geospatial_Coordinates.csv file
lat_lng = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

# Preview the first five rows
lat_lng.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Merge the co-ordinates with the neighbourhood to obtain the final dataframe
#
# The cell overwrites `df`, so it must be safe to run more than once: any
# coordinate columns left over from a previous run are dropped first,
# otherwise a second run would produce Latitude_x / Latitude_y style columns.
# validate='many_to_one' makes pandas raise a MergeError if a postal code
# appears more than once in lat_lng, instead of silently duplicating rows.
df = (
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
      .merge(lat_lng, on='Postal Code', how='left', validate='many_to_one')
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [10]:
# (number of rows, number of columns) of the dataframe after merging in the coordinates
df.shape

(103, 5)

In [11]:
# Persist the merged dataframe to disk.
# NOTE: with pandas' default index=True the row index is written as an
# unnamed first column of the file.
df.to_csv(path_or_buf='Toronto Neighbourhood Data.csv')