# Toronto Postal Codes

In [15]:
import pandas as pd
import numpy as np

## Scrape Web to Import Data

In [16]:
# Fetch every HTML table on the Wikipedia page for the "M" postal codes
# and keep the first one, which holds the postal code grid.
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page_tables = pd.read_html(postal_codes_url)
wiki_table = page_tables[0]
wiki_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods),M4ANorth York(Victoria Village),M5ADowntown Toronto(Regent Park / Harbourfront),M6ANorth York(Lawrence Manor / Lawrence Heights),M7ADowntown Toronto(Queen's Park / Ontario Pro...,M8ANot assigned,M9AEtobicoke(Islington Avenue)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North,M4BEast York(Parkview Hill / Woodbine Gardens),"M5BDowntown Toronto(Garden District, Ryerson)",M6BNorth York(Glencairn),M7BNot assigned,M8BNot assigned,M9BEtobicoke(West Deane Park / Princess Garden...
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park),M4CEast York(Woodbine Heights),M5CDowntown Toronto(St. James Town),M6CYork(Humewood-Cedarvale),M7CNot assigned,M8CNot assigned,M9CEtobicoke(Eringate / Bloordale Gardens / Ol...
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned,M4EEast Toronto(The Beaches),M5EDowntown Toronto(Berczy Park),M6EYork(Caledonia-Fairbanks),M7ENot assigned,M8ENot assigned,M9ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned,M4GEast York(Leaside),M5GDowntown Toronto(Central Bay Street),M6GDowntown Toronto(Christie),M7GNot assigned,M8GNot assigned,M9GNot assigned


In [17]:
# Convert from wide to long format: melt stacks every cell of the grid into a
# single 'value' column, which is the only column kept here.
melted_table = pd.melt(wiki_table)[['value']]

# Work on an independent copy so that columns added to df in later cells
# do not silently appear on melted_table as well.
df = melted_table.copy()

# Preview as the last expression of the cell so that it is actually displayed.
df.head()

## Formatting

In [18]:
# The postal code is the leading three characters of each cell's text
df['PostalCode'] = melted_table['value'].str.slice(stop=3)

# Spot-check a few random rows
df.sample(n=5)

Unnamed: 0,value,PostalCode
149,M8MNot assigned,M8M
27,M2KNorth York(Bayview Village),M2K
63,M4EEast Toronto(The Beaches),M4E
147,M8KNot assigned,M8K
122,M7CNot assigned,M7C


In [19]:
# Everything after the three-character postal code is "Borough(Neighborhood)"
df['Borough'] = df['value'].str[3:]

# Split once at the first '('; expand=True returns the two pieces as separate
# columns, which become Borough and Neighborhood
borough_parts = df['Borough'].str.split(pat="(", n=1, expand=True)
df[['Borough', 'Neighborhood']] = borough_parts

# The raw text column is no longer needed
df = df.drop('value', axis=1)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M1B,Scarborough,Malvern / Rouge)
2,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek)
3,M1E,Scarborough,Guildwood / Morningside / West Hill)
4,M1G,Scarborough,Woburn)


In [20]:
# Tidy the neighborhood text: drop the closing parenthesis left over from the
# split on "(", then separate neighborhoods with a comma instead of a slash.
# regex=False makes both patterns literal on every pandas version: ")" alone is
# not a valid regular expression, and the default of `regex` changed in pandas 2.0.
df['Neighborhood'] = df['Neighborhood'].str.replace(')', '', regex=False)
df['Neighborhood'] = df['Neighborhood'].str.replace(' / ', ', ', regex=False)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn


In [21]:
# Keep only the rows whose borough text does not contain "Not assigned"
is_unassigned = df['Borough'].str.contains("Not assigned")
df = df[~is_unassigned]

# Spot-check a few of the remaining rows
df.sample(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
65,M4H,East York,Thorncliffe Park
17,M1X,Scarborough,Upper Rouge
176,M9W,EtobicokeNorthwest,"Clairville, Humberwood, Woodbine Downs, West H..."
108,M6L,North York,"North Park, Maple Leaf Park, Upwood Park"
120,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [22]:
# Report the (rows, columns) of the DataFrame after cleaning
row_count, column_count = df.shape
print((row_count, column_count))

(103, 3)


In [33]:
# Write the DataFrame to a CSV file (relative path), leaving out the row index
output_csv = 'Dataset_Toronto_Postal_Codes.csv'
df.to_csv(output_csv, index=False)