In [40]:
import requests
from bs4 import BeautifulSoup

# Download the Wikipedia article that lists the "M" postal codes.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() prevents an HTTP error page from being parsed as if it
# were the article.
res = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

# Every element matching the CSS selector for a sortable wikitable.
table_data = soup.select('table.wikitable.sortable')

# Split the text of the matched table into one string per line. If several
# tables match, the last non-empty one wins (same as the original loop).
postal_codes = None
for table in table_data:
    table_text = table.get_text()
    if len(table_text) > 0:
        postal_codes = table_text.split('\n')

# Fail loudly instead of leaving `postal_codes` undefined (or silently reusing a
# stale value from a previous run) when the page layout no longer matches.
if postal_codes is None:
    raise ValueError("No non-empty 'table.wikitable.sortable' element was found on the page.")

### Scrape Toronto postal codes from Wikipedia

Then build a list that holds each table cell as a separate element.

In [48]:
# Splitting the table text on newlines leaves empty strings between cells;
# keep only the non-empty entries, in their original order.
postal_codes = list(filter(lambda cell: cell != '', postal_codes))
postal_codes

['Postcode',
 'Borough',
 'Neighbourhood',
 'M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A',
 'North York',
 'Victoria Village',
 'M5A',
 'Downtown Toronto',
 'Harbourfront',
 'M6A',
 'North York',
 'Lawrence Heights',
 'M6A',
 'North York',
 'Lawrence Manor',
 'M7A',
 'Downtown Toronto',
 "Queen's Park",
 'M8A',
 'Not assigned',
 'Not assigned',
 'M9A',
 'Etobicoke',
 'Islington Avenue',
 'M1B',
 'Scarborough',
 'Rouge',
 'M1B',
 'Scarborough',
 'Malvern',
 'M2B',
 'Not assigned',
 'Not assigned',
 'M3B',
 'North York',
 'Don Mills North',
 'M4B',
 'East York',
 'Woodbine Gardens',
 'M4B',
 'East York',
 'Parkview Hill',
 'M5B',
 'Downtown Toronto',
 'Ryerson',
 'M5B',
 'Downtown Toronto',
 'Garden District',
 'M6B',
 'North York',
 'Glencairn',
 'M7B',
 'Not assigned',
 'Not assigned',
 'M8B',
 'Not assigned',
 'Not assigned',
 'M9B',
 'Etobicoke',
 'Cloverdale',
 'M9B',
 'Etobicoke',
 'Islington',
 'M9B',

In [128]:
import numpy as np

# Turn the flat list of table cells into a NumPy array so it can be reshaped
# into rows. dtype=object stores the cells as ordinary Python objects rather
# than a fixed-width string dtype.
postal_codes = np.array(
    postal_codes,
    dtype=object,
)

### Convert `postal_codes` to a NumPy array, then reshape it.

Because each row of the table has 3 components, reshape the array so that it has 3 columns.

In [129]:
# Fold the flat array into rows of three cells each
# (postcode, borough, neighbourhood).
postal_codes = postal_codes.reshape(-1, 3)

# Drop the header row, but only while it is still there. This cell reassigns
# `postal_codes` to itself, so an unconditional `[1:, :]` would discard one more
# real data row every time the cell is re-run. A data row is recognised by its
# first cell having the letter-digit-letter shape of a postal code (e.g. 'M3A');
# anything else in the first row is treated as the header.
if len(postal_codes) > 0:
    first_cell = str(postal_codes[0, 0])
    first_row_is_data = (
        len(first_cell) == 3
        and first_cell[0].isalpha()
        and first_cell[1].isdigit()
        and first_cell[2].isalpha()
    )
    if not first_row_is_data:
        postal_codes = postal_codes[1:, :]

In [130]:
# Display the reshaped array to inspect its rows.
postal_codes

array([['M3A', 'North York', 'Parkwoods'],
       ['M4A', 'North York', 'Victoria Village'],
       ['M5A', 'Downtown Toronto', 'Harbourfront'],
       ['M6A', 'North York', 'Lawrence Heights'],
       ['M6A', 'North York', 'Lawrence Manor'],
       ['M7A', 'Downtown Toronto', "Queen's Park"],
       ['M8A', 'Not assigned', 'Not assigned'],
       ['M9A', 'Etobicoke', 'Islington Avenue'],
       ['M1B', 'Scarborough', 'Rouge'],
       ['M1B', 'Scarborough', 'Malvern'],
       ['M2B', 'Not assigned', 'Not assigned'],
       ['M3B', 'North York', 'Don Mills North'],
       ['M4B', 'East York', 'Woodbine Gardens'],
       ['M4B', 'East York', 'Parkview Hill'],
       ['M5B', 'Downtown Toronto', 'Ryerson'],
       ['M5B', 'Downtown Toronto', 'Garden District'],
       ['M6B', 'North York', 'Glencairn'],
       ['M7B', 'Not assigned', 'Not assigned'],
       ['M8B', 'Not assigned', 'Not assigned'],
       ['M9B', 'Etobicoke', 'Cloverdale'],
       ['M9B', 'Etobicoke', 'Islington'],
       [

### Delete the rows that have an unassigned borough.

In [131]:
# Keep only the rows whose borough (column 1) is assigned.
# Boolean-mask indexing selects all matching rows in one step and returns a new
# array, so the result always has 3 columns (even when no row matches) and the
# array is not re-copied for every appended row as it was with np.append in a loop.
is_assigned = postal_codes[:, 1] != 'Not assigned'
assigned_postal_codes = postal_codes[is_assigned]

assigned_postal_codes

array([['M3A', 'North York', 'Parkwoods'],
       ['M4A', 'North York', 'Victoria Village'],
       ['M5A', 'Downtown Toronto', 'Harbourfront'],
       ['M6A', 'North York', 'Lawrence Heights'],
       ['M6A', 'North York', 'Lawrence Manor'],
       ['M7A', 'Downtown Toronto', "Queen's Park"],
       ['M9A', 'Etobicoke', 'Islington Avenue'],
       ['M1B', 'Scarborough', 'Rouge'],
       ['M1B', 'Scarborough', 'Malvern'],
       ['M3B', 'North York', 'Don Mills North'],
       ['M4B', 'East York', 'Woodbine Gardens'],
       ['M4B', 'East York', 'Parkview Hill'],
       ['M5B', 'Downtown Toronto', 'Ryerson'],
       ['M5B', 'Downtown Toronto', 'Garden District'],
       ['M6B', 'North York', 'Glencairn'],
       ['M9B', 'Etobicoke', 'Cloverdale'],
       ['M9B', 'Etobicoke', 'Islington'],
       ['M9B', 'Etobicoke', 'Martin Grove'],
       ['M9B', 'Etobicoke', 'Princess Gardens'],
       ['M9B', 'Etobicoke', 'West Deane Park'],
       ['M1C', 'Scarborough', 'Highland Creek'],
       ['

### We now have a NumPy array containing the postal code information!

Now let's combine the rows that share the same postal code. Their neighborhoods will be joined into one string, separated by commas.

**For example**, as shown below, for the rows with postal code 'M6A':

'Lawrence Heights',  'Lawrence Manor' => 'Lawrence Heights, Lawrence Manor'

In [132]:
# Merge rows that share a postal code into a single row whose neighbourhood
# column is the comma-separated list of all their neighbourhoods.
#
# The rows are collected in a dict keyed by postal code. Dicts preserve
# insertion order (Python 3.7+), so each postal code keeps the position of its
# first occurrence, and the borough is taken from that first occurrence.
# Keying on the postal code alone also means the duplicate check looks only at
# column 0, and a repeat of the very first postal code is merged like any other.
merged_rows = {}
for code, borough, neighbourhood in assigned_postal_codes:
    if code in merged_rows:
        merged_rows[code][2] += ', ' + neighbourhood
    else:
        merged_rows[code] = [code, borough, neighbourhood]

# Build the array once at the end. reshape(-1, 3) keeps the result
# two-dimensional with 3 columns even when there are no rows at all.
unique_postal_codes = np.array(list(merged_rows.values()), dtype=object).reshape(-1, 3)

unique_postal_codes

array([['M3A', 'North York', 'Parkwoods'],
       ['M4A', 'North York', 'Victoria Village'],
       ['M5A', 'Downtown Toronto', 'Harbourfront'],
       ['M6A', 'North York', 'Lawrence Heights, Lawrence Manor'],
       ['M7A', 'Downtown Toronto', "Queen's Park"],
       ['M9A', 'Etobicoke', 'Islington Avenue'],
       ['M1B', 'Scarborough', 'Rouge, Malvern'],
       ['M3B', 'North York', 'Don Mills North'],
       ['M4B', 'East York', 'Woodbine Gardens, Parkview Hill'],
       ['M5B', 'Downtown Toronto', 'Ryerson, Garden District'],
       ['M6B', 'North York', 'Glencairn'],
       ['M9B', 'Etobicoke',
        'Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park'],
       ['M1C', 'Scarborough', 'Highland Creek, Rouge Hill, Port Union'],
       ['M3C', 'North York', 'Flemingdon Park, Don Mills South'],
       ['M4C', 'East York', 'Woodbine Heights'],
       ['M5C', 'Downtown Toronto', 'St. James Town'],
       ['M6C', 'York', 'Humewood-Cedarvale'],
       ['M9C', 'Etob

### Now, let's make a DataFrame of the postal codes!

In [137]:
import pandas as pd

# Wrap the merged array in a DataFrame, naming its three columns.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df_postal_code = pd.DataFrame(data=unique_postal_codes, columns=column_names)
df_postal_code

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


### Check the size of the DataFrame

In [138]:
# (number of rows, number of columns) of the final DataFrame.
df_postal_code.shape

(103, 3)