In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd # library for data analysis
import numpy as np # library to handle data in a vectorized manner

In [27]:
# Browser-like User-Agent header sent along with the page request.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.108 Safari/537.36'
    ),
}
# Wikipedia page whose table is scraped.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [77]:
# Fetch the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed into empty columns further down.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
html = r.content.decode()  # bytes.decode() defaults to UTF-8
# Parsed document; the get_* helper functions query this module-level name.
soup = BeautifulSoup(html, 'lxml')

### Get Data

In [45]:
def get_postcodes():
    """Collect the `.string` of every first-column cell of the wikitable.

    Reads the module-level `soup`. Entries appear in the order
    `soup.select` returns the matching `<td>` elements.
    """
    selector = '#mw-content-text > div > table.wikitable.sortable > tbody > tr>td:nth-child(1)'
    return [cell.string for cell in soup.select(selector)]

In [47]:
def get_bouroughs():
    """Collect the `.string` of every second-column cell of the wikitable.

    Reads the module-level `soup`. Entries appear in the order
    `soup.select` returns the matching `<td>` elements.
    """
    selector = '#mw-content-text > div > table.wikitable.sortable > tbody > tr>td:nth-child(2)'
    return [cell.string for cell in soup.select(selector)]

In [66]:
def get_neighbourhoods():
    """Collect the text of every third-column cell of the wikitable.

    Reads the module-level `soup`. Each cell's `get_text()` result has all
    newline characters removed before it is added to the returned list.
    """
    selector = '#mw-content-text > div > table.wikitable.sortable > tbody > tr>td:nth-child(3)'
    return [cell.get_text().replace('\n', '') for cell in soup.select(selector)]

### Transfer to DataFrame

In [67]:
# Scrape the three table columns: postal codes, boroughs, neighbourhoods.
a, b, c = get_postcodes(), get_bouroughs(), get_neighbourhoods()

In [68]:
# Assemble the scraped columns into one frame; columns follow keyword order.
toronto_df = pd.DataFrame(dict(PostalCode=a, Borough=b, Neighborhood=c))

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Ignore cells with a borough that is Not assigned.

In [70]:
# Keep only rows whose borough is assigned, then renumber the index from 0.
toronto_df_dropna = (
    toronto_df
    .loc[toronto_df["Borough"].ne("Not assigned")]
    .reset_index(drop=True)
)
toronto_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


### More than one neighborhood can exist in one postal code area

In [71]:
# One row per (PostalCode, Borough); the neighbourhood names of each group
# are joined into a single comma-separated string.
toronto_df_grouped = (
    toronto_df_dropna
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg({"Neighborhood": ", ".join})
)
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [72]:
# Rows yielded by iterrows() may be copies, so assigning to them is not
# guaranteed to change the DataFrame. Write through .loc with a boolean
# mask instead: wherever the neighbourhood is "Not assigned", copy the
# borough name into it (the right-hand side aligns on the index).
is_unassigned = toronto_df_grouped["Neighborhood"] == "Not assigned"
toronto_df_grouped.loc[is_unassigned, "Neighborhood"] = toronto_df_grouped.loc[is_unassigned, "Borough"]

toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Final Dataframe Shape

In [74]:
# Display the (rows, columns) shape of the grouped frame.
toronto_df_grouped.shape

(103, 3)