# Data handling
Ignore cells whose borough is "Not assigned".

More than one neighborhood can exist in one postal code; these will be combined with the neighborhoods separated with a comma

If a cell has a borough but its neighborhood is "Not assigned", then the neighborhood will be the same as the borough.

In [193]:
# bs4 ref: https://www.digitalocean.com/community/tutorials/how-to-scrape-web-pages-with-beautiful-soup-and-python-3
# bs4 ref: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

import pandas as pd
from bs4 import BeautifulSoup
import requests

# Download the Wikipedia "List of postal codes of Canada: M" page.
# The timeout keeps the script from hanging indefinitely on a stalled connection.
result = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
# Fail fast on an HTTP error instead of trying to parse an error page.
result.raise_for_status()
soup = BeautifulSoup(result.text, 'lxml')

# First <table> whose class attribute is exactly "wikitable sortable".
table_list = soup.find("table", class_="wikitable sortable")
if table_list is None:
    # Without this check the failure would only show up later as an
    # AttributeError on `table_list.find_all`.
    raise ValueError('no <table class="wikitable sortable"> found in the page')

# Column accumulators, filled one table row at a time.
PostalCode = []
Borough = []
Neighborhood = []

for entry in table_list.find_all('tr'):
    items = entry.find_all('td')
    # Only rows with exactly three <td> cells are data rows; anything else
    # (presumably the header row) is skipped.
    if len(items) == 3:
        # Strip surrounding whitespace from every cell. Slicing off the last
        # character unconditionally would truncate real text when a cell has
        # no trailing newline, and an unstripped postal code or borough would
        # break later exact-string comparisons.
        PostalCode.append(items[0].text.strip())
        Borough.append(items[1].text.strip())
        Neighborhood.append(items[2].text.strip())

# One row per scraped table row, columns in the order listed here.
df = pd.DataFrame({"PostalCode": PostalCode, "Borough": Borough, "Neighborhood": Neighborhood})

# Ignore cells with a borough that is Not assigned.
# .copy() makes the filtered frame independent of the original so the
# assignment below is unambiguous and raises no SettingWithCopyWarning.
df = df[df.Borough != "Not assigned"].copy()

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
# `loc` is a boolean mask selecting the rows whose neighborhood is missing.
loc = df.Neighborhood == "Not assigned"
# Single-step .loc assignment: chained indexing (df["col"][mask] = ...) may
# write to a temporary and leave `df` unchanged.
df.loc[loc, "Neighborhood"] = df.loc[loc, "Borough"]

# More than one neighborhood can exist in one postal code; these will be combined with the neighborhoods separated with a comma.
# A single groupby/agg replaces the row-by-row DataFrame.append loop
# (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway).
# Result: one row per postal code, sorted by postal code, with columns
# PostalCode, Borough (first value seen in the group) and Neighborhood
# (all values in the group joined with ", "), on a fresh 0..n-1 index.
df_final = df.groupby("PostalCode", as_index=False).agg(
    {"Borough": "first", "Neighborhood": ", ".join}
)

print(df_final.shape)

(103, 3)
