 # Extracting the Data of Toronto Neighborhoods from Wikipedia

Importing the necessary libraries to extract the data from Wikipedia

In [37]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [38]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(source, 'lxml')

table = soup.find("table")
table_rows = table.tbody.find_all("tr")

res = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [tr.text for tr in td]
    
    # Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
    if row != [] and row[1] != "Not assigned":
        # If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.
        if "Not assigned" in row[2]: 
            row[2] = row[1]
        res.append(row)

# Dataframe with 3 columns
df = pd.DataFrame(res, columns = ["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


In [39]:
#Cleaning the Data
df["PostalCode"] = df["PostalCode"].str.replace(r"\n","")
df["Borough"] = df["Borough"].str.replace(r"\n","")
df["Neighborhood"] = df["Neighborhood"].str.replace(r"\n","")
df["Neighborhood"] = df["Neighborhood"].str.replace(r"\/","")
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park Harbourfront


In [40]:
df = df.groupby(["PostalCode", "Borough"])["Neighborhood"].apply(", ".join).reset_index()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M1B,Scarborough,Malvern Rouge
2,M1C,Scarborough,Rouge Hill Port Union Highland Creek
3,M1E,Scarborough,Guildwood Morningside West Hill
4,M1G,Scarborough,Woburn


In [41]:
print("Shape: ", df.shape)

Shape:  (180, 3)
