# 1, Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from bs4 import BeautifulSoup
import requests

# 2, Scrape data into a dataframe

In [2]:
# Source page: Wikipedia's list of Canadian postal codes starting with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection; raise_for_status() stops here on an HTTP error instead of
# handing an error page to the parser.
response = requests.get(url=url, timeout=30)
response.raise_for_status()
content = response.text
# Parse the downloaded HTML with the built-in "html.parser" backend
soup = BeautifulSoup(content, "html.parser")

In [3]:
# Locate the first table carrying the "wikitable" class and fail with a
# clear message if it is missing, rather than with an AttributeError on None.
s_table = soup.find("table", {"class": "wikitable"})
if s_table is None:
    raise ValueError("No table with class 'wikitable' found in the page")

# Header cells (<th>) become column names, data cells (<td>) become values;
# newline characters are removed from both.
s_col = s_table.find_all("th")
s_cell = s_table.find_all("td")
cols = [i.get_text().replace("\n", "") for i in s_col]
cells = [i.get_text().replace("\n", "") for i in s_cell]

# The data cells are listed row by row, so every n_cols-th cell starting at
# offset k belongs to column k. Using the header count as the stride avoids
# hard-coding the number of columns.
n_cols = len(cols)
vals = [cells[k::n_cols] for k in range(n_cols)]
df = pd.DataFrame(dict(zip(cols, vals)))
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# 3, Remove rows with a "Not assigned" borough

In [4]:
# Collect the rows whose borough is the "Not assigned" placeholder, then
# drop them by index label; df itself is left untouched.
unassigned_rows = df[df["Borough"] == "Not assigned"]
df1 = df.drop(index=unassigned_rows.index)

# 4, Combine neighbourhoods that share a postcode and borough into one row

In [5]:
def combine_nei(x):
    """Join neighbourhood names into a single comma-separated string.

    x is an iterable of strings; the result keeps the items in their
    original order, separated by ", ".
    """
    separator = ", "
    combined = separator.join(x)
    return combined
# One output row per (Postcode, Borough) pair: the neighbourhood names of
# each group are joined by combine_nei, and the group keys are turned back
# into ordinary columns by reset_index().
grouped_nei = df1.groupby(["Postcode", "Borough"])["Neighbourhood"]
df2 = grouped_nei.apply(combine_nei).to_frame().reset_index()
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# 5, Replace "Not assigned" neighbourhoods with the borough name

In [6]:
# Rows whose combined neighbourhood text still contains the "Not assigned"
# placeholder; kept in test_na so the same rows can be inspected again later.
has_placeholder = df2["Neighbourhood"].str.contains("Not assigned")
test_na = df2[has_placeholder]
test_na

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Not assigned


In [7]:
# Where the neighbourhood is exactly "Not assigned", take the borough name
# from the same row. Series.mask(cond, other) is the documented way to do a
# row-aligned conditional replacement and gives the same result when re-run.
# The previous Series.replace(scalar, Series) call passed a Series as the
# replacement value, which is not a documented form of replace.
# NOTE(review): that form appears to raise ValueError on newer pandas
# releases; confirm against the pandas version in use.
not_assigned = df2["Neighbourhood"] == "Not assigned"
df2["Neighbourhood"] = df2["Neighbourhood"].mask(not_assigned, df2["Borough"])
# Show the rows found earlier to confirm the placeholder is gone
df2.loc[test_na.index]

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Queen's Park


# 6, Show the shape (rows, columns) of the result

In [8]:
# .shape is (number of rows, number of columns) of the combined table
df2.shape

(103, 3)

# 7, Create an output file for future use

In [9]:
# Display the combined table that is written to disk in the following cell
df2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [10]:
# Save the combined table as CSV; the relative path resolves against the
# current working directory.
output_path = "file1.csv"
df2.to_csv(output_path)