In [6]:
import pandas as pd

# Parse every HTML table found on the Wikipedia page; row 0 of each table
# is used as the column headers.
table = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    header=0,
)

# Build the working dataframe from the first table on the page.
df = pd.DataFrame(table[0])

# Show the size as (rows, columns), then preview the first five rows.
print(df.shape)
df.head(n=5)


(287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront



# Handle rows where Borough is set but Neighbourhood is Not assigned

### Some rows have a Borough set but a Neighbourhood of "Not assigned". For those rows, set the Neighbourhood to the same value as the Borough.

In [7]:
# List rows that have a real Borough but whose Neighbourhood is still the
# 'Not assigned' placeholder.
df.loc[
    (df['Borough'] != 'Not assigned')
    & (df['Neighbourhood'] == 'Not assigned')
]

Unnamed: 0,Postcode,Borough,Neighbourhood
7,M7A,Queen's Park,Not assigned


Only one row matches (M7A, Queen's Park), so fix it by copying the Borough value into Neighbourhood.

In [9]:
# Wherever a Borough is assigned but the Neighbourhood is still the
# 'Not assigned' placeholder, copy the Borough name into Neighbourhood.
# Selecting on both conditions (instead of a hard-coded Borough name) leaves
# rows that already carry a real Neighbourhood untouched, and also covers any
# other Borough that turns up with the same gap. Re-running the cell is a
# no-op because the mask is empty once the gap has been filled.
needs_fill = (df.Borough != 'Not assigned') & (df.Neighbourhood == 'Not assigned')
df.loc[needs_fill, 'Neighbourhood'] = df.loc[needs_fill, 'Borough']

In [10]:
# Repeat the check: an empty result means no row is left with an unassigned
# Neighbourhood alongside an assigned Borough.
df[
    (df['Neighbourhood'] == 'Not assigned')
    & (df['Borough'] != 'Not assigned')
]

Unnamed: 0,Postcode,Borough,Neighbourhood


The result is empty, which means the row has been fixed.

# Remove rows where both Borough and Neighbourhood are Not assigned

In [11]:
# Drop rows in which both Borough and Neighbourhood are 'Not assigned';
# a row is kept as soon as either column holds a real value.
df = df[~((df.Borough == 'Not assigned') & (df.Neighbourhood == 'Not assigned'))]

Now check the table again after removing those rows.

In [12]:
# Size of the filtered table as (rows, columns).
print(df.shape)
# Preview the first five remaining rows.
df.head(n=5)

(210, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


# Group by Postal Code and Borough

In [17]:
# Collapse the table to one row per (Postcode, Borough) pair, joining the
# neighbourhood names of each group into a single comma-separated string.
# reset_index() turns the group keys back into ordinary columns.
new_d = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)

In [18]:
# Preview the first five grouped rows.
new_d.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Check the last 5 rows as well.

In [19]:
# Preview the last five grouped rows.
new_d.tail(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
102,M9W,Etobicoke,Northwest


In [20]:
# Size of the grouped frame as (rows, columns).
new_d.shape

(103, 3)