In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Scrape the data from the webpage

In [2]:
# Source page: Wikipedia's list of Canadian postal codes that start with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Bound the wait with a timeout and fail loudly on an HTTP error status, so an
# error page is never parsed as if it were the article.
response=requests.get(url,timeout=30)
response.raise_for_status()
soup=BeautifulSoup(response.text,'html.parser')
wikitable=soup.find('table',{'class':'wikitable sortable'})
if wikitable is None:
    raise ValueError("no 'wikitable sortable' table found at "+url)
# Use the <tbody> when the markup has one; otherwise fall back to the table
# element itself instead of failing on a missing attribute.
table=wikitable.tbody if wikitable.tbody is not None else wikitable
rows=table.find_all('tr')
if not rows:
    raise ValueError("the postal code table at "+url+" contains no rows")
# The first row holds the header cells; strip the newlines embedded in their text.
columns= [v.text.replace('\n','') for v in rows[0].find_all('th')]
# Start from an empty frame that carries only the scraped column names.
df=pd.DataFrame(columns=columns)
df

Unnamed: 0,Postcode,Borough,Neighbourhood


In [3]:
# Copy every data row of the web table (everything after the header row) into
# the dataframe.
# The records are collected first and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and appending row by row
# re-copied the whole frame on every iteration.
records=[]
for row in rows[1:]:
    tds=row.find_all('td')
    # Only the third cell has its newlines removed, as before.
    records.append([tds[0].text,tds[1].text,tds[2].text.replace('\n','')])
df=pd.DataFrame(records,columns=columns)
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# Show the (rows, columns) size of the scraped table before any cleaning.
df.shape

(288, 3)

### Clean the dataframe

In [5]:
# Keep only the rows whose "Borough" value is something other than 'Not assigned'.
has_borough=df['Borough'].ne('Not assigned')
df=df.loc[has_borough]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [6]:
# Renumber the remaining rows from 0 in place; drop=True discards the old
# index labels instead of keeping them as a new column.
df.reset_index(drop=True,inplace=True)

In [7]:
# If a row has a borough but a 'Not assigned' neighbourhood, the neighbourhood
# becomes the same as the borough.
# The write goes through a single .loc call. The earlier chained form
# (df.iloc[i][2] = ...) assigned into a temporary row object, which pandas does
# not guarantee to propagate back to df and which never does under
# copy-on-write.
missing_neighbourhood=df['Neighbourhood']=='Not assigned'
df.loc[missing_neighbourhood,'Neighbourhood']=df.loc[missing_neighbourhood,'Borough']
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [8]:
# One postal code area can contain several neighbourhoods: on the Wikipedia
# page, M5A appears twice, once for Harbourfront and once for Regent Park.
# Gather the neighbourhoods of each (Postcode, Borough) pair into a single
# list so that every postal code ends up on one row.
by_postcode=df.groupby(by=['Postcode','Borough'])
df=by_postcode['Neighbourhood'].apply(list)
df

Postcode  Borough    
M1B       Scarborough                                     [Rouge, Malvern]
M1C       Scarborough             [Highland Creek, Rouge Hill, Port Union]
M1E       Scarborough                  [Guildwood, Morningside, West Hill]
M1G       Scarborough                                             [Woburn]
M1H       Scarborough                                          [Cedarbrae]
                                               ...                        
M9N       York                                                    [Weston]
M9P       Etobicoke                                            [Westmount]
M9R       Etobicoke      [Kingsview Village, Martin Grove Gardens, Rich...
M9V       Etobicoke      [Albion Gardens, Beaumond Heights, Humbergate,...
M9W       Etobicoke                                            [Northwest]
Name: Neighbourhood, Length: 103, dtype: object

In [10]:
# Collapse each list of neighbourhoods into one comma-separated string, then
# move Postcode and Borough from the index back into ordinary columns.
neighbourhood_text=df.apply(','.join)
df=neighbourhood_text.to_frame().reset_index()
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"
