In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = requests.get(url).text

### Parse wiki doc

In [3]:
soup = BeautifulSoup(html,'html.parser')

In [4]:
df = pd.DataFrame(columns=['PostalCode','Borough','Neighborhood'])
rows = soup.findAll('tr')
i=-0
for tr in rows:
    cells =tr.findAll('td')
    c1= [c.text for c in cells]
    if(len(c1) != 0 and c1[0].startswith('M') and ~c1[1].find("Not assigned")>=0):    
        df.loc[i] = [c1[0],c1[1],c1[2] ]
        i=i+1
       

In [5]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n


In [6]:
df.columns

Index(['PostalCode', 'Borough', 'Neighborhood'], dtype='object')

In [7]:
df.shape

(212, 3)

#### Replace unwanted text '\n' at the end of Neighborhood

In [8]:
df['Neighborhood'] = df['Neighborhood'].str.replace('\n','')

In [9]:
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


#### If borough is there but Neighborhood is not assigned then borough and neighborhood should be same

In [10]:
df[df['Borough'].str.contains('Park')]

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M7A,Queen's Park,Not assigned


In [11]:
#df[df['Neighborhood'].str.contains('Not')].apply(lambda col: df['Borough'])
df.loc[df['Neighborhood']=='Not assigned'] = df['Borough']


In [12]:
df[df['Borough'].str.contains('Park')]

Unnamed: 0,PostalCode,Borough,Neighborhood
6,Queen's Park,Queen's Park,Queen's Park


#### Merger more than one same neighborhood into a single row sepearted by columns

In [13]:
# Example row
df[df['PostalCode'].str.contains('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park


In [14]:
#Merge all rows and create dataframe
dfNew = df.groupby(['PostalCode','Borough'],sort=False).agg(lambda col: ', '.join(col))
dfNew.reset_index(inplace=True)
dfNew[dfNew['PostalCode'].str.contains('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"


In [15]:
dfNew.shape

(103, 3)