# Importing libraries

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

print('Libraries imported.')

Libraries imported.


# Getting data from wikipedia

In [2]:
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=946126446"

toronto_data  = requests.get(url).text
toronto_data



# Creating a dataframe

In [3]:
soup = BeautifulSoup(toronto_data,"html5lib")  # Parse the data

table_contents=[]

for i in soup.find('table',{"class":"wikitable sortable"}).find('tbody').find_all('tr'):
    row = i.find_all("td")
    cell = {}
    if(row):
        if row[1].text != 'Not assigned': # Not select cells with a borough that is Not assigned.
            cell['PostalCode'] = row[0].text
            cell['Borough'] = row[1].text
            cell['Neighborhood'] = row[2].text
            table_contents.append(cell)
            
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

df.head() 

Unnamed: 0,Borough,Neighborhood,PostalCode
0,North York,Parkwoods\n,M3A
1,North York,Victoria Village\n,M4A
2,Downtown Toronto,Harbourfront\n,M5A
3,North York,Lawrence Heights\n,M6A
4,North York,Lawrence Manor\n,M6A


Check cells who have Not assigned neighborhood

In [4]:
nan_neighborhood = (df['Neighborhood'] == 'Not assigned')
df[nan_neighborhood]

Unnamed: 0,Borough,Neighborhood,PostalCode


There is no Not assigned neighborhood.

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 3 columns):
Borough         210 non-null object
Neighborhood    210 non-null object
PostalCode      210 non-null object
dtypes: object(3)
memory usage: 5.0+ KB


Dataframe has 210 rows.

In [6]:
df['PostalCode'].value_counts()

M9V    8
M8Y    8
M5V    7
M9B    5
M8Z    5
M4V    5
M1V    4
M6M    4
M9R    4
M9C    4
M5H    3
M1M    3
M1E    3
M2J    3
M8X    3
M6L    3
M5J    3
M1P    3
M3H    3
M8V    3
M1C    3
M1K    3
M1L    3
M1T    3
M6K    3
M5T    3
M5R    3
M4L    2
M6P    2
M5L    2
      ..
M7A    1
M4A    1
M4M    1
M5W    1
M6B    1
M2R    1
M6E    1
M1G    1
M4G    1
M3B    1
M4N    1
M3L    1
M4C    1
M5A    1
M2H    1
M5N    1
M3N    1
M1H    1
M2P    1
M4H    1
M4S    1
M4J    1
M4R    1
M7R    1
M4E    1
M3A    1
M1J    1
M1S    1
M1W    1
M6G    1
Name: PostalCode, Length: 103, dtype: int64

More than one neighborhood exist in one postal code area. These rows should be combined into one row with the neighborhoods separated with a comma. 

In [7]:
df = df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(list)
df = df.sample(frac=1).reset_index()
df['Neighborhood'] = df['Neighborhood'].str.join(', ')
df['Neighborhood'] = df['Neighborhood'].replace('\n','',regex=True) # Delete \n
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M6P,West Toronto,"High Park, The Junction South"
1,M3N,North York,Downsview Northwest
2,M5N,Central Toronto,Roselawn
3,M5A,Downtown Toronto,Harbourfront
4,M6L,North York,"Downsview, North Park, Upwood Park"
5,M1N,Scarborough,"Birch Cliff, Cliffside West"
6,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
7,M4W,Downtown Toronto,Rosedale
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"


In [8]:
df.shape

(103, 3)