In [1]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
import numpy as np
import json 

# Build a dataframe of the postal code of each neighborhood

In [2]:
# Fetch the Wikipedia page and parse its HTML.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, 'lxml')

In [3]:
# Flatten the first <table> on the page into a list of its text lines.
postcodetable = soup.table.text.split('\n')

# Pick out the three columns by stride.
# NOTE(review): this assumes every table row occupies exactly 5 text lines, with
# postcode / borough / neighbourhood at offsets 2 / 3 / 4 -- confirm against the
# live page layout. Element 0 of each list is the header cell.
postcode, borough, neighbourhood = (
    postcodetable[offset::5] for offset in (2, 3, 4)
)

In [4]:
# Build the postcode DataFrame from the scraped column lists.
# Element 0 of each list is the table's header cell; the rest are data rows.
column_names = [postcode[0], borough[0], neighbourhood[0]]

# Collect every row whose borough is assigned, then construct the frame once.
# (Growing a frame row by row with DataFrame.append is quadratic, and the
# method was removed in pandas 2.0.)
postcode_records = [
    {'Postcode': code, 'Borough': boro, 'Neighbourhood': hood}
    for code, boro, hood in zip(postcode[1:], borough[1:], neighbourhood[1:])
    if boro != 'Not assigned'  # ignore 'Not assigned' boroughs
]

# Column names are spelled out because the later cells index by these literals;
# passing `columns` also keeps the three columns present when no rows qualify.
postcodes = pd.DataFrame(postcode_records,
                         columns=['Postcode', 'Borough', 'Neighbourhood'])

In [5]:
# process data

# Collapse rows that share a postcode/borough into a single row whose
# neighbourhood names are comma-joined; sort=False keeps first-appearance order.
postcodes = (
    postcodes
    .groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood']
    .agg(','.join)
    .reset_index()
)

# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# The write goes through .loc on the frame itself: chained indexing
# (frame[col][rows] = ...) may assign into a temporary copy and leave the
# frame unchanged (SettingWithCopyWarning / copy-on-write).
not_assigned = postcodes['Neighbourhood'] == 'Not assigned'
i_nan = np.where(not_assigned)[0]  # row positions that get filled from Borough
postcodes.loc[not_assigned, 'Neighbourhood'] = postcodes.loc[not_assigned, 'Borough']

In [6]:
# Display the cleaned postcode table (one row per postcode/borough pair).
postcodes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [7]:
# (rows, columns) of the cleaned postcode table.
postcodes.shape

(103, 3)

# get position information

In [8]:
# Download the postcode -> latitude/longitude lookup file.
# NOTE: despite the .json file name, the next cell reads this file as
# comma-separated text with a header line.
geo_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
# Raise on an HTTP error so the success message below is only printed
# when the file was actually retrieved.
geo_response.raise_for_status()
with open('toronto_data.json', 'wb') as geo_file:
    geo_file.write(geo_response.content)
print('Data downloaded!')

Data downloaded!


In [9]:
# Read the downloaded coordinates file and build the positions table.
# The file is treated as comma-separated text: a header line followed by
# one line per postcode whose first three fields are postcode, latitude, longitude.
with open('toronto_data.json') as json_data:
    a = json_data.read()

# Header fields of the file (kept for reference).
column_names = a.split('\n')[0].split(',')

# Split the text once and collect the records, then construct the frame in one
# go. (Row-by-row DataFrame.append is quadratic and was removed in pandas 2.0.)
position_records = []
for line in a.split('\n')[1:]:
    if not line.strip():
        # A trailing newline yields an empty last line; skip it rather than
        # failing on the missing fields.
        continue
    fields = line.split(',')
    position_records.append({'Postcode': fields[0],
                             'Latitude': fields[1],
                             'Longitude': fields[2]})

# 'Postcode' (not the file's own header name) is used so the column lines up
# with the postcode table for the merge. Latitude/Longitude stay as the
# strings read from the file.
positions = pd.DataFrame(position_records,
                         columns=['Postcode', 'Latitude', 'Longitude'])

In [10]:
# Attach coordinates to each postcode row.
# With no explicit key, merge joins on the column name(s) the two frames share
# and performs an inner join, so rows without a match in both frames are dropped.
toronto = postcodes.merge(positions)

In [11]:
# Display the merged table: postcode, borough, neighbourhood plus latitude/longitude.
toronto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7532586,-79.3296565
1,M4A,North York,Victoria Village,43.7258823,-79.3155716
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.6542599,-79.3606359
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.4647633
4,M7A,Queen's Park,Queen's Park,43.6623015,-79.3894938
5,M9A,Etobicoke,Islington Avenue,43.6678556,-79.5322424
6,M1B,Scarborough,"Rouge,Malvern",43.8066863,-79.1943534
7,M3B,North York,Don Mills North,43.7459058,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.7063972,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.6571618,-79.3789371
