In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

### Get information from Wiki

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text   
soup = BeautifulSoup(source, 'html.parser')

In [3]:
PostalCodeList = []
BoroughList = []
NeighborhoodList = []
tbody = soup.find('tbody')

### Collect the Data

In [4]:
for index, value in enumerate(tbody.find_all('td')):
    if (index%3 == 0):
        PostalCodeList.append(value.text.strip())
    elif(index%3 == 1):
        BoroughList.append(value.text.strip())
    else:
        NeighborhoodList.append(value.text.strip())
dataDic = { "PostalCode":PostalCodeList, "Borough":BoroughList, "Neighborhood": NeighborhoodList }

### Translate to DataFrame

In [5]:
df = pd.DataFrame.from_dict(dataDic)
print( df.head() )

            Borough      Neighborhood PostalCode
0      Not assigned      Not assigned        M1A
1      Not assigned      Not assigned        M2A
2        North York         Parkwoods        M3A
3        North York  Victoria Village        M4A
4  Downtown Toronto      Harbourfront        M5A


### Remove column if Boroug is "Not assigned"

In [6]:
df = df[df.Borough != 'Not assigned']
df.reset_index(drop=True, inplace=True)
print( df.head() )

            Borough      Neighborhood PostalCode
0        North York         Parkwoods        M3A
1        North York  Victoria Village        M4A
2  Downtown Toronto      Harbourfront        M5A
3  Downtown Toronto       Regent Park        M5A
4        North York  Lawrence Heights        M6A


In [7]:
aggregate_fun = {'PostalCode': 'first',
                 "Borough": 'first', 
                 "Neighborhood": lambda col: ','.join(col)}
df_new = df.groupby(df['PostalCode']).aggregate(aggregate_fun)

df_new.reset_index(drop=True, inplace=True)
#df_new

In [8]:
for index, row in df_new.iterrows():
    if (row.Neighborhood) == 'Not assigned':
        row.Neighborhood = row.Borough


In [9]:
print(df_new.iloc[85])

Neighborhood    Queen's Park
Borough         Queen's Park
PostalCode               M7A
Name: 85, dtype: object


### The shape of List of postal codes of Canada

In [15]:
df_new.shape

(103, 3)

### Get the atitude and the longitude coordinates  by geocoder

In [14]:
http://cocl.us/Geospatial_data

ImportError: No module named 'geocoder'

In [16]:
!wget -O geospatial.csv http://cocl.us/Geospatial_data

--2019-03-11 07:32:22--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 159.8.72.228
Connecting to cocl.us (cocl.us)|159.8.72.228|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2019-03-11 07:32:22--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|159.8.72.228|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-03-11 07:32:23--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.24.197, 107.152.25.197
Connecting to ibm.box.com (ibm.box.com)|107.152.24.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-03-11 07:32:23--  https://ibm.box.com/public/static/9afzr83pps

In [37]:
df_geo = pd.read_csv('geospatial.csv')
df_geo['PostalCode'] = df_geo['Postal Code']
df_geo_new = df_geo.drop(['Postal Code'], axis=1)
df_geo_new.head()


Unnamed: 0,Latitude,Longitude,PostalCode
0,43.806686,-79.194353,M1B
1,43.784535,-79.160497,M1C
2,43.763573,-79.188711,M1E
3,43.770992,-79.216917,M1G
4,43.773136,-79.239476,M1H


In [38]:
df_geo_new_index = df_geo_new.set_index('PostalCode')
df_geo_new_index.head()

Unnamed: 0_level_0,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [39]:
result = df_new.join(df_geo_new_index, on='PostalCode')
result

Unnamed: 0,Neighborhood,Borough,PostalCode,Latitude,Longitude
0,"Rouge,Malvern",Scarborough,M1B,43.806686,-79.194353
1,"Highland Creek,Rouge Hill,Port Union",Scarborough,M1C,43.784535,-79.160497
2,"Guildwood,Morningside,West Hill",Scarborough,M1E,43.763573,-79.188711
3,Woburn,Scarborough,M1G,43.770992,-79.216917
4,Cedarbrae,Scarborough,M1H,43.773136,-79.239476
5,Scarborough Village,Scarborough,M1J,43.744734,-79.239476
6,"East Birchmount Park,Ionview,Kennedy Park",Scarborough,M1K,43.727929,-79.262029
7,"Clairlea,Golden Mile,Oakridge",Scarborough,M1L,43.711112,-79.284577
8,"Cliffcrest,Cliffside,Scarborough Village West",Scarborough,M1M,43.716316,-79.239476
9,"Birch Cliff,Cliffside West",Scarborough,M1N,43.692657,-79.264848
