In [2]:
import numpy as np
import pandas as pd
from lxml import html # Tool for webscraping
import requests # HTTP library to load wiki page
import re
from geopy.geocoders import Nominatim 

#### Making the request and scraping the data

In this part I used __lxml__ to scrape the data from the wiki webpage, I made this request using the __requests__ library

In [3]:
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
tree = html.document_fromstring(page.content)
data = tree.cssselect('.wikitable')
table = html.document_fromstring(data[0].text_content())



In [4]:
postal_info = list(filter(None, data[0].text_content().split('\n'))) # Spliting the raw table data to get a list

In [5]:
postal_info_data = postal_info[3:] # Get rid the first row

#### Creating the matrix

Using reshape on a numpy array that has a list with the Neighborhoods data

In [6]:
list_of_neigh = pd.DataFrame(np.array(postal_info_data).reshape((289,3)))

In [7]:
list_of_neigh = list_of_neigh.rename(columns={0:'PostalCode', 1:'Borough', 2: 'Neighborhood'}) # Adding column names

#### Convertign 'Not assigned' to NaN

To later use dataframe.dropna() to remove those unwanted rows

In [8]:
def cleanData(x):
    if(x == 'Not assigned'):
        return np.nan
    else:
        return x

list_of_neigh['Borough'] = list_of_neigh['Borough'].apply(lambda x: cleanData(x))

Removing all the rows with NaN 

In [9]:
list_of_neigh = list_of_neigh.dropna()
list_of_neigh = list_of_neigh.reset_index(drop=True)

In [10]:
postal_codes = list_of_neigh.groupby('Borough')['PostalCode'].unique()
#postal_codes['Central Toronto']

In [11]:
boroughs = list_of_neigh.groupby('Borough')['Neighborhood'].apply(lambda tags: ','.join(tags))
df_boroughs = pd.DataFrame(boroughs).reset_index()
#postal_codes.apply(lambda x: print(x))
df_boroughs['PostalCode'] = np.nan

for ind, item in enumerate(postal_codes):
    df_boroughs['PostalCode'][ind] = postal_codes[ind][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
# Changes the order of columns
reorder = [df_boroughs.columns[-1]] + list(df_boroughs.columns[:-1])

In [13]:
df_boroughs = df_boroughs[reorder]
# Set 'Not assigned' to the neighborhood borough
for ind, item in enumerate(df_boroughs['Neighborhood']):
    if(item == 'Not assigned'):
        df_boroughs['Neighborhood'][ind] = df_boroughs['Borough'][ind]

#### Fill the Latitude and Longitude columns with NaN

In [15]:
df_boroughs['Latitude'] = np.nan
df_boroughs['Longitude'] = np.nan
df_boroughs.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,"Lawrence Park,Roselawn,Davisville North,Forest...",,
1,M5A,Downtown Toronto,"Harbourfront,Regent Park,Ryerson,Garden Distri...",,
2,M4E,East Toronto,"The Beaches,The Danforth West,Riverdale,The Be...",,
3,M4B,East York,"Woodbine Gardens,Parkview Hill,Woodbine Height...",,
4,M9A,Etobicoke,"Islington Avenue,Cloverdale,Islington,Martin G...",,


In [18]:
def getCoords(postalcode):
    try:
        address = list(Nominatim(user_agent='my-application').geocode('{}, Toronto, Ontario'.format(postalcode)))[-1]
    except:
        address = (np.nan,np.nan)
    return address

lat_long = df_boroughs['PostalCode'].apply(lambda x: getCoords(x))


In [19]:
lat_long

0                    (nan, nan)
1     (43.6636637, -79.3675307)
2                    (nan, nan)
3                    (nan, nan)
4                    (nan, nan)
5                    (nan, nan)
6                    (nan, nan)
7       (43.653963, -79.387207)
8       (43.653963, -79.387207)
9                    (nan, nan)
10                   (nan, nan)
Name: PostalCode, dtype: object

After several tries to get the coordinates and due to the bad results I've decide it to use the csv

In [59]:
source = pd.read_csv('Geospatial_Coordinates.csv')

In [74]:
#Postal Codes as index
source = source.set_index('Postal Code')
source.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [79]:
df_boroughs.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,"Lawrence Park,Roselawn,Davisville North,Forest...",,
1,M5A,Downtown Toronto,"Harbourfront,Regent Park,Ryerson,Garden Distri...",,
2,M4E,East Toronto,"The Beaches,The Danforth West,Riverdale,The Be...",,
3,M4B,East York,"Woodbine Gardens,Parkview Hill,Woodbine Height...",,
4,M9A,Etobicoke,"Islington Avenue,Cloverdale,Islington,Martin G...",,


For each row I pick the postcode and with this I search the latitude and longitude from the previous dataframe (__source__)

In [88]:
for ind, item in enumerate(df_boroughs['PostalCode']):
    ll = source.loc[item]
    df_boroughs['Latitude'][ind] = ll['Latitude']
    df_boroughs['Longitude'][ind] = ll['Longitude']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [90]:
df_boroughs

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,"Lawrence Park,Roselawn,Davisville North,Forest...",43.72802,-79.38879
1,M5A,Downtown Toronto,"Harbourfront,Regent Park,Ryerson,Garden Distri...",43.65426,-79.360636
2,M4E,East Toronto,"The Beaches,The Danforth West,Riverdale,The Be...",43.676357,-79.293031
3,M4B,East York,"Woodbine Gardens,Parkview Hill,Woodbine Height...",43.706397,-79.309937
4,M9A,Etobicoke,"Islington Avenue,Cloverdale,Islington,Martin G...",43.667856,-79.532242
5,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
6,M3A,North York,"Parkwoods,Victoria Village,Lawrence Heights,La...",43.753259,-79.329656
7,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
8,M1B,Scarborough,"Rouge,Malvern,Highland Creek,Rouge Hill,Port U...",43.806686,-79.194353
9,M6H,West Toronto,"Dovercourt Village,Dufferin,Little Portugal,Tr...",43.669005,-79.442259
