import data

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than letting an error page
# be parsed as if it were the article.
wiki_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wiki_response.raise_for_status()
wiki = wiki_response.text
# Parse the HTML with the lxml backend; later cells search this tree for the table.
wiki_data = BeautifulSoup(wiki, 'lxml')

Find the first 'table' element and use the text of its 'th' cells as the column names of an empty dataframe. The dataframe consists of 3 columns: Postcode, Borough and Neighbourhood.

In [3]:
# Grab the first <table> element of the parsed page.
table = wiki_data.find('table')

# Header text of every <th> cell, with surrounding newlines removed.
table_title_list = [header.text.strip('\n') for header in table.find_all('th')]

# The header texts double as the column labels of an (initially empty) frame.
column_names = table_title_list
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,Postcode,Borough,Neighbourhood


Collect the text of every 'td' cell in the table into one flat list, then fill the dataframe columns from that list.

In [4]:
# Text of every <td> cell in the table, with surrounding newlines removed.
table_content_list = [cell.text.strip('\n') for cell in table.find_all('td')]

# Every third entry, starting at offsets 0, 1 and 2, feeds one column.
# Assumes each table row contributes exactly three cells - TODO confirm against the page.
for offset, column in enumerate(('Postcode', 'Borough', 'Neighbourhood')):
    df[column] = table_content_list[offset::3]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Ignore cells with a borough that is Not assigned

In [5]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the index from 0.
has_borough = df['Borough'] != 'Not assigned'
df_borough_true = df.loc[has_borough].reset_index(drop=True)
df_borough_true.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Combine the neighbourhoods that share the same Postcode into one comma-separated row.

In [6]:
# Per Postcode: keep the first Borough seen and join all Neighbourhood
# names into a single comma-separated string.
aggregations = {'Borough': 'first', 'Neighbourhood': ', '.join}
df_group = (
    df_borough_true
    .groupby('Postcode')
    .agg(aggregations)
    .reset_index()
)
df_group

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


If a Neighbourhood is 'Not assigned', give it the same value as its Borough.

In [7]:
# Rows whose Neighbourhood is the literal placeholder 'Not assigned'.
unassigned = df_group['Neighbourhood'] == 'Not assigned'

# Copy the Borough into those rows with a single .loc assignment. The previous
# chained form (df_group.Neighbourhood[mask] = ...) writes through an intermediate
# Series, which raises SettingWithCopyWarning and has no effect on df_group when
# pandas Copy-on-Write is enabled.
df_group.loc[unassigned, 'Neighbourhood'] = df_group.loc[unassigned, 'Borough']
df_group

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Use the `.shape` attribute to show the number of rows and columns of the dataframe.

In [8]:
# Dimensions of the grouped dataframe as a (rows, columns) tuple.
df_group.shape

(103, 3)

In [9]:
# Libraries and display options used by the cells that follow in this notebook.
import numpy as np
# Show every column and every row when a dataframe is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json 
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 
import requests # library to handle requests
# json_normalize is exposed as pandas.json_normalize since pandas 1.0; the old
# pandas.io.json path is deprecated, so try the top-level name first and only
# fall back to the legacy location on older pandas versions.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes 
import folium 
print('Libraries imported.')

Libraries imported.


Load the Toronto geographical data from a CSV file, then rename the 'Postal Code' column to 'Postcode'.

In [10]:
# Read the postcode coordinates table and rename its 'Postal Code' column
# to 'Postcode' in the same expression.
toronto_geo = (
    pd.read_csv("http://cocl.us/Geospatial_data")
    .rename(columns={'Postal Code': 'Postcode'})
)
toronto_geo

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [11]:
# Join the grouped postcode table with the coordinates on 'Postcode'.
# The outer join keeps postcodes that appear in either table.
df_withgeo = df_group.merge(toronto_geo, how='outer', on='Postcode')
df_withgeo

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [12]:
# Dimensions of the merged dataframe as a (rows, columns) tuple.
df_withgeo.shape

(103, 5)