In [13]:
#import libraries needed
#!conda install -c conda-forge beautifulsoup4 --yes #uncomment if install needed
from bs4 import BeautifulSoup #library for web scraping
import requests

import numpy as np #for handling data in vectorized manner

import pandas as pd #for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files


import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes #uncomment if install needed
import folium # map rendering library

print('Libraries imported.')


Libraries imported.


# Getting the webpage, parsing it using BeautifulSoup and creating the DataFrame

In [14]:
#use requests to get the link, and parse using BeautifulSoup
#link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(url, 'html.parser')
#creating the new dataframe

po_df = pd.DataFrame()
i = 0
end = False 
#get the rows of table
for tr in soup.find_all('tr'):# find the <tr> tag and loop through to get rows
    j = 0 
    for tds in tr.find_all('td'):#loop <td> tag to get columns of row
        if (len(tds.text) > 1)& (not end): #make sure to get row we want
            po_df.loc[i,j]=tds.text 
            j = j+1
        else:
            end = True
    i =i+1


po_df.columns =['PostalCode', 'Borough', 'Neighbourhood']

print('shape of data frame is:', po_df.shape)

print(po_df.head())
            
    




shape of data frame is: (288, 3)
  PostalCode           Borough       Neighbourhood
1        M1A      Not assigned      Not assigned\n
2        M2A      Not assigned      Not assigned\n
3        M3A        North York         Parkwoods\n
4        M4A        North York  Victoria Village\n
5        M5A  Downtown Toronto      Harbourfront\n


# Cleaning Data

In [15]:
# remove \n, format data base look
po_df['Neighbourhood']=po_df['Neighbourhood'].str.replace('\n', '')
po_df=po_df.reset_index(drop=True)
po_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [16]:
#remove cells with borough not assigned 
po_df = po_df.drop(po_df.index[po_df['Borough']=='Not assigned'])

po_df.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [17]:
#Merge neighbourhoods with the same postcode using .groupby function and a lambda function
post_df = po_df.groupby('PostalCode', sort=False, as_index=False).agg(lambda x:', '.join(set(x)))
post_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [18]:
#assign borough to neighbourhoods that are not assigned
for i in range(post_df.shape[0]):
    if post_df.loc[i,'Neighbourhood'] =='Not assigned':
        post_df.loc[i,'Neighbourhood'] = post_df.loc[i,'Borough']
post_df.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [19]:
post_df.shape

(103, 3)

# Add Latituide and Longitdue data to post_df Data fram

In [20]:
#Attempted to use geocoder, with the following code, but it did not work:
"""
import geocoders
# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
"""
#use csv file instead:

co_df = pd.read_csv('https://cocl.us/Geospatial_data')
print(co_df.head())
print('shape:',co_df.shape) #check the shape to make sure it matches post_df

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476
shape: (103, 3)


# Match the headings of both data frames and Merge them

In [21]:
co_df.rename(columns={'Postal Code':'PostalCode'}, inplace=True)
co_df.head()#check if changes applied 

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
#sort both dataframes by PostCode to make sure the line up
co_df.sort_values(by = ['PostalCode'], inplace=True)
post_df.sort_values(by = ['PostalCode'], inplace=True)

In [23]:
#merge the data frames
Toronto_df = post_df.merge(co_df, left_on='PostalCode', right_on='PostalCode', )
Toronto_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, West Hill, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, East Birchmount Park, Ionview",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Oakridge, Golden Mile",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West, Birch Cliff",43.692657,-79.264848


In [24]:
Toronto_df.shape

(103, 5)