In [4]:
!pip install beautifulsoup4
!pip install geopy

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/ab/97/25def417bf5db4cc6b89b47a56961b893d4ee4fec0c335f5b9476a8ff153/geopy-1.22.0-py2.py3-none-any.whl (113kB)
[K     |████████████████████████████████| 122kB 7.2MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.22.0


In [5]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library


## 1. Getting data from Wikipedia

In [6]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of handing an error
# page to the parser.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text

## 2. Parsing data with Beautiful Soup

In [7]:
# Build a navigable parse tree from the downloaded HTML using Python's
# built-in "html.parser" backend.
data_new = BeautifulSoup(markup=data, features='html.parser')

In [8]:
# Accumulators for the three table columns; they are filled by the
# row-parsing loop further down.
PostalCode, Borough, Neighborhood = [], [], []

# Echo the parsed document for inspection.
data_new


<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"652df336-7a6c-4f10-8854-52123f8436fb","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":960187814,"wgRevisionId":960187814,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toronto","Ontario-rel

In [9]:
# Start from empty lists so that re-running this cell does not append the
# same rows a second time.
PostalCode, Borough, Neighborhood = [], [], []

# The postal-code table is taken to be the first <table> on the page.
postal_table = data_new.find('table')
if postal_table is None:
    raise ValueError("No <table> element found in the downloaded page")

for row in postal_table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. the header row) are skipped, as are rows
    # with fewer than three cells, which would otherwise raise IndexError.
    if len(cells) >= 3:
        # strip() removes the trailing newline and any surrounding whitespace.
        PostalCode.append(cells[0].text.strip())
        Borough.append(cells[1].text.strip())
        Neighborhood.append(cells[2].text.strip())

## 3. Creating a dictionary and converting it to a pandas DataFrame

In [10]:
# Map each output column name to the list of values scraped for it.
dict_1 = dict(zip(
    ['Postal Code', 'Borough', 'Neighborhood'],
    [PostalCode, Borough, Neighborhood],
))
dict_1

{'Postal Code': ['M1A',
  'M2A',
  'M3A',
  'M4A',
  'M5A',
  'M6A',
  'M7A',
  'M8A',
  'M9A',
  'M1B',
  'M2B',
  'M3B',
  'M4B',
  'M5B',
  'M6B',
  'M7B',
  'M8B',
  'M9B',
  'M1C',
  'M2C',
  'M3C',
  'M4C',
  'M5C',
  'M6C',
  'M7C',
  'M8C',
  'M9C',
  'M1E',
  'M2E',
  'M3E',
  'M4E',
  'M5E',
  'M6E',
  'M7E',
  'M8E',
  'M9E',
  'M1G',
  'M2G',
  'M3G',
  'M4G',
  'M5G',
  'M6G',
  'M7G',
  'M8G',
  'M9G',
  'M1H',
  'M2H',
  'M3H',
  'M4H',
  'M5H',
  'M6H',
  'M7H',
  'M8H',
  'M9H',
  'M1J',
  'M2J',
  'M3J',
  'M4J',
  'M5J',
  'M6J',
  'M7J',
  'M8J',
  'M9J',
  'M1K',
  'M2K',
  'M3K',
  'M4K',
  'M5K',
  'M6K',
  'M7K',
  'M8K',
  'M9K',
  'M1L',
  'M2L',
  'M3L',
  'M4L',
  'M5L',
  'M6L',
  'M7L',
  'M8L',
  'M9L',
  'M1M',
  'M2M',
  'M3M',
  'M4M',
  'M5M',
  'M6M',
  'M7M',
  'M8M',
  'M9M',
  'M1N',
  'M2N',
  'M3N',
  'M4N',
  'M5N',
  'M6N',
  'M7N',
  'M8N',
  'M9N',
  'M1P',
  'M2P',
  'M3P',
  'M4P',
  'M5P',
  'M6P',
  'M7P',
  'M8P',
  'M9P',
  'M1R',
  'M

In [11]:
# One DataFrame column per key of dict_1.
tor_df = pd.DataFrame(dict_1)

In [12]:
# Preview the first five scraped rows.
tor_df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [13]:
# Remove every newline character from the three text columns.
cols = ['Postal Code', 'Borough', 'Neighborhood']
tor_df[cols] = tor_df[cols].replace(to_replace='\n', value='', regex=True)

In [14]:
# Preview the frame after the newline removal.
tor_df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## 5. Dropping rows with unassigned Boroughs

In [15]:
# Keep only rows whose Borough is assigned. A boolean filter with rebinding
# replaces the in-place drop; the surviving rows keep their original index
# labels, exactly as before.
tor_df = tor_df[tor_df['Borough'] != 'Not assigned']

In [16]:
# Preview the frame after the 'Not assigned' boroughs were dropped.
tor_df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [17]:
# Renumber the rows from 0; the previous index labels are kept as a new
# 'index' column (drop=False is the default behaviour).
tor_df = tor_df.reset_index(drop=False)

In [18]:
# Remove the leftover 'index' column created by reset_index().
# errors='ignore' keeps this cell re-runnable once the column is already gone
# (a plain `del` raises KeyError on the second run).
tor_df = tor_df.drop(columns=['index'], errors='ignore')

In [19]:
# Display the re-indexed frame (the notebook view elides the middle rows).
tor_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


## 6. Giving each Neighborhood the same name as its Borough if not assigned

In [20]:
# Where the Neighborhood is "Not assigned", use the Borough name instead.
# The previous loop assigned into the row objects yielded by iterrows();
# pandas does not guarantee that such writes reach the DataFrame, so the
# update is done with a boolean mask and .loc, which writes to tor_df directly.
unassigned = tor_df["Neighborhood"] == "Not assigned"
tor_df.loc[unassigned, "Neighborhood"] = tor_df.loc[unassigned, "Borough"]

tor_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [21]:
# Count rows per postal code; any count above 1 marks a repeated code.
tor_df["Postal Code"].value_counts()

M4G    1
M4M    1
M1L    1
M1W    1
M1K    1
      ..
M2L    1
M6H    1
M6N    1
M3L    1
M9A    1
Name: Postal Code, Length: 103, dtype: int64

## 7. Grouping by Postal Code to merge duplicate rows

In [22]:
# Collapse rows that share a postal code and borough into a single row,
# joining the remaining column's values into one comma-separated string.
tor_df_grouped = (
    tor_df
    .groupby(['Postal Code', 'Borough'], as_index=False)
    .agg(", ".join)
)
tor_df_grouped

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


## 8. Clean Data

In [23]:
# Count rows per postal code in the grouped frame; every count should be 1.
tor_df_grouped["Postal Code"].value_counts()

M4G    1
M5B    1
M1J    1
M3C    1
M7A    1
      ..
M9M    1
M1X    1
M6H    1
M6N    1
M9A    1
Name: Postal Code, Length: 103, dtype: int64

In [26]:
# Spot-check a hand-picked list of postal codes, in the listed order.
column_names = ["Postal Code", "Borough", "Neighborhood"]

test_1 = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration. Collect the per-code matches first and concatenate once.
# Codes with no match contribute an empty slice, so they are simply absent
# from the result.
matches = [tor_df_grouped[tor_df_grouped["Postal Code"] == postcode] for postcode in test_1]
test = pd.concat(matches, ignore_index=True)[column_names]

test

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


In [29]:
# Load the postal-code -> latitude/longitude lookup table from the local CSV
# and preview its first five rows.
coordinates = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
coordinates.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
# Left join: every grouped postal-code row is kept and gains the Latitude and
# Longitude of the matching "Postal Code" in the coordinates table.
tor_df_merged = pd.merge(tor_df_grouped, coordinates, how="left", on="Postal Code")
tor_df_merged.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [32]:
# Display the merged frame (the notebook view elides the middle rows).
tor_df_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [34]:
# Spot-check the same hand-picked postal codes, now with their coordinates.
column_names = ["Postal Code", "Borough", "Neighborhood","Latitude","Longitude"]

test_1 = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration. Collect the per-code matches first and concatenate once.
# This also avoids seeding the result with an empty, object-typed frame.
# Codes with no match contribute an empty slice, so they are simply absent
# from the result.
matches = [tor_df_merged[tor_df_merged["Postal Code"] == postcode] for postcode in test_1]
test = pd.concat(matches, ignore_index=True)[column_names]

test

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
