In [311]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#!conda install -c conda-forge geopy --yes 
#!from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
print('Libraries imported.')

Libraries imported.


In [312]:
#data1 = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',skiprows=1)[0]
#print(data1)

In [313]:
import urllib.request
import bs4 as bs

In [314]:
source = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').read()
soup = bs.BeautifulSoup(source,'lxml')
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_rows = table.find_all('tr')
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=["Postcode", "Borough", "Neighbourhood"])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


In [315]:
# Delete the first row with postcode as 0
df = df.iloc[1:]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n


In [316]:
df.shape

(288, 3)

# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [317]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
df.shape
df = df[~df['Borough'].isin(['Not assigned'])]
df.shape

(211, 3)

In [318]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n
6,M5A,Downtown Toronto,Regent Park\n
7,M6A,North York,Lawrence Heights\n


In [319]:
df.sort_values('Postcode', axis=0, ascending=True, inplace=True)
# remove all '\n' values from the dataframe
df = df.replace('\n','', regex=True)

In [320]:
# rstrip all Neighbourhood column
df['Neighbourhood'] = df['Neighbourhood'].str.strip()
df['Borough'] = df['Borough'].str.strip()

In [321]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern
30,M1C,Scarborough,Port Union
29,M1C,Scarborough,Rouge Hill
28,M1C,Scarborough,Highland Creek


In [322]:
df = df.replace('\n','', regex=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern
30,M1C,Scarborough,Port Union
29,M1C,Scarborough,Rouge Hill
28,M1C,Scarborough,Highland Creek


# More than one neighborhood can exist in one postal code area

In [323]:
torantodf = pd.DataFrame(columns=['Postcode', 'Borough', 'Neighbourhood'])
prev_postcode = ''
prev_neighbourhood = ''
prev_borough = ''
unssigned = 'Not assigned'
for index, row in df.iterrows():
    if(row['Postcode'] == prev_postcode):
        prev_neighbourhood = prev_neighbourhood + ',' + row['Neighbourhood']
    else:
        # copy to new df
        torantodf.loc[len(torantodf)] = [prev_postcode,prev_borough ,prev_neighbourhood]
        prev_postcode = row['Postcode']
        prev_borough = row['Borough']
        prev_neighbourhood = row['Neighbourhood']
        
torantodf.shape

(103, 3)

In [324]:
# Delete the first row with postcode as blank
torantodf = torantodf.iloc[1:]
torantodf.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,"Rouge,Malvern"
2,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
3,M1E,Scarborough,"Guildwood,Morningside,West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
8,M1L,Scarborough,"Golden Mile,Oakridge,Clairlea"
9,M1M,Scarborough,"Cliffcrest,Scarborough Village West,Cliffside"
10,M1N,Scarborough,"Cliffside West,Birch Cliff"


# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [331]:
unssigned = 'Not assigned'
mvalue = ''
location = 0
for index, row in torantodf.iterrows():
    if(row['Neighbourhood'] == unssigned):
        row['Neighbourhood'] = row['Borough']

In [332]:
torantodf.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,"Rouge,Malvern"
2,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
3,M1E,Scarborough,"Guildwood,Morningside,West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
8,M1L,Scarborough,"Golden Mile,Oakridge,Clairlea"
9,M1M,Scarborough,"Cliffcrest,Scarborough Village West,Cliffside"
10,M1N,Scarborough,"Cliffside West,Birch Cliff"


In [333]:
torantodf.shape

(102, 3)

In [334]:
#Geospatial_Coordinates.csv
# makes the passed rows header 
#df = pd.DataFrame(l, columns=["Postcode", "Borough", "Neighbourhood"])
newdf = pd.read_csv("Geospatial_Coordinates.csv")
#newdf = pd.read_csv("Geospatial_Coordinates.csv", header = None)


In [335]:
newdf.shape

(103, 3)

In [336]:
newdf.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [337]:
#torantodf.head()
originaldf = torantodf.copy()
originaldf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,"Rouge,Malvern"
2,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
3,M1E,Scarborough,"Guildwood,Morningside,West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae


In [270]:
# define the dataframe columns
column_names = ['Postcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [338]:
torantodf.sort_values('Postcode', axis=0, ascending=True, inplace=True)
newdf.sort_values('Postcode', axis=0, ascending=True, inplace=True)

In [363]:
torantodf = originaldf.copy()

In [364]:
torantodf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,"Rouge,Malvern"
2,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
3,M1E,Scarborough,"Guildwood,Morningside,West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae


# Merge the Torantodf with the "Geospatial_Coordinates.csv" with postcode as the key

In [366]:

# define the dataframe columns
column_names = ['Postcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

torantodf['g'] = torantodf.groupby('Postcode').cumcount()
newdf['g'] = newdf.groupby('Postcode').cumcount()
torantodf.merge(newdf).drop('g',1)
toranto_table = pd.merge(torantodf,newdf,on=["Postcode", 'g'],how='outer').drop('g',1)


In [370]:


# instantiate the dataframe
toranto_table.head()


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [371]:
toranto_table.shape

(103, 5)

In [372]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toranto_table['Borough'].unique()),
        toranto_table.shape[0]
    )
)

The dataframe has 12 boroughs and 103 neighborhoods.


In [374]:
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

In [380]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toranto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toranto, Canada city are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toranto, Canada city are 43.653963, -79.387207.


In [383]:
# create map of Toranto using latitude and longitude values
map_toranto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toranto_table['Latitude'], toranto_table['Longitude'], toranto_table['Borough'], toranto_table['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toranto)  
    
map_toranto