In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install folium
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
Libraries imported.


In [3]:
# To extract from Wikipedia page
from bs4 import BeautifulSoup
url=  requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
#with open (url) as html_file:
data = BeautifulSoup(url.text,'lxml')
#print(data.prettify())


In [4]:
#parse data and save to csv file
import csv
csv_file=open('canada.csv','w')
csv_writer=csv.writer(csv_file)
csv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])
for tr in data.find_all('tr')[1:]:
    tds = tr.find_all('td')
    if len(tds)==3:
        Postcode=tds[0].text
        Borough=tds[1].text
        Neighbourhood=tds[2].text
    #print(Postcode, Borough, Neighbourhood)
    csv_writer.writerow([Postcode, Borough, Neighbourhood])
csv_file.close()

In [5]:
#load canada data from CSV
import pandas as pd
df_Canada=pd.read_csv('canada.csv')
print('Data loaded')

Data loaded


In [6]:
#removes \n at the end
df_Canada[df_Canada.columns] = df_Canada.apply(lambda x: x.str.strip('\n'))
df_Canada.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [7]:
# exclude not assigned borough 
df_Canada_Assigned=df_Canada[df_Canada.Borough!='Not assigned']
df_Canada_Assigned.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [8]:
#Group neighbourhoods by postal code
df_Canada_Assigned.set_index(['Postcode','Borough'],inplace=True)
result = df_Canada_Assigned.groupby(level=['Postcode','Borough'], sort=False).agg( ','.join)
result.reset_index() # to remove set index

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [9]:
#remove not assignet neighbourhood
result.loc[result['Neighbourhood'] =='Not assigned']=result[result['Neighbourhood'] == 'Not assigned'].index.values[0][1]
result.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Harbourfront,Regent Park"
M6A,North York,"Lawrence Heights,Lawrence Manor"
M7A,Queen's Park,Queen's Park


In [10]:
#make a dataframe for canada
df_Canada_final=result.reset_index()
df_Canada_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [11]:
df_Canada_final.shape 

(103, 3)

In [12]:
#format address to be searchable in google maps
df_Canada_final["Address"]=df_Canada_final['Borough'] +", " +df_Canada_final['Postcode'] 
df_Canada_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Address
0,M3A,North York,Parkwoods,"North York, M3A"
1,M4A,North York,Victoria Village,"North York, M4A"
2,M5A,Downtown Toronto,"Harbourfront,Regent Park","Downtown Toronto, M5A"
3,M6A,North York,"Lawrence Heights,Lawrence Manor","North York, M6A"
4,M7A,Queen's Park,Queen's Park,"Queen's Park, M7A"


ASIGNMENT PART 2

In [13]:
#READ GEO DATA
geoData=pd.read_csv("https://cocl.us/Geospatial_data")
geoData

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [15]:
# CONCAT DATA AND GEODATA
geoReferenced = pd.concat([df_Canada_final, geoData], axis=1)
geoReferenced.drop('Postal Code', axis=1, inplace=True)
geoReferenced

Unnamed: 0,Postcode,Borough,Neighbourhood,Address,Latitude,Longitude
0,M3A,North York,Parkwoods,"North York, M3A",43.806686,-79.194353
1,M4A,North York,Victoria Village,"North York, M4A",43.784535,-79.160497
2,M5A,Downtown Toronto,"Harbourfront,Regent Park","Downtown Toronto, M5A",43.763573,-79.188711
3,M6A,North York,"Lawrence Heights,Lawrence Manor","North York, M6A",43.770992,-79.216917
4,M7A,Queen's Park,Queen's Park,"Queen's Park, M7A",43.773136,-79.239476
5,M9A,Etobicoke,Islington Avenue,"Etobicoke, M9A",43.744734,-79.239476
6,M1B,Scarborough,"Rouge,Malvern","Scarborough, M1B",43.727929,-79.262029
7,M3B,North York,Don Mills North,"North York, M3B",43.711112,-79.284577
8,M4B,East York,"Woodbine Gardens,Parkview Hill","East York, M4B",43.716316,-79.239476
9,M5B,Downtown Toronto,"Ryerson,Garden District","Downtown Toronto, M5B",43.692657,-79.264848


In [16]:
geoReferenced.shape

(103, 6)