In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


<a id='item1'></a>

Let's take a quick look at the data.

#### Use the geopy library to get the latitude and longitude values of Toronto.

In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent <em>ny_explorer</em>, as shown below.

**Folium** is a great visualization library. Feel free to zoom into the map rendered further below, and click on each circle marker to reveal the name of the neighborhood and its respective borough.

In [2]:
# Geocode the city name; the resulting coordinates are used to centre the map.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [3]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
import urllib.request as urllib2

# Scrape the postal-code table (Postcode, Borough, Neighbourhood) from Wikipedia
# and parse the downloaded page into a BeautifulSoup document.

url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r=requests.get(url,timeout=30)
# Stop here on an HTTP error rather than parsing an error page.
r.raise_for_status()
# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and to avoid the "no parser specified" warning).
H=BeautifulSoup(r.content,'html.parser')

# A web page can contain several tables, so first list the opening tags of every
# classed table to see which class identifies the one we want to scrape.
# The page was already downloaded into `r`, so its bytes are reused here instead
# of fetching the URL a second time.
lst=[line.rstrip() for line in r.content.splitlines() if re.search(b'table class',line)]

x=lst[0]
print(x)
# Quoted attribute values found in the first matching tag.
extr=re.findall(b'"([^"]*)"',x)

# attrs must be a dict mapping attribute name to value; the previous set literal
# {'class','wikitable sortable'} only matched because a non-dict value is
# treated as a collection of candidate class names.
table=H.find('table',{'class':'wikitable sortable'})
if table is None:
    raise ValueError("No table with class 'wikitable sortable' found at " + url)

# Read the header names and the cell values of the selected table separately.
headers=[header.text for header in table.find_all('th')]

# One list of cell texts per <tr>. The values are kept as str: encoding them to
# UTF-8 bytes only to decode every column again afterwards is a no-op round trip.
rows=[[val.text for val in row.find_all('td')] for row in table.find_all('tr')]

# Rows with no <td> cells come through as empty lists, i.e. all-missing rows,
# which dropna removes.
df1=pd.DataFrame(rows,columns=headers).dropna(axis=0)

# Header texts and neighbourhood values can carry surrounding whitespace such as a
# trailing newline; strip it so columns can be addressed by their plain names.
df1.columns=[i.strip() for i in df1.columns]
df1['Neighbourhood']=df1['Neighbourhood'].str.strip()

df1.head()

# Remove every row whose Borough is 'Not assigned'.
df2=df1[df1['Borough']!='Not assigned']

# Combine the rows that share a postcode so that each postcode carries the list
# of all its neighbourhoods.
df3=df2.groupby(['Postcode','Borough'])['Neighbourhood'].apply(list).reset_index()

# A neighbourhood that is 'Not assigned' must fall back to the borough name.
# Every Neighbourhood cell is a list at this point, so comparing the column with
# the plain string 'Not assigned' never matches anything; the substitution is
# carried out by the replace(...) call further down, which compares each cell
# against the list ['Not assigned'].

def replace(df, col, key, val):
    """Overwrite, in place, the cells of ``df[col]`` that equal ``key`` with ``val``.

    Each cell is compared with ``key`` using plain Python ``==``, one cell at a
    time, so ``key`` may itself be a list.

    Parameters:
        df: DataFrame to modify in place.
        col: label of the column to inspect and update.
        key: value a cell must equal to be overwritten.
        val: replacement passed to ``df.loc[...] = val`` (a scalar or a Series).

    Returns:
        None.
    """
    matches = list(map(lambda cell: cell == key, df[col]))
    df.loc[matches, col] = val
    
# Wherever a postcode's neighbourhood list is exactly ['Not assigned'],
# substitute the borough name from the same row.
replace(df=df3, col='Neighbourhood', key=['Not assigned'], val=df3['Borough'])


b'<table class="wikitable sortable">'


In [4]:
# Load the postal-code coordinates (Postal Code, Latitude, Longitude columns).
geospatial_csv = "https://cocl.us/Geospatial_data"
DF = pd.read_csv(geospatial_csv)
DF.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [5]:
# Attach coordinates to each postcode. The right join keeps every row of the
# coordinates table; the duplicate key column 'Postal Code' is dropped afterwards.
df_toronto = (
    df3.merge(DF, how='right', left_on='Postcode', right_on='Postal Code')
       .drop(columns='Postal Code')
)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[Woburn],43.770992,-79.216917
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476


In [6]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10.7)

# add one circle marker per postcode
for lat, lng, borough, neighbourhood in zip(df_toronto['Latitude'], df_toronto['Longitude'],df_toronto['Borough'], df_toronto['Neighbourhood']):

    # Neighbourhood is usually a list of names; join it so the popup reads
    # "Rouge, Malvern, Scarborough" instead of showing the Python list repr.
    if isinstance(neighbourhood, (list, tuple)):
        neighbourhood = ', '.join(map(str, neighbourhood))
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Toronto)

map_Toronto
