In [16]:
from io import StringIO
import urllib.request

import bs4 as bs
import lxml.html as lh
import numpy as np
import pandas as pd
import requests

In [17]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of trying to parse an error page.
res.raise_for_status()
soup = bs.BeautifulSoup(res.content,'lxml')
# The first <table> element on the page is the one that gets parsed.
table = soup.find_all('table')[0]
# read_html returns a list of DataFrames. The markup is wrapped in StringIO
# because passing a literal HTML string is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))
# Use the parsed frame directly; a to_json/read_json round-trip adds nothing.
data = df[0]

In [18]:
# Preview the first rows of the scraped table.
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [19]:
# Keep only the rows that have an assigned borough; rows whose Borough is
# 'Not assigned' are dropped.
# .copy() makes the result an independent frame, so later column assignments
# on `data` do not trigger SettingWithCopyWarning.
data = data[data['Borough'] != 'Not assigned'].copy()

In [20]:
# A neighbourhood marked 'Not assigned' takes the name of its borough.
# assign() builds a new frame rather than writing a column into the filtered
# frame produced by the previous step, which avoids SettingWithCopyWarning.
data = data.assign(
    Neighbourhood=np.where(
        data['Neighbourhood'] == 'Not assigned',
        data['Borough'],
        data['Neighbourhood'],
    )
)


In [21]:
# Collapse rows that share a borough and postal code into a single row,
# joining the remaining string values with a comma.
grouped = data.groupby(['Borough', 'Postal Code'], as_index=False)
data = grouped.agg(','.join)
data.shape

(103, 3)

In [23]:
# Read the geospatial CSV from a short link; it is merged with `data` on
# 'Postal Code' below.
# NOTE: this rebinds `url`, which previously held the Wikipedia address.
url = "https://cocl.us/Geospatial_data"
geo_data = pd.read_csv(url)

In [24]:
# Inner-join the postal-code table with the geospatial table on the shared
# 'Postal Code' column; rows without a match on both sides are dropped.
df_pos = data.merge(geo_data, how='inner', on='Postal Code')

In [25]:
# Pick the columns used for mapping, in a fixed order. .copy() yields an
# independent frame rather than a selection tied to df_pos.
# (A bare df_pos.head() used to sit at the top of this cell; its result was
# discarded because only the last expression of a cell is displayed.)
df_tor = df_pos[['Postal Code', 'Borough', 'Neighbourhood','Latitude', 'Longitude']].copy()

df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316


In [26]:
# Report the number of distinct boroughs and the number of rows in df_tor.
# The second figure is the row count, so a row holding several comma-joined
# neighbourhoods is counted once.
borough_count = len(df_tor['Borough'].unique())
row_count = df_tor.shape[0]
summary = 'The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count)
print(summary)

The dataframe has 10 boroughs and 103 neighborhoods.


In [27]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
from  geopy.geocoders import Nominatim

In [28]:
#geolocator = Nominatim()
#location = geolocator.geocode(address)
#latitude = location.latitude
#longitude = location.longitude

In [29]:
# Coordinates set by hand because the geolocator call was not working.
# They are used as the centre of the folium map created below.
latitude = 43.65
longitude = -79.38

In [30]:

# Create a map of Toronto centred on the latitude and longitude values.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of df_tor, labelled "<neighbourhood>, <borough>".
marker_rows = zip(
    df_tor['Latitude'],
    df_tor['Longitude'],
    df_tor['Borough'],
    df_tor['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    # parse_html=True makes the popup treat the label as plain text, not markup.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is not
    # passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
    ).add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto