### Part 1. Build dataframe for Toronto ###

In [121]:
import requests
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup

In [122]:
# wiki page url
Toronto_postal_code_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Request the wiki page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the article.
Toronto_raw_page = requests.get(Toronto_postal_code_link, timeout=30)
Toronto_raw_page.raise_for_status()
bs=BeautifulSoup(Toronto_raw_page.text,"lxml")

# find() returns the first <tbody> in the document, or None when there is none;
# fail with a readable message instead of an AttributeError further down.
table_body=bs.find('tbody')
if table_body is None:
    raise ValueError('No <tbody> element found at ' + Toronto_postal_code_link)

# table part: every row (header row included) and the header cells
rows = table_body.find_all('tr')
heads = table_body.find_all('th')

In [123]:
# Copy the header cells found above into a plain Python list.
heads = [header_cell for header_cell in heads]

In [124]:
def clean(text):
    '''Clean a table header cell's HTML for use as a column name.

    Removes the opening and closing ``th`` tags -- including an opening tag
    that carries attributes such as ``<th scope="col">`` -- and every newline
    character. Any other markup inside the cell is left untouched.

    Parameters
    ----------
    text : str
        HTML source of one header cell, e.g. ``'<th>Postcode\\n</th>'``.

    Returns
    -------
    str
        The input with the ``th`` tags and newlines removed.
    '''
    import re  # local import keeps this helper self-contained

    # `(?:\s[^>]*)?` accepts optional attributes while refusing longer tag
    # names such as <thead>, which must not be stripped.
    text = re.sub(r'</?th(?:\s[^>]*)?>', '', text)
    return text.replace("\n", '')

# Render each header cell to its HTML string, then clean it into a column name.
col_name = list(map(clean, map(str, heads)))
col_name

['Postcode', 'Borough', 'Neighbourhood']

In [125]:
# Collect the stripped text of every <td> cell, skipping rows[0] (the first
# row is not treated as data), and build the frame in a single constructor
# call. Growing a DataFrame one `.loc[i] = ...` at a time re-allocates on
# every row, which is quadratic in the number of rows.
records = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in rows[1:]
]

# The DataFrame constructor may pad short rows instead of failing, so keep the
# strictness of row-wise assignment with an explicit width check.
malformed = [cells for cells in records if len(cells) != len(col_name)]
if malformed:
    raise ValueError(
        '{} table row(s) do not have {} cells; first offender: {!r}'.format(
            len(malformed), len(col_name), malformed[0]))

df = pd.DataFrame(records, columns=col_name)

In [126]:
# (rows, columns) of the scraped table, before any filtering.
df.shape

(289, 3)

In [127]:
# Preview the first 10 scraped rows.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [128]:
# Distinct borough labels in sorted order, to spot placeholder values.
sorted(df['Borough'].drop_duplicates())

['Central Toronto',
 'Downtown Toronto',
 'East Toronto',
 'East York',
 'Etobicoke',
 'Mississauga',
 'North York',
 'Not assigned',
 "Queen's Park",
 'Scarborough',
 'West Toronto',
 'York']

#### Remove records of "Not assigned" from column "Borough" ####

In [129]:
# Keep only rows whose borough is neither the 'Not assigned' placeholder nor empty.
has_borough = ~df['Borough'].isin(['Not assigned', ''])
df1 = df.loc[has_borough]

In [130]:
# (rows, columns) left after dropping rows without a borough.
df1.shape

(212, 3)

In [131]:
# Preview the first 10 rows that have a borough.
df1.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [132]:
# One row per (Postcode, Borough): its neighbourhoods joined into a single
# comma-separated string.
neighbourhoods_by_postcode = df1.groupby(['Postcode', 'Borough'])['Neighbourhood']
df2 = neighbourhoods_by_postcode.agg(','.join).reset_index()

In [133]:
# Where the neighbourhood is the 'Not assigned' placeholder, use the borough
# name of the same row instead.
unassigned = df2['Neighbourhood'].eq('Not assigned')
df2.loc[unassigned, 'Neighbourhood'] = df2.loc[unassigned, 'Borough']

In [134]:
# (rows, columns) after grouping: one row per (Postcode, Borough) pair.
df2.shape

(103, 3)

### Part2. Get the latitude and the longitude coordinates of each neighborhood ###

link to a csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data


In [135]:
# Load the per-postal-code coordinates straight from the published CSV.
geospatial_csv_url = 'http://cocl.us/Geospatial_data'
csv_zip = pd.read_csv(geospatial_csv_url)

In [136]:
# Preview the coordinate table as loaded (original column names).
csv_zip.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [137]:
# Rename the key column to 'Postcode' so it matches the scraped table's key.
csv_zip = csv_zip.rename(columns={'Postal Code': 'Postcode'})
csv_zip.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [138]:
# Left join: every row of df2 is kept and gains the coordinate columns of its
# postcode where one is found.
df2_zip = pd.merge(df2, csv_zip, how="left", on="Postcode")

In [139]:
# Preview the merged table: postcode, borough, neighbourhoods and coordinates.
df2_zip.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Part3. Explore and cluster the neighborhoods in Toronto. ###
add enough Markdown cells to explain what you decided to do and to report any observations you make. <br>
generate maps to visualize your neighborhoods and how they cluster together.

In [140]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


In [141]:
import os
import warnings

# Foursquare API credentials are read from the environment so that secrets are
# never stored in the notebook or its version history. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair previously committed in this cell should be
# treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    # Warn rather than raise: cells that do not use the credentials can still run.
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare API requests will not be authenticated.')
VERSION = '20180604'
LIMIT = 30

In [142]:
# Restrict the data to boroughs whose name contains the word 'Toronto'.
is_toronto_borough = df2_zip['Borough'].str.contains('Toronto')
df_Toronto = df2_zip.loc[is_toronto_borough]
df_Toronto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


In [143]:
# Look up the coordinates used to centre the map.
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode('Toronto, ON')
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `.latitude`.
if location is None:
    raise ValueError("Geocoder returned no result for 'Toronto, ON'")
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [145]:
%matplotlib inline

In [146]:
# Base map centred on the coordinates geocoded for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to the map
markers_colors = []

# One circle marker per row of df_Toronto, with a "Borough (Postcode)" popup.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Postcode']
for lat, lng, borough, postcode in df_Toronto[marker_columns].itertuples(index=False, name=None):
    popup_text = borough + ' (' + postcode + ')'
    marker = folium.CircleMarker(
        [lat, lng],
        radius=8,
        popup=folium.Popup(popup_text, parse_html=True),
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto