<h1> Explore and cluster the neighbourhoods in Toronto </h1>

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library


<h1> Parse the Wikipedia page </h1>

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M"
# and parse it into a BeautifulSoup tree for the table extraction that follows.
WIKI_POSTCODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection.
wiki_response = requests.get(WIKI_POSTCODES_URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
wiki_response.raise_for_status()

source = wiki_response.text
soup = BeautifulSoup(source, 'lxml')

<h2> 
Find the first table on the Wikipedia page and iterate through tags for required information </h2>

In [3]:
# Locate the first <table> on the page; the three output lists are filled
# from its cells (postcode, borough, neighbourhood, in that column order).
table_can_zipinfo = soup.find('table')
if table_can_zipinfo is None:
    raise ValueError('No <table> element found in the downloaded page.')

postcode = []
borough = []
neighborhood = []

# Read the table row by row instead of striding over one flat list of <td>
# cells: a row with a missing or extra cell is skipped on its own rather than
# shifting every later value into the wrong column or raising IndexError.
for row in table_can_zipinfo.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) != 3:
        # Rows without exactly three <td> cells (e.g. a header row built
        # from <th>) carry no postcode record.
        continue
    code_cell, borough_cell, hood_cell = cells
    postcode.append(code_cell.text.strip())
    borough.append(borough_cell.text.strip())
    neighborhood.append(hood_cell.text.strip())

<h1> Build the dataframe from the list of values </h1>

In [4]:
# Zip the three parallel lists into one record per table row and name the
# columns at construction time.
postcode_records = list(zip(postcode, borough, neighborhood))
df_can_postcode = pd.DataFrame(
    postcode_records,
    columns=['Postcode', 'Borough', 'Neighborhood'],
)

<h1> Cleanse the data and transform as per the given requirements </h1>

In [5]:
# Step 1: remove every row whose borough is the placeholder 'Not assigned'.
borough_unassigned = df_can_postcode['Borough'].eq('Not assigned')
df_can_postcode.drop(index=df_can_postcode.index[borough_unassigned], inplace=True)

# Step 2: where only the neighbourhood is the placeholder, reuse the borough
# name of the same row as the neighbourhood name.
hood_unassigned = df_can_postcode['Neighborhood'].eq('Not assigned')
df_can_postcode.loc[hood_unassigned, 'Neighborhood'] = df_can_postcode.loc[hood_unassigned, 'Borough']

<h1> Group the data by Postcode & Borough </h1>

In [6]:
# Collapse rows that share a postcode and borough into a single row whose
# neighbourhood field is the comma-separated list of all their neighbourhoods.
hoods_by_postcode = df_can_postcode.groupby(['Postcode', 'Borough'])['Neighborhood']
df_grp_can = hoods_by_postcode.agg(', '.join).reset_index()
df_grp_can.columns = ['Postcode', 'Borough', 'Neighborhood']


<h3> Read the Geospatial csv file and inner join it with df_grp_can. </h3>

In [7]:
# Load the postcode -> coordinates lookup table and relabel its columns
# positionally so the key column is called 'Postcode', matching df_grp_can.
geospatial_csv_url = 'http://cocl.us/Geospatial_data'
latlng_column_names = ['Postcode', 'Latitude', 'Longitude']

df_latlng = pd.read_csv(geospatial_csv_url)
df_latlng.columns = latlng_column_names

In [8]:
# Attach coordinates to each grouped postcode row; the inner join keeps only
# postcodes that appear in both frames.
df_join = df_grp_can.merge(df_latlng, how='inner', on=['Postcode'])

<h3> Explore and cluster the neighborhoods in Toronto. </h3>

In [9]:
# Keep only the columns needed for mapping and venue lookups. The explicit
# copy makes `neighborhoods` independent of df_join.
neighborhood_columns = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
neighborhoods = df_join.loc[:, neighborhood_columns].copy()
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


In [10]:
# Report how many distinct boroughs and how many rows (one per grouped
# postcode) the working frame contains.
borough_count = len(neighborhoods['Borough'].unique())
neighborhood_count = len(neighborhoods)
summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, neighborhood_count))

The dataframe has 10 boroughs and 103 neighborhoods.


In [11]:
# Look up the coordinates of Toronto; they are used to centre the map below.
address = 'Toronto, Canada'

# Nominatim must be given an identifying user agent: older geopy releases emit
# a warning when it is omitted and newer ones refuse to construct the geocoder.
# The longer timeout avoids spurious failures on a slow response.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer', timeout=10)
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; surface that clearly instead of
# failing later with an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [12]:
# Create a map of Toronto centred on the geocoded latitude/longitude.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of `neighborhoods`.
# The loop variables are deliberately NOT called `borough` / `neighborhood`:
# those names hold the lists scraped from the Wikipedia table, and rebinding
# them here would break a re-run of the dataframe-building cell.
marker_rows = zip(
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
    neighborhoods['Borough'],
    neighborhoods['Neighborhood'],
)
for marker_lat, marker_lng, marker_borough, marker_hood in marker_rows:
    # Popup text shown when a marker is clicked: "<neighbourhood>, <borough>".
    label = '{}, {}'.format(marker_hood, marker_borough)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html is a Popup option; passing it to CircleMarker
    # looks unintended. Kept as-is; confirm before removing.
    folium.CircleMarker(
        [marker_lat, marker_lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto

<h3> Find venues near each neighborhood using the Foursquare API </h3>

In [15]:
import json
import os
import requests

# json_normalize lives in the top-level pandas namespace since pandas 1.0;
# fall back to the old location for older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Credentials are read from the environment so they are never stored in the
# notebook. Set FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET before running.
# Any key pair previously committed in this cell should be treated as
# compromised and rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # your Foursquare Secret
VERSION = '20180605'

FOURSQUARE_EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'
SEARCH_RADIUS = 750  # value sent as the `radius` query parameter
VENUE_LIMIT = 50     # value sent as the `limit` query parameter

# Five parallel lists, one entry per venue found; `n` holds the neighbourhood
# the venue was found for.
venue_name = []
venue_categories = []
venue_location_lat = []
venue_location_lng = []
n = []

query_rows = zip(
    neighborhoods['Neighborhood'],
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
)
for hood_name, lat, lng in query_rows:
    query = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lng),
        'radius': SEARCH_RADIUS,
        'limit': VENUE_LIMIT,
    }
    try:
        response = requests.get(FOURSQUARE_EXPLORE_URL, params=query, timeout=30)
        response.raise_for_status()
        results = response.json()
        data = results['response']['groups'][0]['items']
    except (requests.RequestException, ValueError, KeyError, IndexError):
        # Network/HTTP failure, a non-JSON body, or an unexpected payload
        # shape: report the neighbourhood and move on to the next one.
        print(hood_name + " Unsuccessful")
        continue

    for item in data:
        # Extract every field BEFORE appending anything, so a venue with a
        # missing field (e.g. an empty `categories` list) cannot leave the
        # five lists with different lengths.
        try:
            venue = item['venue']
            name = venue['name']
            venue_lat = venue['location']['lat']
            venue_lng = venue['location']['lng']
            category = venue['categories'][0]['name']
        except (KeyError, IndexError):
            continue

        n.append(hood_name)
        venue_name.append(name)
        venue_location_lat.append(venue_lat)
        venue_location_lng.append(venue_lng)
        venue_categories.append(category)

# Sanity check: all five lists must have the same length.
print(len(n))
print(len(venue_name))
print(len(venue_location_lat))
print(len(venue_location_lng))
print(len(venue_categories))

2660
2660
2660
2660
2660
