# This project explores Toronto neighbourhoods with the Foursquare API

### Part 1

In [1]:
#installing libraries
!pip install beautifulsoup4
!pip install geopy
!pip install folium
print('Libraries installed!')

Libraries installed!


In [2]:
# Import every library used in this notebook and configure pandas display.
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)  # do not truncate columns when displaying a frame
pd.set_option('display.max_rows', None)     # do not truncate rows when displaying a frame

from bs4 import BeautifulSoup
import requests
# `pandas.io.json.json_normalize` is deprecated since pandas 1.0 in favour of the
# top-level `pandas.json_normalize`. Prefer the new location and fall back to the
# old one so the name `json_normalize` is available on either pandas generation.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
import json

import folium
from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

print('Libraries imported!')

Libraries imported!


## 1) Scraping Wikipedia to get the table

In [0]:
# Download the Wikipedia list of Canadian postal codes beginning with "M"
# and parse the HTML with Beautiful Soup.

wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_response = requests.get(wiki_url, timeout=30)  # do not hang forever on a stalled connection
wiki_response.raise_for_status()  # fail here on an HTTP error instead of parsing an error page
data = wiki_response.text
soup = BeautifulSoup(data, 'html.parser')

In [0]:
# Walk the rows of the first table on the page and collect its first three
# cells (postal code, borough, neighbourhood) into parallel lists.

postal = []
borough = []
neighbourhood = []

table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found on the scraped page')

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Skip rows without <td> cells (e.g. a header row) and any row too short
    # to hold all three columns, which would otherwise raise an IndexError.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace/newlines so that later comparisons with
    # 'Not assigned' and the postal-code join work on clean values.
    postal.append(cells[0].text.strip())
    borough.append(cells[1].text.strip())
    neighbourhood.append(cells[2].text.strip())

In [5]:
# Assemble the scraped lists into a dataframe, one named column per list.

column_names = ('PostalCode', 'Borough', 'Neighbourhood')
column_values = (postal, borough, neighbourhood)
toronto_neighbourhood = list(zip(column_names, column_values))
df_toronto = pd.DataFrame(dict(toronto_neighbourhood))
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0.

has_borough = df_toronto['Borough'].ne('Not assigned')
df_toronto = df_toronto.loc[has_borough].reset_index(drop=True)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [7]:
# Collapse rows that share a postal code and borough into one row whose
# remaining column holds the values joined with commas.

def _join_with_commas(values):
    """Return the given strings concatenated with ',' as the separator."""
    return ','.join(values)

rows_by_postcode = df_toronto.groupby(['PostalCode', 'Borough'], as_index=False)
df_toronto = rows_by_postcode.agg(_join_with_commas)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# Where the neighbourhood is 'Not assigned', use the borough name instead,
# then display the rows that were changed.

not_assigned_neigh_rows = df_toronto['Neighbourhood'].eq('Not assigned')
replacement_names = df_toronto.loc[not_assigned_neigh_rows, 'Borough']
df_toronto.loc[not_assigned_neigh_rows, 'Neighbourhood'] = replacement_names
df_toronto.loc[not_assigned_neigh_rows]

Unnamed: 0,PostalCode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [9]:
# Show the dataframe's dimensions as (rows, columns) to check the row count.

df_toronto.shape

(103, 3)

## 2) Loading coordinates from CSV and adding them to the DataFrame

In [10]:
# Download the postal-code coordinates CSV and load it into a dataframe.
# Using `requests` instead of a shell `wget` call keeps the cell portable and
# makes a failed download raise before the success message is printed.
coordinates_url = 'http://cocl.us/Geospatial_data'
coordinates_path = 'toronto_coordinates.csv'

coordinates_response = requests.get(coordinates_url, timeout=30)
coordinates_response.raise_for_status()  # stop on an HTTP error rather than saving an error page
with open(coordinates_path, 'wb') as csv_file:
    csv_file.write(coordinates_response.content)
print('Coordinates downloaded!')
coordinates = pd.read_csv(coordinates_path)

Coordinates downloaded!


In [11]:
# Inspect the coordinates table: print its (rows, columns) shape, then show the first rows.
print(coordinates.shape)
coordinates.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Both tables were shown above to have the same number of rows.
# Index each one by its postal-code column, then place them side by side,
# keeping only the postal codes present in both.

df_toronto_temp = df_toronto.set_index('PostalCode')
coordinates_temp = coordinates.set_index('Postal Code')

frames_by_postcode = [df_toronto_temp, coordinates_temp]
toronto_complete = pd.concat(frames_by_postcode, join='inner', axis='columns')
toronto_complete.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [13]:
# Name the index 'PostalCode' and turn it back into an ordinary column,
# giving the final Toronto dataframe with coordinates.

toronto_complete = toronto_complete.rename_axis('PostalCode').reset_index()
print(toronto_complete.shape)
toronto_complete.head()

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
