In [2]:
# This notebook is to build the code to scrape the Toronto postal service Wikipedia page

# download and importing libraries
!pip install BeautifulSoup4
!pip install requests
!pip install lxml

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests




In [None]:
#### Now import the dataset from the wikipedia page

In [3]:
toronto_wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(toronto_wiki).text

soup = BeautifulSoup(source, 'lxml')

In [4]:
#get the data from the HTML page and store it into a list
data = []
columns = []
table = soup.find(class_='wikitable')
for index, tr in enumerate(table.find_all('tr')):
    section = []
    for td in tr.find_all(['th','td']):
        section.append(td.text.rstrip())
    
    #First row of data is the header
    if (index == 0):
        columns = section
    else:
        data.append(section)

#convert list into Pandas DataFrame
toronto_df = pd.DataFrame(data = data,columns = columns)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
#Remove Boroughs that are 'Not assigned'

toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned']
toronto_df.head()


Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [19]:
# groupby
toronto_df = toronto_df.groupby('Postal Code',as_index=False).agg(lambda x: ','.join(set(x.dropna())))

# If a cell has a borough but a Not assigned neighborhood, 
#then the neighborhood will be the same as the borough
toronto_df.loc[toronto_df['Neighborhood'] == 'Not assigned','Neighborhood'] = toronto_df['Borough']

toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
#print the shape and size

print("Shape: ", toronto_df.shape)

Shape:  (103, 3)


In [27]:
#Get the latitude and the longitude coordinates of each neighborhood

toronto_coord_df = pd.read_csv('http://cocl.us/Geospatial_data')

# Merge two dataframes
df= pd.merge(toronto_df,toronto_coord_df)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [32]:
# remove the "Postal Code" column
df.drop("Postal Code", axis=1, inplace=True)
df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


In [36]:
# explore the clustering of Toronto

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize  # tranform JSON file into a pandas dataframe

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


address = "Toronto"

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [38]:
# extract only borough that contains "Toronto"
df_denc = df[df['Borough'].str.contains("Toronto")].reset_index(drop=True)
df_denc.head()


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879


In [40]:
# explore the first neighbourhood in the toronto data
neighborhood_name = df_denc.loc[0, 'Neighborhood']
print(f"The first neighborhood's name is '{neighborhood_name}'.")

neighborhood_latitude = df_denc.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_denc.loc[0, 'Longitude'] # neighborhood longitude value

print('The Latitude and longitude of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

The first neighborhood's name is 'The Beaches'.
The Latitude and longitude of The Beaches are 43.67635739999999, -79.2930312.


In [43]:
# Now get the top 100 venues that are in The Beaches within a radius of 500 meters

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = ('https://api.foursquare.com/v2/venues/explore?client_id={}'
       '&client_secret={}&v={}&ll={},{}&radius={}&limit={}')

# get the result to a json file
results = requests.get(url).json()

In [44]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [45]:
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

# Filter the columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# Filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis = 1)

# Clean all column names
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

KeyError: 'groups'

In [48]:
df_denc_venues = getNearbyVenues(names=df_denc['Neighborhood'],
                                   latitudes=df_denc['Latitude'],
                                   longitudes=df_denc['Longitude']                    
df_denc_venues.head()

SyntaxError: invalid syntax (<ipython-input-48-4c56f6705b10>, line 4)