# This notebook will be used for the Capstone project

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Smoke test: confirm that the kernel executes code.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Week 3 - Applied Data Science Capstone

Problem 1: The code below is used for the Toronto neighborhood clustering exercise

In [3]:
from bs4 import BeautifulSoup
import requests

In [4]:
# Fetch the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of handing an error page to the parser.
page_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
page_response.raise_for_status()
source = page_response.text

In [5]:
# Parse the downloaded HTML using the lxml parser.
soup = BeautifulSoup(source, 'lxml')
# print(soup.prettify())  # debugging aid: dump the parsed document

In [6]:
# Take the first <table> element of the page.
# NOTE(review): assumes the postal-code table is the first table in the
# document - confirm against the live page.
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError in the cleaning cell.
    raise ValueError('No <table> element found in the downloaded page')
# print(table.prettify())  # debugging aid: dump the selected table

In [7]:
# columns = list()
# for headings in table.tr.find_all('th'):
#     print(headings.text)
#     columns.append(headings.text)

In [8]:
# Data cleaning: convert the HTML table into [postcode, borough, neighbourhood] records.
data = []
for table_row in table.find_all('tr'):
    cells = table_row.find_all('td')
    # Rows without <td> cells (e.g. the header row) and rows with fewer than
    # the three expected fields cannot be unpacked, so they are skipped.
    if len(cells) < 3:
        continue

    # strip() removes whitespace on both sides so the 'Not assigned'
    # comparisons below are not defeated by a leading newline or space.
    postalcode, borough, neighborhood = (cell.text.strip() for cell in cells[:3])

    # Rows whose borough is 'Not assigned' are dropped.
    if borough == 'Not assigned':
        continue
    # A row with a borough but no neighbourhood reuses the borough name.
    if neighborhood == 'Not assigned':
        neighborhood = borough
    data.append([postalcode, borough, neighborhood])

# Column names are taken from the <th> cells of the table's first row.
col_head = [header.text.strip() for header in table.tr.find_all('th')]

In [9]:
# Show the column names scraped from the table header.
col_head

['Postcode', 'Borough', 'Neighbourhood']

In [10]:
# Build the DataFrame from the cleaned rows and report its (rows, columns) shape.
df = pd.DataFrame(columns=col_head, data=data)
print(df.shape)

(210, 3)


In [11]:
# Summary of the scraped rows before neighbourhoods are merged per postcode.
df.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,210,210,210
unique,103,10,208
top,M9V,Etobicoke,St. James Town
freq,8,45,2


In [12]:
# Collapse the frame to one row per postcode: keep the first borough seen and
# concatenate all neighbourhood names of that postcode with ', '.
merge_rules = {'Borough': 'first', 'Neighbourhood': ', '.join}
df = df.groupby('Postcode').agg(merge_rules).reset_index()

In [13]:
# Summary after merging: each postcode should now appear exactly once.
df.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,103,103,103
unique,103,10,103
top,M3C,North York,"The Kingsway, Montgomery Road, Old Mill North"
freq,1,24,1


In [14]:
# Preview the first five merged rows.
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [15]:
# (rows, columns) of the merged frame.
df.shape

(103, 3)

Problem 2: Fetching Latitude & Longitude information

In [26]:
# Download the postcode -> latitude/longitude CSV.
# requests (already imported by this notebook) is used instead of shelling out
# to wget so the cell does not depend on an external binary, and so a failed
# download raises instead of still printing the success message.
geo_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()
with open('geospatial_data.csv', 'wb') as geo_file:
    geo_file.write(geo_response.content)
print('Data downloaded!')

Data downloaded!


In [29]:
# Load the downloaded coordinates file and preview its first five rows.
dfgeo = pd.read_csv(filepath_or_buffer="geospatial_data.csv")
dfgeo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
# Rename the key column so it matches 'Postcode' in the scraped frame.
# Rebinding the result (rather than inplace=True) avoids mutating the frame
# that the previous cell displayed.
dfgeo = dfgeo.rename(columns={'Postal Code': 'Postcode'})

In [32]:
# Confirm the column is now called 'Postcode'.
dfgeo.head(5)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [45]:
# Attach coordinates to every postcode row. A left join keeps all rows of df,
# so postcodes missing from dfgeo end up with NaN coordinates.
df2 = df.merge(dfgeo, how='left', on="Postcode")

In [49]:
# Preview the merged frame with the Latitude / Longitude columns.
df2.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Problem 3: Clustering

In [51]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Zid1295\Anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |   py37hc8dfbb8_1         149 KB  conda-forge
    conda-4.8.3                |   py37hc8dfbb8_0         3.1 MB  conda-forge
    python_abi-3.7             |          1_cp37m           4 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be INSTALLED:

  python_abi         conda-forge/win-64::python_abi-3.7-1_cp37m

The following packages will be UPDATED:

  certifi                                 2019.11.28-py37_0 --> 2019.11.28-py37hc8dfbb8_1
  conda                 

In [52]:
address = 'Toronto, Canada'

# Nominatim expects the caller to identify itself: constructing it without a
# user_agent is deprecated in geopy 1.x and rejected by newer geopy releases.
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of the City of Toronto are 43.653963, -79.387207.


In [58]:
# create a map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# one circle marker per row of df2, with a "<neighbourhood>, <borough>" popup
for lat, lng, borough, neighborhood in df2[
    ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
].itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=popup,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
    )
    marker.add_to(map_toronto)

map_toronto

## Explore Toronto Neighbourhoods

In [130]:
# Define Foursquare credentials and API version.
# The credentials are read from environment variables so they never have to be
# typed into (and saved with) the notebook; both default to '' when unset.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604'  # sent as the "v" query parameter
LIMIT = 30  # sent as the "limit" query parameter
# Report only whether each credential is present; printing the values would
# store them in the notebook's saved output.
print('Your credentails:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'missing'))
print('CLIENT_SECRET:' + ('set' if CLIENT_SECRET else 'missing'))

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


In [62]:
# Keep only the rows whose borough name contains 'Toronto'
is_toronto = df2['Borough'].str.contains('Toronto')
df3 = df2[is_toronto]

# Renumber the rows from 0 so positional labels can be used below
df4 = df3.reset_index(drop=True)
df4.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [68]:
# Map of Toronto Neighbourhoods only

In [66]:
# create a closer-zoomed map centred on the geocoded Toronto coordinates
map_tohood = folium.Map(location=[latitude, longitude], zoom_start=12)

# one circle marker per row of df4, with a "<neighbourhood>, <borough>" popup
for lat, lng, borough, neighborhood in df4[
    ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
].itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=popup,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
    )
    marker.add_to(map_tohood)

map_tohood

In [69]:
# First neighborhood: the 'Neighbourhood' value of row 0 in df4
df4.loc[0, 'Neighbourhood']

'The Beaches'

In [73]:
# Pull the coordinates and name of row 0 of df4 as scalars.
neighbourhood_lat = df4.at[0, 'Latitude']  # neighbourhood latitude value
neighbourhood_long = df4.at[0, 'Longitude']  # neighbourhood longitude value
neighbourhood_name = df4.at[0, 'Neighbourhood']  # neighbourhood name

In [74]:
radius = 500

# Query template for the Foursquare "explore" endpoint. The placeholders are,
# in order: client id, client secret, version, latitude, longitude, radius, limit.
explore_url_template = (
    'https://api.foursquare.com/v2/venues/explore'
    '?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
)

# Full request URL (contains the real credentials - do not display it).
url = explore_url_template.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighbourhood_lat,
    neighbourhood_long,
    radius,
    LIMIT)

# Show the request with the credentials masked, so they are not written into
# the notebook's saved output.
explore_url_template.format(
    '<CLIENT_ID>',
    '<CLIENT_SECRET>',
    VERSION,
    neighbourhood_lat,
    neighbourhood_long,
    radius,
    LIMIT)

'https://api.foursquare.com/v2/venues/explore?&client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&v=20180604&ll=43.67635739999999,-79.2930312&radius=500&limit=30'

In [75]:
# Call the Foursquare API and decode the JSON body.
# The timeout prevents an unbounded wait, and raise_for_status() reports an
# HTTP error here instead of as a KeyError in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5e6df53b6001fe001b9b8762'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [109]:
# Count the items in the first group of the response for the selected neighbourhood.
explore_items = results['response']['groups'][0]['items']
'There are {} places to explore near "{}" neighborhood.'.format(len(explore_items), neighbourhood_name)

'There are 4 places to explore near "The Beaches" neighborhood.'

In [76]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide a 'categories' entry or, failing that, a
        'venue.categories' entry holding a sequence of category dicts.

    Returns
    -------
    The 'name' of the first category, or None when the entry is empty or is
    not a sized value (e.g. None or NaN for a missing value).

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate
        # instead of being swallowed by a bare except.
        categories_list = row['venue.categories']

    try:
        category_count = len(categories_list)
    except TypeError:
        # None / NaN have no length: treat them as "no category".
        return None

    if category_count == 0:
        return None
    return categories_list[0]['name']

In [116]:
venues = results['response']['groups'][0]['items']

# flatten the nested venue JSON into one row per venue
nearby_venues = json_normalize(venues)

# keep only the venue name, category list and coordinates
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the part of each column name after its last dot
# (e.g. 'venue.location.lat' -> 'lat')
nearby_venues.columns = [col.rpartition(".")[2] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [117]:
# Report how many venue rows the flattened response contains.
print('{} venues were returned by Foursquare.'.format(len(nearby_venues)))

4 venues were returned by Foursquare.


In [129]:
# create a street-level map centred on the selected neighbourhood
neighbourhood_centre = [neighbourhood_lat, neighbourhood_long]
map_tohood = folium.Map(location=neighbourhood_centre, zoom_start=16)

# large red circle marking the selected neighbourhood itself
centre_marker = folium.CircleMarker(
    location=neighbourhood_centre,
    radius=12,
    color='red',
    popup=neighbourhood_name,
    fill=True,
    fill_color='red',
    fill_opacity=0.6,
)
centre_marker.add_to(map_tohood)

# smaller green circles for each venue, with a "<name>, <category>" popup
for lat, lng, name, categories in nearby_venues[
    ['lat', 'lng', 'name', 'categories']
].itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(name, categories), parse_html=True)
    venue_marker = folium.CircleMarker(
        location=[lat, lng],
        radius=6,
        popup=popup,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
    )
    venue_marker.add_to(map_tohood)

map_tohood