# Capstone project
## Applied Data Science Capstone

## PART 1

Import the core data-analysis libraries (pandas and NumPy).

In [1]:
# required std-libraries
import pandas as pd
import numpy as np

get data from URL

In [2]:
import requests

data_source = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
result = requests.get(data_source, timeout=30)
# Stop here with a clear HTTP error instead of failing later while parsing an error page.
result.raise_for_status()

scrape the data with BeautifulSoup

In [3]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(result.content, 'html.parser')
table = soup.find('table')    # first <table> element of the page
trs = table.find_all('tr')    # every row of that table

# Keep the cell lists of rows that contain at least one <td>; rows without any are dropped.
rows = [cells for cells in (tr.find_all('td') for tr in trs) if cells]

lst = []
for row in rows:
    # The first three cells hold postal code, borough and neighborhood (trailing whitespace removed).
    postalcode, borough, neighborhood = (row[k].text.rstrip() for k in range(3))
    if borough == 'Not assigned':
        continue                      # no borough -> the row carries no usable data
    if neighborhood == 'Not assigned':
        neighborhood = borough        # fall back to the borough name
    lst.append([postalcode, borough, neighborhood])

# Build the dataframe from the collected records
cols = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(lst, columns=cols)
print(df.shape)

(211, 3)


check the dataframe

In [4]:
# preview the first rows of the scraped table
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [5]:
# Look up postal code M5A: every scraped table row was appended separately,
# so a postal code can occupy more than one row at this stage.
df.loc[df['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park


Collapse the table so that each PostalCode appears only once, joining its neighborhoods into a comma-separated list.

In [6]:
# One output row per postal code: keep the first borough and join the neighborhood names with ', '.
aggregations = {'Borough': 'first', 'Neighborhood': ', '.join}
df = df.groupby('PostalCode').agg(aggregations).reset_index()

In [7]:
# (rows, columns) after grouping by postal code
df.shape

(103, 3)

In [8]:
# M5A should now be a single row whose neighborhoods are joined by ', '
df.loc[df['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"


## PART 2

get the coordinates from the csv file and check them

In [9]:
# Load the coordinate table and rename its key column so it matches `df` for the merge.
dfgeo = (
    pd.read_csv('https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)

In [10]:
# preview the first rows of the coordinate table
dfgeo.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


merge the two dataframes together

In [11]:
# Left join: every row of `df` is kept and coordinates are attached by PostalCode.
df_co = df.merge(dfgeo, how='left', on='PostalCode')

In [12]:
# spot-check one postal code (M5G) in the merged frame
df_co.loc[df_co['PostalCode'] == 'M5G']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [13]:
# first ten rows of the merged frame
df_co.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## PART 3

Import the required packages; install them first if an import error is raised.

In [14]:
# !conda install -c conda-forge geopy --yes # uncomment this line if geopy is not installed yet
from geopy.geocoders import Nominatim
!conda install -c conda-forge folium=0.5.0 --yes # runs on every execution of this cell - comment it out once folium is installed
import folium # map rendering library


Solving environment: done

# All requested packages already installed.



Let's get the coordinates of Toronto.

In [15]:
address = 'Toronto, Canada'

# Nominatim expects an identifying user agent: geopy warns when it is missing
# and newer geopy releases refuse to build the geocoder without one.
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The latitude and the longitude of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The latitude and the longitude of Toronto are 43.653963, -79.387207.


create a map of Toronto using the retrieved coordinates

In [16]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row of df_co; the popup shows "<neighborhood>, <borough>"
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in df_co[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

Narrow the data down and analyse only the boroughs whose name contains 'Toronto'.

In [17]:
# Rows whose borough name contains "Toronto", re-indexed from 0
toronto_mask = df_co['Borough'].str.contains("Toronto")
df_breakdown = df_co.loc[toronto_mask].reset_index(drop=True)
print(df_breakdown.shape)
df_breakdown

(38, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [18]:
# Fresh base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row of the Toronto subset; the popup shows the neighborhood text
for lat, lng, hood in df_breakdown[['Latitude', 'Longitude', 'Neighborhood']].itertuples(index=False, name=None):
    label = folium.Popup(hood, parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [19]:
# Let us explore the first neighborhood in the subset, "The Beaches"

In [20]:
# neighborhood stored in the first row (index label 0) of the Toronto subset
df_breakdown.loc[0, 'Neighborhood']

'The Beaches'

In [21]:
# Name and coordinates of the neighborhood in row 0 of the Toronto subset
hood_name = df_breakdown.at[0, 'Neighborhood']
hood_latitude = df_breakdown.at[0, 'Latitude']
hood_longitude = df_breakdown.at[0, 'Longitude']

summary = 'Latitude and longitude values of {} are {}, {}.'.format(
    hood_name, hood_latitude, hood_longitude)
print(summary)

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [22]:
# @hidden_cell
import os

# Credentials are read from the environment when available, so real keys never
# have to be saved inside the notebook; the original placeholders stay as fallback.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'IP')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'IP')
VERSION = '20180605'  # sent as the `v` parameter of the Foursquare request

In [24]:
import json
# json_normalize (flattens nested JSON into a pandas dataframe) lives at the
# pandas top level in current releases; the old location is kept as a fallback.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Ask for at most LIMIT venues within `radius` meters of the neighborhood coordinates
LIMIT = 100
radius = 600

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    hood_latitude, 
    hood_longitude, 
    radius, 
    LIMIT)

In [25]:
# Call the explore endpoint and decode the JSON body
response = requests.get(url)
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5d43e902fd16bb002c175408'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 19,
  'suggestedBounds': {'ne': {'lat': 43.6817574054, 'lng': -79.28557885738863},
   'sw': {'lat': 43.67095739459999, 'lng': -79.30048354261137}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distance': 89,


In [26]:
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record holding the category list under the key ``'categories'`` or,
        when that key is absent, under ``'venue.categories'``.

    Returns
    -------
    str or None
        The ``'name'`` of the first category, or ``None`` when the category
        value is empty or is not a list (for example a missing value).

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates.
        categories_list = row['venue.categories']

    # A missing value (e.g. NaN) has no len(); treat it like an empty list.
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [27]:
# Turn the venue items of the API response into a tidy dataframe

# venue records of the first result group
venues = results['response']['groups'][0]['items']

# flatten the nested JSON, then keep only the columns of interest
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
venues_flat = json_normalize(venues)
nearby_venues = venues_flat.loc[:, filtered_columns]

# replace each category list by the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the last dotted component of every column name
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,Tori's Bakeshop,Vegetarian / Vegan Restaurant,43.672114,-79.290331
2,The Beech Tree,Gastropub,43.680493,-79.288846
3,Beaches Bake Shop,Bakery,43.680363,-79.289692
4,Ed's Real Scoop,Ice Cream Shop,43.67263,-79.287993
5,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
6,Mastermind Toys,Toy / Game Store,43.671453,-79.293971
7,Beacher Cafe,Breakfast Spot,43.671938,-79.291238
8,Veloute Bistro,French Restaurant,43.672267,-79.289584
9,Xola,Mexican Restaurant,43.672603,-79.28808


Everything found is within walking distance ;-)
A nice place to be.