Join the data from Part 1 with the data from Part 2 to create a new dataframe.

In [233]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os # use this to access your environment variables
import requests # this will be used to call the APIs
from unidecode import unidecode # remove accents from venues e.g. café --> cafe
import re # regex search for strings


FOURSQUARE_KEY = os.getenv('FOURSQUARE_API_KEY')
YELP_KEY = os.getenv('YELP_API_KEY')

# you can print to confirm both API keys have been imported successfully from Terminal

In [161]:
bike_stations = pd.read_csv("../data/bike_stations_sat_3pm_clean.csv")

**This bike_stations data is based on the Saturday, 3pm citybikes API call**

In [163]:
bike_stations

Unnamed: 0,id,name,latitude,longitude,free_bikes,empty_slots,bike_availability,ll
0,fb337bbed72e2be090071e199899b2be,Queen St E / Woodward Ave,43.665269,-79.319796,18,1,94.74,"43.665269,-79.319796"
1,4ff88d5880e71aa40d34cfe5d09b0ca7,Primrose Ave / Davenport Rd,43.671420,-79.445947,1,14,6.67,"43.67142,-79.445947"
2,a09c67c0b419654d907c9134b108e328,Queen St E / Rhodes Ave,43.666224,-79.317693,12,11,52.17,"43.666224,-79.317693"
3,d6a9daee68070a8b106cfb598d81308c,Bond St / Queen St E,43.653236,-79.376716,5,32,13.51,"43.653236,-79.376716"
4,8f8af40d9388c8a3962559e8681d3db7,Church St / Alexander St,43.663722,-79.380288,3,32,8.57,"43.663722,-79.380288"
...,...,...,...,...,...,...,...,...
821,9be5f078a1ed47fc11cd3cee45260f63,Kennedy Rd/Ranstone Gdns (Jack Goodlad Park),43.741906,-79.271819,1,10,9.09,"43.741906,-79.271819"
822,4ae37f3bddfb819954a15143d277dbd9,Eglinton Ave E / Brimley Rd,43.736953,-79.247984,8,11,42.11,"43.736953,-79.247984"
823,e7968ab22d9a15db0673f463144428eb,College Park South,43.659457,-79.382365,7,11,38.89,"43.659457,-79.382365"
824,62acc308c0f93ff09d28e06c73afc3ec,165 McRae Dr,43.705875,-79.368006,4,11,26.67,"43.705875,-79.368006"


In [104]:
# testing the ability to parse for POI category, number of POIs for a sample API response, need the non-cleaned version
venue_df = pd.read_csv("../data/test_venue_df.csv")

In [269]:
sample_bike_df = bike_stations.sample(n=8)

In [271]:
sample_bike_df

Unnamed: 0,id,name,latitude,longitude,free_bikes,empty_slots,bike_availability,ll
467,2904c293698dc248004b5f43e1a28f80,Sherbourne St / Wellesley St E,43.6673,-79.374,1,29,3.33,"43.6673,-79.374"
599,3a73375ca8c3f3fc85b7b32ec080be5e,Greenwood Ave / Sammon Ave,43.686987,-79.334782,1,10,9.09,"43.686987,-79.334782"
90,dad51db43939490092b81d8cea72e018,Dufferin St / Sylvan Av (Dufferin Grove Park),43.655556,-79.433611,4,14,22.22,"43.655556,-79.433611"
587,ec3b9fd9e808fc3e1819e802f02ea442,Avenue Rd / Cumberland St,43.669822,-79.394446,8,4,66.67,"43.669822,-79.394446"
386,0faf9e7c7d5403b8f7dc7574fa6d0c6e,Roehampton Ave / Mount Pleasant Rd,43.70942,-79.39139,9,9,50.0,"43.70942,-79.39139"
202,fb0ff514e9bb7798f0a682f36f644b05,Sumach St / Queen St E,43.656638,-79.358749,5,13,27.78,"43.656638,-79.358749"
414,9d145fd59618d68dab06dba1c8900fe9,Bloor St W / Riverside Dr,43.647663,-79.487583,5,18,21.74,"43.647663,-79.487583"
334,65aefd1c1484ebd15a7827285971734e,Dawes Rd / Taylor Creek Trl,43.696631,-79.297436,11,8,57.89,"43.696631,-79.297436"


**Strategy is as follows:
Steps for the 10 stations in sample_bike_df, ensure it works for the 820 rows in the whole city bike dataset**:

1. Define my get_venues_fs function to call the Foursquare API. Set it up so that it can take a concatenated ll string, like the ll column of bike_stations and sample_bike_df.
2. Initialize a new bike_stations df with the columns to fill in for # of POIs, and # of POIs in certain categories.
3. Loop through each bike station, make the get_venues_fs call with its ll column to find venue results with the data in step 2. 
4. With the returned results, we'll parse for the number of POI categories, and the number of POIs generally (know that the upper limit of returns is 50, radius of the call is 800m to make <=50 results meaningful)
5. We should have a dataframe returned containing 1) the original ll call which we can use to join the bike_stations table later if we wish, 2) the number of POIs and number of establishments within POI categories, to train the model

Notes:

**I am doing step 2 here and not in yelp_foursquare_EDA to avoid bringing city_bikes data into the yelp_foursquare notebook, so any cross-referencing of city bikes with venue data will happen here**.

**In the likely event that some bike station ll's will retrieve establishments that were retrieved by another bike station (e.g. in Downtown Toronto where several stations will be within 800m of each other), I am retrieving the fsq_id for the establishments which will be checked for dupes, with dupes removed, before removing the column in cleaning to join it with a cleaned bike_stations df**

In [171]:
# Step 1 from Strategy: Defining the foursquare API Call here

def get_venues_fs(ll, radius, api_key, categories, limit):
    """
    Get amenities and POIs from Foursquare API call
    Args:
        ll (str): concatenated latitude,longitude that is used to supply the API and return POIs
        radius: number marking the radius in metres for the POIs to be collected from
        api_key (str): foursquare API to use for query (imported above)
        categories (str) : Foursquare-recognized place type. four categories provided at bottom of this cell
        limit (int): will be given as 50, the maximum for this API
    
    Returns:
        response: response object from the requests library.
    """
    url = "https://api.foursquare.com/v3/places/search"
    
    headers = {
        "Accept": "application/json",
        "Authorization": api_key
    }
    
    params = {
        "ll": ll,
        "radius": radius,
        "categories": categories,
        "limit": limit
    }
    
    response = requests.get(url, headers=headers, params=params)
    
    if response.status_code == 200: # 200 is success
        return response.json()
    else:
        response.raise_for_status()

categories = '10035,13003,13065,16000' # Category codes - bars, restaurants, live shows, outdoors


In [273]:
# Step 2: get the bike station dataframe ready for the Foursquare API data to fill in in step 3

sample_bike_df['n_pois'] = 0
sample_bike_df['n_bar_restaurant'] = 0
sample_bike_df['n_cafe'] = 0
sample_bike_df['n_live'] = 0
sample_bike_df['n_park'] = 0
sample_bike_df

Unnamed: 0,id,name,latitude,longitude,free_bikes,empty_slots,bike_availability,ll,n_pois,n_bar_restaurant,n_cafe,n_live,n_park
467,2904c293698dc248004b5f43e1a28f80,Sherbourne St / Wellesley St E,43.6673,-79.374,1,29,3.33,"43.6673,-79.374",0,0,0,0,0
599,3a73375ca8c3f3fc85b7b32ec080be5e,Greenwood Ave / Sammon Ave,43.686987,-79.334782,1,10,9.09,"43.686987,-79.334782",0,0,0,0,0
90,dad51db43939490092b81d8cea72e018,Dufferin St / Sylvan Av (Dufferin Grove Park),43.655556,-79.433611,4,14,22.22,"43.655556,-79.433611",0,0,0,0,0
587,ec3b9fd9e808fc3e1819e802f02ea442,Avenue Rd / Cumberland St,43.669822,-79.394446,8,4,66.67,"43.669822,-79.394446",0,0,0,0,0
386,0faf9e7c7d5403b8f7dc7574fa6d0c6e,Roehampton Ave / Mount Pleasant Rd,43.70942,-79.39139,9,9,50.0,"43.70942,-79.39139",0,0,0,0,0
202,fb0ff514e9bb7798f0a682f36f644b05,Sumach St / Queen St E,43.656638,-79.358749,5,13,27.78,"43.656638,-79.358749",0,0,0,0,0
414,9d145fd59618d68dab06dba1c8900fe9,Bloor St W / Riverside Dr,43.647663,-79.487583,5,18,21.74,"43.647663,-79.487583",0,0,0,0,0
334,65aefd1c1484ebd15a7827285971734e,Dawes Rd / Taylor Creek Trl,43.696631,-79.297436,11,8,57.89,"43.696631,-79.297436",0,0,0,0,0


In [275]:
# Step 3:

# lists for the augmented bike_stations for the model - number of POIs, bars/restaurants/live venues, parks to populate columns for the model training
n_poi_list = [] # this is solved by n_poi_list.append(len(res['results']))
n_restobar_list = []
n_live_list = []
n_park_list = []
n_cafe_list = []

# lists for the venues returned via the API call

fsq_id_list = []
ll_list = []
name_list = []
address_list = []
category_list = []

# set up regex patterns for finding # of bar/restaurant, cafes, parks, live venues to search within the category column. These are loose 'contain' matches
bar_restaurant_pattern = re.compile(r'\b(bar|restaurant|lounge|bbq|pub|grill|burger|chicken|diner|pizzeria|tavern|night club|nightclub)\b', re.IGNORECASE)
live_venue_pattern = re.compile(r'\b(concert|music venue|music|comedy|live|theater)\b', re.IGNORECASE)
park_pattern = re.compile(r'\b(park|playground|monument|plaza)\b', re.IGNORECASE)
cafe_pattern = re.compile(r'\b(cafe|coffee|coffee shop|tea|bakery|donut|deli)\b', re.IGNORECASE)


In [277]:
# Step 3:

for index, row in sample_bike_df.iterrows():
    n_restobar = 0 # I will iterate over each venue returned for the bike station's coordinates and ticker up for a positive match to the regex
    n_live = 0
    n_park = 0
    n_cafe = 0
    lat_long = row['ll']
    
    res = get_venues_fs(ll=lat_long, radius=800, api_key=FOURSQUARE_KEY, categories=categories, limit=50)
    n_poi_list.append(len(res['results'])) # the number of POIs is simply length of the results, no need to ticker.

    print("I got here!")
    
    for venue in res['results']:
        category_name = venue['categories'][0]['name'] # capturing the first category name in order to regex check it for one of our major categories of interest
        category_name = unidecode(category_name)
        
        if bar_restaurant_pattern.search(category_name):
            n_restobar += 1
        elif live_venue_pattern.search(category_name):
            n_live += 1
        elif park_pattern.search(category_name):
            n_park += 1
        elif cafe_pattern.search(category_name):
            n_cafe += 1

    n_restobar_list.append(n_restobar)
    n_live_list.append(n_live)
    n_park_list.append(n_park)
    n_cafe_list.append(n_cafe)
    

I got here!
I got here!
I got here!
I got here!
I got here!
I got here!
I got here!
I got here!


In [279]:
print(n_poi_list)
print(n_restobar_list)
print(n_live_list)
print(n_park_list)
print(n_cafe_list)

#print(f"{row['name']}, {row['ll']}, {row['n_pois']}") - just a spare print for the cell above

[48, 28, 50, 50, 50, 50, 33, 18]
[28, 20, 33, 32, 38, 27, 18, 9]
[2, 1, 2, 3, 0, 4, 1, 0]
[6, 3, 2, 3, 3, 4, 6, 7]
[5, 3, 10, 8, 4, 10, 5, 0]


In [282]:
sample_bike_df['n_pois'] = n_poi_list

In [284]:
sample_bike_df

Unnamed: 0,id,name,latitude,longitude,free_bikes,empty_slots,bike_availability,ll,n_pois,n_bar_restaurant,n_cafe,n_live,n_park
467,2904c293698dc248004b5f43e1a28f80,Sherbourne St / Wellesley St E,43.6673,-79.374,1,29,3.33,"43.6673,-79.374",48,0,0,0,0
599,3a73375ca8c3f3fc85b7b32ec080be5e,Greenwood Ave / Sammon Ave,43.686987,-79.334782,1,10,9.09,"43.686987,-79.334782",28,0,0,0,0
90,dad51db43939490092b81d8cea72e018,Dufferin St / Sylvan Av (Dufferin Grove Park),43.655556,-79.433611,4,14,22.22,"43.655556,-79.433611",50,0,0,0,0
587,ec3b9fd9e808fc3e1819e802f02ea442,Avenue Rd / Cumberland St,43.669822,-79.394446,8,4,66.67,"43.669822,-79.394446",50,0,0,0,0
386,0faf9e7c7d5403b8f7dc7574fa6d0c6e,Roehampton Ave / Mount Pleasant Rd,43.70942,-79.39139,9,9,50.0,"43.70942,-79.39139",50,0,0,0,0
202,fb0ff514e9bb7798f0a682f36f644b05,Sumach St / Queen St E,43.656638,-79.358749,5,13,27.78,"43.656638,-79.358749",50,0,0,0,0
414,9d145fd59618d68dab06dba1c8900fe9,Bloor St W / Riverside Dr,43.647663,-79.487583,5,18,21.74,"43.647663,-79.487583",33,0,0,0,0
334,65aefd1c1484ebd15a7827285971734e,Dawes Rd / Taylor Creek Trl,43.696631,-79.297436,11,8,57.89,"43.696631,-79.297436",18,0,0,0,0


In [None]:
# Json normalize!!!

Provide a visualization that you used as part of your EDA process. Explain the initial pattern or relationship you discoved through this visualization. 

# Database

Put all your results in an SQLite3 database (remember, SQLite stores its databases as files in your local machine - make sure to create your database in your project's data/ directory!)

Look at the data before and after the join to validate your data.