In [2]:
# imports
import numpy as np
import pandas as pd
import requests
import os 

# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [50]:
"""RESTAURANT INFORMATION"""
# import data from previous part/defining variables
bikeDF = pd.read_csv('../data/Quebec_BikeCity.csv')

# creating variables for the request
FOURSQUARE_KEY = os.environ['FOURSQUARE_API_KEY']
url = "https://api.foursquare.com/v3/places/search"

# one list that will contain a bunch of dictionaries
FScontainerBARS = []

# loops through all the different bike stations, grabbing all the restaurants/bars within a 1000m radius
for index, row in bikeDF.iterrows():
    lat = row['Latitude']
    long = row['Longitude']

    # set parameters so it grabs specific information on bar/restuarants
    params = {
        'll': f'{lat},{long}',
        'categoryId': '13003,13065',  # ID for dining and drinking
        "radius": '1000',
        'fields': 'name,location,categories,distance,rating'
    }
    headers = {
        "Accept": "application/json",
        "Authorization": FOURSQUARE_KEY
    }

    # gather the data and turn it into a JSON
    request = requests.get(url, params=params, headers=headers)


    data = request.json()

    # create a more refined JSON which goes into the 'results' key (this will make it easier to call specific elements later)
    resultsJSON = data['results']
    # turn the refined JSON into a dataframe(easier to grab info from)
    resultsDF = pd.DataFrame(resultsJSON)

    # #if the bar/restaurant has a rating, save it as a var
    # #if the bar/restaurant has no rating, set it a null/None
    try:
        resultsDF['rating']
    except:
        BarRating = 0
    else:
        BarRating = resultsDF['rating']


    # add a dictionary, containing all the relavent info, into the empty list
    # each entry in the list corresponds to a singular bike station
    FScontainerBARS.append({
        'Latitude': lat,
        'Longitude': long,
        'Rest/Bar Names': resultsDF['name'],
        'Distance': resultsDF['distance'],
        'Rest/Bar Rating': BarRating
    })


Repeat process to get all gyms within a 1000m of each bike station

In [4]:
"""GYM INFORMATION"""

#one list that will contain a bunch of dictionaries
FScontainerGYMS =[]

#loops through all the different bike stations, grabbing all the gyms within a 1000m radius
for index, row in bikeDF.iterrows():
    lat = row['Latitude']
    long = row['Longitude']

    # set parameters so it grabs specific information on gyms
    params = {
        'll': f'{lat},{long}',
        'categories': 18021,  # ID for gyms and studios
        "radius": '1000',
        'fields': 'name,location,categories,distance'
    }
    headers = {
        "Accept": "application/json",
        "Authorization": FOURSQUARE_KEY
    }

    #gather the data and turn it into a JSON
    request = requests.get(url, params=params, headers=headers)
    data=request.json()

    #create a more refined JSON which goes into the 'results' key (this will make it easier to call specific elements later)
    resultsJSON = data['results']
    #turn the refined JSON into a dataframe(easier to grab info from)
    resultsDF = pd.DataFrame(resultsJSON)
    


    #if there is a gym within a 1km radious, then save the names and distances
    #if there is no gym in the area, set name as null and distance as 0 (easier for calculations and spotting outliers later)
    if len(resultsJSON) > 0:
        GYMname =resultsDF['name']
        GYMdistance = resultsDF['distance']
    else:
        GYMname = None
        GYMdistance = 0,0 #double zero so we can use lambda later when calculating averages
    
    #add a dictionary, containing all the relavent info, into the empty list
    #each entry in the list corresponds to a singular bike station
    FScontainerGYMS.append({
            'Latitude': lat,
            'Longitude': long,
            'Gym Names': GYMname,
            'Distance': GYMdistance
            })


Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [None]:
#requests and parsing of response had been completed in prior step/cell

Put your parsed results into a DataFrame

In [53]:
#Parsing the information into dataframes
FSbar_DF = pd.DataFrame(FScontainerBARS)
FSgym_DF = pd.DataFrame(FScontainerGYMS)

FSbar_DF
FSgym_DF

Unnamed: 0,Latitude,Longitude,Gym Names,Distance
0,46.786588,-71.258231,0 Centre de Yoga Ste...,0 660 1 672 2 680 3 807 4 668 5...
1,46.784041,-71.249391,0 Centre d'Entraînement Privé Pierr...,"0 140 1 137 2 149 Name: distance, dty..."
2,46.829433,-71.244066,0 Centre de Yoga Qué...,0 892 1 262 2 421 3 703 4 765 5...
3,46.812403,-71.220411,0 Planete Fit...,0 460 1 394 2 248 3 537 4 604 5...
4,46.826553,-71.245978,0 Danz Québec 1 Ba...,0 381 1 625 2 674 3 754 Name: dist...
...,...,...,...,...
69,46.814340,-71.224898,,"(0, 0)"
70,46.816606,-71.241658,0 Énergie Cardio 1 Pi...,0 890 1 467 2 697 3 827 4 501 5...
71,46.783205,-71.276189,0 Stationnement du PEPS 1 ...,0 114 1 592 2 618 3 872 4 958 5...
72,46.811548,-71.235388,0 L'École de danse d...,0 333 1 478 2 676 3 738 4 ...


Transform, join and clean Foursquare data

In [54]:
#Transforming data to get average distances
FSbar_DF['Average Distance_Bar'] = FSbar_DF['Distance'].apply(lambda x: sum(x)/len(x))
FSgym_DF['Average Distance_Gym'] = FSgym_DF['Distance'].apply(lambda x: sum(x)/len(x))

#left join 
FOURSQUARE_DF = pd.merge(FSbar_DF,FSgym_DF, on=['Latitude','Longitude'], how='left')

#cleaning data 
FOURSQUARE_DF.drop(columns=['Rest/Bar Names', 'Distance_x', 'Gym Names','Distance_y'], inplace=True)
FOURSQUARE_DF

Unnamed: 0,Latitude,Longitude,Rest/Bar Rating,Average Distance_Bar,Average Distance_Gym
0,46.786588,-71.258231,0 8.1 1 NaN 2 8.5 3 7.0 4 7.7 5...,711.3,777.125000
1,46.784041,-71.249391,0 7.8 1 8.5 2 6.6 3 7.7 4 7.0 5...,355.2,142.000000
2,46.829433,-71.244066,0 8.1 1 NaN 2 NaN 3 NaN 4 6.0 5...,302.2,701.857143
3,46.812403,-71.220411,0 9.1 1 8.6 2 8.5 3 8.9 4 9.0 5...,376.9,568.800000
4,46.826553,-71.245978,0 8.1 1 9.1 2 NaN 3 NaN 4 NaN 5...,395.1,608.500000
...,...,...,...,...,...
69,46.814340,-71.224898,0 8.6 1 8.5 2 9.0 3 8.8 4 8.9 5...,345.0,0.000000
70,46.816606,-71.241658,0 8.6 1 7.9 2 7.8 3 8.6 4 9.2 5...,663.5,688.571429
71,46.783205,-71.276189,0 7.6 1 NaN 2 8.4 3 8.1 4 7.1 5...,602.4,678.500000
72,46.811548,-71.235388,0 8.6 1 9.2 2 8.0 3 8.5 4 9.1 5...,223.1,674.300000


In [55]:
#Saving the dataframe as a .csv

#setting path
path = "../data/FOURSQUARE_DF.csv"

#saving
FOURSQUARE_DF.to_csv(path, index=False)

# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [9]:
"""BARS AND RESTAURANTS"""
#creating variables for the request
YELP_KEY = os.environ['YELP_API_KEY']
url = "https://api.yelp.com/v3/businesses/search"

#one list that will contain a bunch of dictionaries
YELPcontainerBARS =[]

#loops through all the different bike stations, grabbing all the restaurants/bars within a 1000m radius
for index, row in bikeDF.iterrows():
    lat = row['Latitude']
    long = row['Longitude']

    params = {
        'latitude': lat,
        'longitude': long,
        'radius': '1000',
        'categories': 'restaurants,bars',
    }
    headers = {
        'Authorization': 'Bearer '+ YELP_KEY
    }

    response = requests.get(url, params=params, headers=headers)
    YELPdata = response.json()


    #create a more refined JSON which goes into the 'busisnesses' key (this will make it easier to call specific elements later)
    YELPJSON = YELPdata['businesses']
    
    #turn the refined JSON into a dataframe(easier to grab info from)
    resultsDF = pd.DataFrame(YELPJSON)

    # #if the bar/restaurant has a rating, save it as a var
    # #if the bar/restaurant has no rating, set it a null/None
    try:
        resultsDF['rating']
    except:
        BarRating = 0
    else:
        BarRating = resultsDF['rating']

    #add a dictionary, containing all the relavent info, into the empty list
    #each entry in the list corresponds to a singular bike station
    YELPcontainerBARS.append({
        'Latitude': lat,
        'Longitude': long,
        'Names':resultsDF['name'] ,
        'Distance': resultsDF['distance'],
        'REST/BAR RATING': BarRating
        })
    

Repeat proccess to find all gyms within 1000m

In [14]:
"""GYM INFORMATION"""
#creating variables for the request

#one list that will contain a bunch of dictionaries
YELPcontainerGYM =[]

#loops through all the different bike stations, grabbing all the restaurants/bars within a 1000m radius
for index, row in bikeDF.iterrows():
    lat = row['Latitude']
    long = row['Longitude']

    params = {
        'latitude': lat,
        'longitude': long,
        'radius': '1000',
        'categories': 'gyms',
    }
    headers = {
        'Authorization': 'Bearer '+ YELP_KEY
    }

    response = requests.get(url, params=params, headers=headers)
    YELPdata = response.json()


    #create a more refined JSON which goes into the 'busisnesses' key (this will make it easier to call specific elements later)
    YELPJSON = YELPdata['businesses']

    #turn the refined JSON into a dataframe(easier to grab info from)
    resultsDF = pd.DataFrame(YELPJSON)


    #if there is a gym within a 1km radious, then save the names and distances
    #if there is no gym in the area, set name as null and distance as 0 (easier for calculations and spotting outliers later)
    if len(YELPJSON) > 0:
        GYMname =resultsDF['name']
        GYMdistance = resultsDF['distance']
    else:
        GYMname = None
        GYMdistance = 0,0 #double zero so we can use lambda later when calculating averages
    

    #add a dictionary, containing all the relavent info, into the empty list
    #each entry in the list corresponds to a singular bike station
    YELPcontainerGYM.append({
        'Latitude': lat,
        'Longitude': long,
        'Names': GYMname,
        'Distance': GYMdistance,
        })
    

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [None]:
#requests and parsing of response had been completed in prior step/cell

Put your parsed results into a DataFrame

In [21]:
#Parsing the information into dataframes
YELPbar_DF = pd.DataFrame(YELPcontainerBARS)
YELPgym_DF = pd.DataFrame(YELPcontainerGYM)

YELPbar_DF
YELPgym_DF

Unnamed: 0,Latitude,Longitude,Names,Distance,REST/BAR RATING
0,46.786588,-71.258231,0 Montego 1 ...,0 1037.587652 1 882.052080 2 825...,0 4.5 1 4.5 2 4.5 3 4.0 4 ...
1,46.784041,-71.249391,0 Pizzéria No 900 Napolitaine 1 Boulan...,0 374.013162 1 223.246940 2 510.25...,0 4.5 1 4.5 2 4.5 3 4.0 4 ...
2,46.829433,-71.244066,0 Casa Calzone 1 Resto...,0 807.191123 1 486.594258 2 614...,0 5.0 1 4.5 2 5.0 3 5.0 4 ...
3,46.812403,-71.220411,0 Le Hobbit 1 ...,0 225.058013 1 342.196217 2 355.84...,0 4.0 1 4.5 2 5.0 3 4.0 4 ...
4,46.826553,-71.245978,0 Casa Calzone 1 ...,0 573.017690 1 702.032581 2 749...,0 5.0 1 5.0 2 4.5 3 4.0 4 ...
...,...,...,...,...,...
69,46.814340,-71.224898,0 Le Café du Clocher Penché 1...,0 301.802695 1 154.451566 2 152...,0 4.5 1 4.0 2 4.0 3 4.5 4 ...
70,46.816606,-71.241658,0 Patente et Machin 1 ...,0 866.148530 1 1025.214432 2 641...,0 4.5 1 4.5 2 5.0 3 4.5 4 ...
71,46.783205,-71.276189,0 Café au Temps Perdu 1 ...,0 772.434578 1 936.832294 2 607.98...,0 4.0 1 5.0 2 3.5 3 3.5 4 ...
72,46.811548,-71.235388,0 Patente et Machin 1...,0 187.017841 1 137.076666 2 44...,0 4.5 1 4.5 2 4.0 3 4.5 4 ...


Transform, join and clean Yelp data

In [56]:
#Transforming data to get average distances
YELPbar_DF['Average Distance_Bar'] = YELPbar_DF['Distance'].apply(lambda x: sum(x)/len(x))
YELPgym_DF['Average Distance_Gym'] = YELPgym_DF['Distance'].apply(lambda x: sum(x)/len(x))

#left join 
YELP_DF = pd.merge(YELPbar_DF,YELPgym_DF, on=['Latitude','Longitude'], how='left')

#cleaning data 
YELP_DF.drop(columns=['Names_x', 'Distance_x', 'Names_y','Distance_y'], inplace=True)
YELP_DF

Unnamed: 0,Latitude,Longitude,REST/BAR RATING,Average Distance_Bar,Average Distance_Gym
0,46.786588,-71.258231,0 4.5 1 4.5 2 4.5 3 4.0 4 ...,877.117260,0.000000
1,46.784041,-71.249391,0 4.5 1 4.5 2 4.5 3 4.0 4 ...,334.688704,0.000000
2,46.829433,-71.244066,0 5.0 1 4.5 2 5.0 3 5.0 4 ...,775.248808,960.440552
3,46.812403,-71.220411,0 4.0 1 4.5 2 5.0 3 4.0 4 ...,526.095304,596.684360
4,46.826553,-71.245978,0 5.0 1 5.0 2 4.5 3 4.0 4 ...,772.152088,0.000000
...,...,...,...,...,...
69,46.814340,-71.224898,0 4.5 1 4.0 2 4.0 3 4.5 4 ...,395.755111,727.229924
70,46.816606,-71.241658,0 4.5 1 4.5 2 5.0 3 4.5 4 ...,854.998115,0.000000
71,46.783205,-71.276189,0 4.0 1 5.0 2 3.5 3 3.5 4 ...,663.152084,551.620940
72,46.811548,-71.235388,0 4.5 1 4.5 2 4.0 3 4.5 4 ...,502.543619,0.000000


In [20]:
#Saving the dataframe as a .csv

#setting path
path = "../data/YELP_DF.csv"

#saving
YELP_DF.to_csv(path, index=False)

# Comparing Results

Which API provided you with more complete data? Provide an explanation. 

Based on results, Yelps offers higher quality data as Fourspuare's API may potentially restrict the number of results returned. For a vast majority of the bike stations within the Foursquare dataset, the number of restaurants returned appears to be retricted to 10 instances whereas no apparent resctrictions seem to be present within the Yelp API. The abundance of information generated by the Yelp API provides a better foundation for statistical analysis.

Note: There appears to be a discrepancy between average gym distance between the two datasets. Despite both APIs having 46 stations with no gyms close by, they are not coordinate. For instance, in index 43 (Frasier/Cartier) there are no local gyms within the Foursquare dataset but there appears to be one within the Yelp dataset. Further examples may be seen within the joining_data.ipynb notebook.

In [47]:
#only 9 locations
FSbar_DF['Rest/ Bar Rating'][0]

0    8.1
1    NaN
2    8.5
3    7.0
4    7.7
5    NaN
6    8.2
7    8.1
8    7.8
9    NaN
Name: rating, dtype: float64

In [44]:
YELPbar_DF['REST/BAR RATING'][0]

0     4.5
1     4.5
2     4.5
3     4.0
4     4.0
5     4.0
6     4.5
7     4.5
8     4.5
9     3.5
10    3.5
11    3.0
12    3.5
13    3.5
14    4.0
15    5.0
16    4.5
17    3.5
18    3.5
19    4.0
Name: rating, dtype: float64

In [22]:
FOURSQUARE_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Latitude              74 non-null     float64
 1   Longitude             74 non-null     float64
 2   REST/BAR RATING       74 non-null     object 
 3   Average Distance_Bar  74 non-null     float64
 4   Average Distance_Gym  74 non-null     float64
dtypes: float64(4), object(1)
memory usage: 3.0+ KB


In [30]:
#46 stations with no local gyms across both dfs
YELP_DF[YELP_DF['Average Distance_Gym'] == 0].count()

Latitude                46
Longitude               46
REST/BAR RATING         46
Average Distance_Bar    46
Average Distance_Gym    46
dtype: int64

In [34]:
FOURSQUARE_DF[FOURSQUARE_DF['Average Distance_Gym'] == 0].count()

Latitude                46
Longitude               46
REST/BAR RATING         46
Average Distance_Bar    46
Average Distance_Gym    46
dtype: int64

Get the top 10 restaurants according to their rating

In [57]:
"""FOURSQUARE"""
#empty lists which will later be used for columns in the dataframe
names = []
ratings = []

#un-nest the FS dataframes and fill lists
station=0
while station < len(FSbar_DF['Rest/Bar Rating']):
    names.append(FSbar_DF['Rest/Bar Names'][station].to_list()) 
    ratings.append(FSbar_DF['Rest/Bar Rating'][station])
    station +=1
    
#list of all restaurants in a flattened list
NamesList= list(np.concatenate(names))
NamesList

#extracting info out of each the series in the ratings variable, turning it into a flattened list
flattened_ratings = []
for series in ratings:
    try:
        flattened_ratings.extend(series) # Attempt to iterate over the series
    except TypeError:
        flattened_ratings.append(series) # If TypeError occurs, treat the element as a single item

#creating a DF with our cleaned data
RESTAURANTSwithRATINGS = pd.DataFrame(NamesList, columns=['Restaurants'])
RESTAURANTSwithRATINGS['Ratings'] = flattened_ratings

#removing duplicates (31 duplicates)
RESTAURANTSwithRATINGScleaned = RESTAURANTSwithRATINGS.drop_duplicates()

#top 10 restaurants
RESTAURANTSwithRATINGScleaned.sort_values('Ratings', ascending=False).head(10)


Unnamed: 0,Restaurants,Ratings
130,Terrasse Dufferin,9.3
107,Plains of Abraham (Plaines d'Abraham),9.3
111,Restaurant le Saint-Amour,9.2
171,Mille et une Pizzas,9.2
381,Le Bouchon du Pied Bleu,9.2
173,Restaurant Légende,9.1
30,Cantook Micro Torréfaction,9.1
167,Soupe & Cie,9.1
385,Patente et Machin,9.1
41,Videotron Center,9.1


Note: Foursquare utilizes a 0-10 rating system whereas Yelp utilizes a 0-5 system
Notice the top 10 restaurants vary depending which dataset is used. As the Yelp API contains more information, most notably more restaurants surrounding each station, it likely contains information on highly rated restaurants not present within the Foursquare API; the locations in the Yelp results may simply be newer locations with fewer, higher star rated, reviews

In [49]:
"""YELP"""
#empty lists which will later be used for columns in the dataframe
names = []
ratings = []

#un-nest the FS dataframes and fill lists
station=0
while station < len(YELPbar_DF['REST/BAR RATING']):
    names.append(YELPbar_DF['Names'][station].to_list()) 
    ratings.append(YELPbar_DF['REST/BAR RATING'][station])
    station +=1
    
#list of all restaurants in a flattened list
NamesList= list(np.concatenate(names))
NamesList

#extracting info out of each the series in the ratings variable, turning it into a flattened list
flattened_ratings = []
for series in ratings:
    try:
        flattened_ratings.extend(series) # Attempt to iterate over the series
    except TypeError:
        flattened_ratings.append(series) # If TypeError occurs, treat the element as a single item

#creating a DF with our cleaned data
restaurants_with_ratings = pd.DataFrame(NamesList, columns=['Restaurants'])
restaurants_with_ratings['Ratings'] = flattened_ratings

#removing duplicates (31 duplicates)
restaurants_with_ratingscleaned = restaurants_with_ratings.drop_duplicates()

#top 10 restaurants
restaurants_with_ratingscleaned.sort_values('Ratings', ascending=False).head(10)


Unnamed: 0,Restaurants,Ratings
58,Frite Alors!,5.0
244,Le Mezzé Taverna,5.0
42,Chez Carlos Café,5.0
43,Sous-Marins le Marinier,5.0
328,Le Trèfle Limoilou,5.0
326,Le Ket'Chose,5.0
48,Nguyen,5.0
49,Izgara,5.0
317,La Taqueria,5.0
51,Poulet Rouge Limoilou,5.0
