In [2]:
# Importing packages

import requests
import json
import time #will use this to pause execution for a few seconds

import numpy as np
import pandas as pd

In [12]:
# Extracting Google Key
from configparser import ConfigParser
config = ConfigParser()
config.read('config.ini')
# extract key from a separate ini file where the key is stored
API_KEY = config['google']['api_key']

In [13]:
endpoint_url_start = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"
# new api request string for new type of API requests
endpoint_url_start_2 = "https://maps.googleapis.com/maps/api/place/details/json?place_id="

In [14]:
# common places will be searched in many small radius circles
places_common = ['bakery', 'cafe', 'restaurant', 'meal_delivery', 'meal_takeaway', 'bar', 'convenience store',\
                'gas_station', 'supermarket', 'department_store', "tourist_attraction", "lodging"]
# uncommon places will be searched using Galveston as a whole
places_uncommon = ['airport', 'amusement_park', 'aquarium', 'casino', 'hospital',\
                'stadium', 'university', 'primary_school',\
                'school', 'secondary_school', 'zoo', 'movie_theater', 'shopping mall', "bowling_alley"]
# Creating dataframes to be filled
# dataframe for common types of places
aa_common = pd.DataFrame(columns=['Name','place_id', 'Address','Coordinates', 'business_status',\
                                    'types', 'delivery',\
                                    'dine_in', 'takeout', 'price_level', 'rating', 'user_ratings_total',\
                                    'serves_breakfast', 'serves_brunch','serves_lunch', 'serves_dinner',\
                                    'serves_vegetarian_food', 'serves_beer', 'serves_wine'])
# dataframe for uncommon types of places
aa_uncommon = pd.DataFrame(columns=['Name','place_id', 'Address','Coordinates', 'business_status',\
                                    'types', 'delivery',\
                                    'dine_in', 'takeout', 'price_level', 'rating', 'user_ratings_total',\
                                    'serves_breakfast', 'serves_brunch','serves_lunch', 'serves_dinner',\
                                    'serves_vegetarian_food', 'serves_beer', 'serves_wine'])
# Basic parameters for Ann Arbor as a whole
central_cord = "42.27583,-83.72916"
central_radius = "6026"
# Basic parameters for Ann Arbor as a whole
# centers of small radius circles I drew (see map screenshot for visualization of the small circles)
# small circles are from a website called "Calc Maps"
central_cord_small = [
    '42.29183,-83.78533', "42.29262,-83.76311", "42.27619,-83.79102", "42.28483,-83.77217", "42.27743,-83.77544"
    "42.26433,-83.79250", "42.26813,-83.77474", "42.28393,-83.76212", "42.27742,-83.76180", "42.29528,-83.74858",
    "42.28693,-83.75346", "42.28225,-83.75356", "42.28668,-83.74568", "42.28236,-83.74665", "42.28594,-83.73908",
    "42.28216,-83.73989", "42.27809,-83.74424", "42.27719,-83.75150", "42.26933,-83.76450", "42.27131,-83.75431",
    "42.26213,-83.76767", "42.29101,-83.73816", "42.28549,-83.73173", "42.29786,-83.73619", "42.27261,-83.74438",
    "42.26602,-83.75018", "42.26397,-83.75857", "42.25996,-83.77666", "42.27868,-83.73334", "42.27404,-83.73620",
    "42.25554,-83.76850", "42.26404,-83.73859", "42.25740,-83.74855", "42.25350,-83.75897", "42.30520,-83.77322",
    "42.29295,-83.72629", "42.30828,-83.73641", "42.30209,-83.72027", "42.24375,-83.74880", "42.25585,-83.73173",
    "42.31724,-83.71867", "42.24357,-83.72240", "42.31596,-83.69756", "42.30322,-83.70329", "42.29770,-83.69019",
    "42.28771,-83.71124", "42.27862,-83.72350", "42.26568,-83.72530", "42.27217,-83.70706", "42.26409,-83.71338",
    "42.25407,-83.71374", "42.23999,-83.70190", "42.31097,-83.67866", "42.28064,-83.69264", "42.26554,-83.69190",
    "42.25288,-83.69343", "42.28672,-83.68026", "42.27306,-83.67951", "42.23533,-83.68890"
]
# radius of each small circles I drew (see map screenshot for visualization of the small circles)
central_radius_small = [
    "1115", "747.45", "986.50", "533.43", "741.28", "1097", "619.23", "421.37", "629.36", "779.98", "383.65",
    "384.39", "347.37", "365.95", "238.89", "394.38", "220.20", "428.04", "562.29", "355.07", "420.07",
    "439.57", "413.89", "361.46", "528.77", "461.63", "570.84", "533.13", "455.62", "734.74", "550.11",
    "622.89", "685.18", "803.62", "1077", "656.64", "1053", "1084", "1397", "738.70", "1225", "1202",
    "1154", "800.99", "1174", "1084", "939.34", "588.51", "719.39", "950.97", "1178", "743.55", "884.65",
    "1011", "923.34", "1227", "462.10", "1185", "1150"
]


In [15]:
# function to append result to the dataframe
# Takes care of null returned from API request
# see API nearby search documentation for inside "" content
def append_place(json_results, df):
    l = len(df)
    for i, val in enumerate(json_results['results']):
        df.loc[l + i] = [val.get('name', np.nan), 
                        val.get('place_id', np.nan),
                        val.get('vicinity', np.nan), 
                        val.get('geometry', np.nan).get('location', np.nan),
                        val.get('business_status', np.nan),
                        val.get('types', np.nan),
                        val.get('delivery', np.nan),
                        val.get('dine_in', np.nan),
                        val.get('takeout', np.nan),
                        val.get('price_level', np.nan),
                        val.get('rating', np.nan),
                        val.get('user_ratings_total', np.nan),
                        val.get('serves_breakfast', np.nan),
                        val.get('serves_brunch', np.nan),
                        val.get('serves_lunch',np.nan),
                        val.get('serves_dinner', np.nan),
                        val.get('serves_vegetarian_food', np.nan),
                        val.get('serves_beer', np.nan),
                        val.get('serves_wine', np.nan)]


In [16]:
def small_circle_scrape():
    # for each small circle
    # search for all detailed info we need
    # then add the results to the big dataframe
    for i in range(0, len(central_cord_small)):
        for _type in places_common:
            endpoint_url = endpoint_url_start + "&location=" + central_cord_small[i] + "&radius=" + \
            central_radius_small[i] + "&region=us&type=" + _type + "&key=" + API_KEY

            params = {}

            res = requests.get(endpoint_url, params = params)
            results =  json.loads(res.content)
            # use the function we wrote before to append place to the galv_common df
            append_place(results, aa_common)

            # set a sleep time so that we won't burn the API tool
            time.sleep(1)

            # Pulling results from other pages
            while "next_page_token" in results:
                params['pagetoken'] = results['next_page_token']
                res = requests.get(endpoint_url, params = params)
                results =  json.loads(res.content)
                append_place(results, aa_common)
                time.sleep(1)
    return aa_common

aa_common = small_circle_scrape()
# drop duplicates
aa_common.drop_duplicates(subset=['place_id'], inplace=True)
# recode the category for each place
aa_common['category'] = aa_common['types'].apply(lambda x: x[0])

In [17]:
def large_circle_scrape():
    # code doing the actual scraping of API
    aa_uncommon_full = pd.DataFrame(columns=['Name','place_id', 'Address','Coordinates', 'business_status',\
                                        'types', 'delivery',\
                                        'dine_in', 'takeout', 'price_level', 'rating', 'user_ratings_total',\
                                        'serves_breakfast', 'serves_brunch','serves_lunch', 'serves_dinner',\
                                        'serves_vegetarian_food', 'serves_beer', 'serves_wine'])
    for i in range(5):
        for _type in places_uncommon:
            endpoint_url = endpoint_url_start + "&location=" + central_cord + "&radius=" + \
            central_radius + "&region=us&type=" + _type + "&key=" + API_KEY
            
            params = {}
            
            res = requests.get(endpoint_url, params = params)
            results =  json.loads(res.content)
            append_place(results, aa_uncommon)
            
            time.sleep(1)
            
            # Pulling results from other pages since Googel only display 20 results per page
            while "next_page_token" in results:
                params['pagetoken'] = results['next_page_token']
                res = requests.get(endpoint_url, params = params)
                results =  json.loads(res.content)
                append_place(results, aa_uncommon)
                time.sleep(1)
        aa_uncommon_full = pd.concat([aa_uncommon_full, aa_uncommon])
    return aa_uncommon_full

aa_uncommon_full = large_circle_scrape()
# weirdly google API does not return the same number of results for each type of place
# so we run the code for 5 times to get a more complete list
# then we drop the duplicates
aa_uncommon_full.drop_duplicates(subset=['place_id'], inplace=True)
# recode the category of each place
aa_uncommon_full['category'] = aa_uncommon_full['types'].apply(lambda x: x[0])

In [19]:
#remove_rows_from_other_cities:
# since the radius also includes cities outside of Ann Arbor, we need to filter for Ann Arbor locations only
# for the dataframe of uncommon types of locations
# first reset the index
aa_uncommon_full.reset_index(drop=True, inplace=True)
for i in range(0, len(aa_uncommon_full)):
    string = aa_uncommon_full['Address'][i]
    if "Ann Arbor" not in string:
        aa_uncommon_full.drop(labels=i, axis=0, inplace=True)

# since the radius also includes cities outside of Ann Arbor, we need to filter for Ann Arbor locations only
# for the dataframe of common types of locations
# first reset the index
aa_common.reset_index(drop=True, inplace=True)
for i in range(0, len(aa_common)):
    string = aa_common['Address'][i]
    if "Ann Arbor" not in string:
        aa_common.drop(labels=i, axis=0, inplace=True)


In [20]:
# Creating a list of unique place ids that we can use in Google Places Details API search to get business hours
aa_common_ids = np.unique(aa_common['place_id'])
aa_uncommon_ids = np.unique(aa_uncommon_full['place_id'])
# creating dataset to fill with business hours
aa_common_hours = pd.DataFrame(columns=['Name','place_id','business_hours', 'business_hours_text'])
aa_uncommon_hours = pd.DataFrame(columns=['Name','place_id','business_hours','business_hours_text'])

In [21]:
def append_place_2(json_results, df):
    l = len(df)
    val = json_results['result']
    if len(val) == 2:
        df.loc[l + 1] = [val.get('name', np.nan), 
                         val.get('place_id', np.nan),
                         np.nan,
                         np.nan]
    else:
        df.loc[l + 1] = [val.get('name', np.nan), 
                         val.get('place_id', np.nan),
                         val.get('opening_hours', np.nan).get('periods', np.nan),
                         val.get('opening_hours', np.nan).get('weekday_text', np.nan)]

In [22]:
def small_circle_detail_scrape():
    # creating a business hours dataset for the common places
    for place_id in aa_common_ids:
        endpoint_url = endpoint_url_start_2 + place_id + \
        "&fields=name%2Cplace_id%2Copening_hours&key=" + API_KEY
        
        params = {}
        
        res = requests.get(endpoint_url, params = params)
        results =  json.loads(res.content)
        append_place_2(results, aa_common_hours)
        
        time.sleep(1)
    return aa_common_hours

aa_common_hours = small_circle_detail_scrape()

In [23]:
def large_circle_detail_scrape():
    # creating a business hours dataset for the uncommon places
    for place_id in aa_uncommon_ids:
        endpoint_url = endpoint_url_start_2 + place_id + \
        "&fields=name%2Cplace_id%2Copening_hours&key=" + API_KEY
        
        params = {}
        
        res = requests.get(endpoint_url, params = params)
        results =  json.loads(res.content)
        append_place_2(results, aa_uncommon_hours)
        
        time.sleep(1)
    return aa_uncommon_hours

aa_uncommon_hours = large_circle_detail_scrape()

In [24]:
# merge distribution search and detial search
def merge_dist_detail(aa_common, aa_uncommon_full, aa_common_hours, aa_uncommon_hours):
    # merging previous datasets with business hours datasets
    aa_common = aa_common.merge(aa_common_hours, how='left', left_on=['Name', 'place_id'], right_on=['Name', 'place_id'])
    aa_uncommon_full = aa_uncommon_full.merge(aa_uncommon_hours, how='left', left_on=['Name', 'place_id'], right_on=['Name', 'place_id'])
    return aa_common, aa_uncommon_full

aa_common, aa_uncommon = merge_dist_detail(aa_common, aa_uncommon_full, aa_common_hours, aa_uncommon_hours)

In [25]:
# The initial string for the api
endpoint_url_start = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"

# Specifying the types of places in a list
# common places will be searched in many small radius circles
clct_common = ['bus_station', 'light_rail_station', 'parking', \
              'taxi_stand', 'train_station', 'transit_station']
# uncommon places will be searched using Galveston as a whole
clct_uncommon = ['art_gallery', 'campground', 'car_rental', 'museum', 'night_club',\
                'spa', 'travel_agency']

# Creating dataframes to be filled
# dataframe for common types of places
aa_clct_common = pd.DataFrame(columns=['Name','place_id', 'Address','Coordinates', 'business_status','types'])
# dataframe for uncommon types of places
aa_clct_uncommon = pd.DataFrame(columns=['Name','place_id', 'Address','Coordinates', 'business_status','types'])


In [26]:
def append_clct_place(json_results, df):
    l = len(df)
    for i, val in enumerate(json_results['results']):
        df.loc[l + i] = [val.get('name', np.nan), 
                         val.get('place_id', np.nan),
                         val.get('vicinity', np.nan), 
                         val.get('geometry', np.nan).get('location', np.nan),
                         val.get('business_status', np.nan),
                         val.get('types', np.nan)]

In [27]:
def residential_large_circle_scrape():
    # code doing the actual scraping of API
    aa_clct_uncommon_full = pd.DataFrame(columns=['Name','place_id', 'Address','Coordinates', 'business_status','types'])
    # since there is randomization in the API, we will run the code 5 times to get a more complete dataset
    for i in range(5):
        for _type in clct_uncommon:
            endpoint_url = endpoint_url_start + "&location=" + central_cord + "&radius=" + \
            central_radius + "&region=us&type=" + _type + "&key=" + API_KEY
            
            params = {}
            
            res = requests.get(endpoint_url, params = params)
            results =  json.loads(res.content)
            append_clct_place(results, aa_clct_uncommon)
            
            
            time.sleep(1)
            
            # Pulling results from other pages since Googel only display 20 results per page
            while "next_page_token" in results:
                params['pagetoken'] = results['next_page_token']
                res = requests.get(endpoint_url, params = params)
                results =  json.loads(res.content)
                append_clct_place(results, aa_clct_uncommon)
                time.sleep(1)
        aa_clct_uncommon_full = pd.concat([aa_clct_uncommon_full, aa_clct_uncommon])
    return aa_clct_uncommon_full

aa_clct_uncommon_full = residential_large_circle_scrape()
# drop duplicates
aa_clct_uncommon_full.drop_duplicates(subset=['place_id'], inplace=True)
aa_clct_uncommon_full.reset_index(drop=True, inplace=True)
# recode category for each place
aa_clct_uncommon_full['category'] = aa_clct_uncommon_full['types'].apply(lambda x: x[0])

In [28]:
def residential_small_circle_scrape():
    # for each small circle
    # search for all detailed info we need
    # then add the results to the big dataframe
    for i in range(0, len(central_cord_small)):
        for _type in clct_common:
            endpoint_url = endpoint_url_start + "&location=" + central_cord_small[i] + "&radius=" + \
            central_radius_small[i] + "&region=us&type=" + _type + "&key=" + API_KEY

            params = {}

            res = requests.get(endpoint_url, params = params)
            results =  json.loads(res.content)
            # use the function we wrote before to append place to the galv_common df
            append_clct_place(results, aa_clct_common)


            time.sleep(1)

            # Pulling results from other pages
            while "next_page_token" in results:
                params['pagetoken'] = results['next_page_token']
                res = requests.get(endpoint_url, params = params)
                results =  json.loads(res.content)
                append_clct_place(results, aa_clct_common)
                time.sleep(1)
    return aa_clct_common

aa_clct_common = residential_small_circle_scrape()
aa_clct_common.drop_duplicates(subset=['place_id'], inplace=True)
aa_clct_common.reset_index(drop=True, inplace=True)
# recode category for each place
aa_clct_common['category'] = aa_clct_common['types'].apply(lambda x: x[0])

In [29]:
# since the radius also includes cities outside Ann Arbor, we need to filter for Ann Arbor locations only
# for the dataframe of uncommon types of locations
for i in range(0, len(aa_clct_uncommon_full)):
    string = aa_clct_uncommon_full['Address'][i]
    if "Ann Arbor" not in string:
        aa_clct_uncommon_full.drop(labels=i, axis=0, inplace=True)

# for the dataframe of common types of locations
for i in range(0, len(aa_clct_common)):
    string = aa_clct_common['Address'][i]
    if "Ann Arbor" not in string:
        aa_clct_common.drop(labels=i, axis=0, inplace=True)

In [30]:
# Below is keyword search for residential areas
# residential places types
clct_residential = ['apartment', 'condominium', 'townhouse']
# dataframe for residential places
aa_resid = pd.DataFrame(columns=['Name','place_id', 'Address','Coordinates', 'business_status','types'])

In [33]:
def residential_scrape():
    # code doing the actual scraping of API
    aa_resid_full = pd.DataFrame(columns=['Name','place_id', 'Address','Coordinates', 'business_status','types'])
    for i in range(5):
        for _keyword in clct_residential:
            endpoint_url = endpoint_url_start + "&location=" + central_cord + "&radius=" + \
            central_radius + '&keyword=' + _keyword + "&key=" + API_KEY
            
            params = {}
            
            res = requests.get(endpoint_url, params = params)
            results =  json.loads(res.content)
            append_clct_place(results, aa_resid)
            
            
            time.sleep(2)
            
            # Pulling results from other pages since Googel only display 20 results per page
            while "next_page_token" in results:
                params['pagetoken'] = results['next_page_token']
                res = requests.get(endpoint_url, params = params)
                results =  json.loads(res.content)
                append_clct_place(results, aa_resid)
                time.sleep(2)
        aa_resid_full = pd.concat([aa_resid_full, aa_resid])
    return aa_resid_full

aa_resid_full = residential_scrape()
aa_resid_full.drop_duplicates(subset=['place_id'], inplace=True)
aa_resid_full.reset_index(drop=True, inplace=True)
aa_resid_full['category'] = aa_resid_full['types'].apply(lambda x: x[0])

In [34]:
# since the radius also includes cities outside of Ann Arbor, we need to filter for Ann Arbor locations only
# for the dataframe of uncommon types of locations
for i in range(0, len(aa_resid_full)):
    string = aa_resid_full['Address'][i]
    if "Ann Arbor" not in string:
        aa_resid_full.drop(labels=i, axis=0, inplace=True)

In [35]:
aa_common.to_csv('aa_common.csv')
aa_uncommon.to_csv('aa_uncommon.csv')
aa_clct_common.to_csv('aa_clct_common.csv')
aa_clct_uncommon.to_csv('aa_clct_uncommon.csv')
aa_resid.to_csv('aa_resid.csv')

In [109]:
# Extracting Yelp Key
from configparser import ConfigParser
config = ConfigParser()
config.read('config.ini')

API_KEY = "Bearer "  + config['yelp']['api_key']

In [110]:
# The url accessing to the Yelp Business Details API
endpoint_url_start = "https://api.yelp.com/v3/businesses/search?"

# Initial the dataframe to store business details info
aa_locations = pd.DataFrame(columns=['Name','Address','Coordinates',"id"])

In [111]:
# Get the number of businesses in Ann Arbor
res = requests.get(url=endpoint_url_start,
                        params = {"location":"Ann Arbor, MI"},
                        headers={"Authorization":API_KEY}
                        )
response = json.loads(res.content)
total_aa = response["total"]

In [112]:
print("There are " + str(total_aa) + " businesses in Ann Arbor")

There are 1100 businesses in Ann Arbor


In [113]:
print(response)

{'businesses': [{'id': 'uTMqhmpgfpDMLN3W3YvMeQ', 'alias': 'frita-batidos-ann-arbor', 'name': 'Frita Batidos', 'image_url': 'https://s3-media3.fl.yelpcdn.com/bphoto/ZbbAL52xtE-3WvhGr74Irw/o.jpg', 'is_closed': False, 'url': 'https://www.yelp.com/biz/frita-batidos-ann-arbor?adjust_creative=kl8jOau67NZclbVQDYPhNQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=kl8jOau67NZclbVQDYPhNQ', 'review_count': 2145, 'categories': [{'alias': 'cuban', 'title': 'Cuban'}, {'alias': 'burgers', 'title': 'Burgers'}], 'rating': 4.5, 'coordinates': {'latitude': 42.2803651, 'longitude': -83.7491532}, 'transactions': ['delivery', 'pickup'], 'price': '$$', 'location': {'address1': '117 W Washington St', 'address2': '', 'address3': '', 'city': 'Ann Arbor', 'zip_code': '48104', 'country': 'US', 'state': 'MI', 'display_address': ['117 W Washington St', 'Ann Arbor, MI 48104']}, 'phone': '+17347612882', 'display_phone': '(734) 761-2882', 'distance': 1243.5784044949269}, {'id': 'yNIYH9041m1JEyRS

In [114]:
# function to append the business detailed info to the dataframe
# Takes care of null returned from API request
def append_place(response, df):
    l = len(df)
    for i, val in enumerate(response["businesses"]):
        df.loc[l + i] = [val.get('name', np.nan), 
                         val.get('location', np.nan).get("display_address"), 
                         val.get('coordinates', np.nan),
                         val.get("id",np.nan)]

In [115]:
# Use the function I defined above, I extract the info from the Yelp API and store them into the dataframe I made.
# total_aa is the total number of businesses.
location = {"Ann Arbor, MI": [aa_locations, total_aa]}
# Go through Galveston and Hilo
for i,v in location.items():
    # Here, I got 20 business info each time until I got all business in a city. 
    # The reason is that one page of Yelp API contains 20 businesses so I cannot get more than 20 business details each time. 
    for t in range(0, 1000, 20):
        res = requests.get(url=endpoint_url_start,
                        params = {"location":i, "offset":t},
                        headers={"Authorization":API_KEY}
                        )
        response = json.loads(res.content)
        append_place(response, v[0])

In [86]:
# in case i mess up with something later, save a copy of the dataframe
aa_locations_copy = aa_locations.copy()
aa_locations_copy

Unnamed: 0,Name,Address,Coordinates,id
0,Frita Batidos,"[117 W Washington St, Ann Arbor, MI 48104]","{'latitude': 42.2803651, 'longitude': -83.7491...",uTMqhmpgfpDMLN3W3YvMeQ
1,Sava's,"[216 S State St, Ann Arbor, MI 48104]","{'latitude': 42.279605, 'longitude': -83.7409649}",Fv2VLzVj9ATLcTbFehTDjg
2,Aventura,"[216 E Washington St, Ann Arbor, MI 48104]","{'latitude': 42.280283, 'longitude': -83.746494}",yNIYH9041m1JEyRS-N_LNw
3,Blue LLama Jazz Club,"[314 S Main St, Ann Arbor, MI 48104]","{'latitude': 42.279019, 'longitude': -83.749017}",tZmiwMg9Hdc8PcsKSuWQOg
4,Mani Osteria & Bar,"[341 E Liberty St, Ann Arbor, MI 48104]","{'latitude': 42.2795793749763, 'longitude': -8...",4REtzXpQYy8dVev8RjWbSQ
...,...,...,...,...
2815,Arby's,"[195 Baker Rd, Dexter, MI 48130]","{'latitude': 42.29627548731949, 'longitude': -...",kJmVHN-GMwVrteextgp_eA
2816,Subway,"[1010 E Michigan Ave, Saline, MI 48176]","{'latitude': 42.17350939912832, 'longitude': -...",XX8t6iNKDZq9eWHxlnVakQ
2817,St. Joe's Market,"[5301 McAuley Dr, Ypsilanti Township, MI 48197]","{'latitude': 42.2669029, 'longitude': -83.651123}",Q5ul2KsOCms343REVhMa9w
2818,Marco's Pizza,"[6065 Rawsonville Rd, Belleville, MI 48111]","{'latitude': 42.20276653223271, 'longitude': -...",7ooBlkHpLEDqlidn49QN5A


In [116]:
# Originally, the address is not in string form. I use the following code to turn the address to string form
full_address_aa = []
for i in aa_locations["Address"]:
    full_address_aa.append(" ".join([j for j in i]))
aa_locations["Address"] = full_address_aa

#Extract Ann Arbor's latitude and longitude from the coordinate column as columns
aa_latitude = []
aa_longitude = []
for i in aa_locations["Coordinates"]:
    aa_latitude.append(i["latitude"])
    aa_longitude.append(i["longitude"])
aa_locations["latitude"] = aa_latitude
aa_locations["longitude"] = aa_longitude


In [89]:
""" # Use a list to store all json files of Galveston getting from Business details API
# for debug purpose, not using the data here
aa_json_list = []
endpoint_url = "https://api.yelp.com/v3/businesses/"
for i in aa_locations["id"]:
    endpoint_url_detail = endpoint_url + i
    res = requests.get(url=endpoint_url_detail,
                        headers={"Authorization":API_KEY}
                        )
    response = json.loads(res.content)
    aa_json_list.append(response)  """

In [117]:
# Use a Pandas Dataframe to store all json files of Ann Arbor getting from Business details API
from pandas import json_normalize
aa_df_list = []
endpoint_url = "https://api.yelp.com/v3/businesses/"
# Use ID as key to extract data
for i in aa_locations["id"]:
    endpoint_url_detail = endpoint_url + i
    res = requests.get(url=endpoint_url_detail,
                        headers={"Authorization":API_KEY}
                        )
    df = json_normalize(res.json())
    aa_df_list.append(df)
aa_details = pd.concat(aa_df_list)

In [119]:
# in case i mess up with something later, save a copy of the dataframe
aa_details_copy = aa_details.copy()
aa_details_copy.head()

Unnamed: 0,id,alias,name,image_url,is_claimed,is_closed,url,phone,display_phone,review_count,...,location.zip_code,location.country,location.state,location.display_address,location.cross_streets,coordinates.latitude,coordinates.longitude,special_hours,messaging.url,messaging.use_case_text
0,uTMqhmpgfpDMLN3W3YvMeQ,frita-batidos-ann-arbor,Frita Batidos,https://s3-media3.fl.yelpcdn.com/bphoto/ZbbAL5...,True,False,https://www.yelp.com/biz/frita-batidos-ann-arb...,17347612882,(734) 761-2882,2145,...,48104,US,MI,"[117 W Washington St, Ann Arbor, MI 48104]",,42.280365,-83.749153,,,
0,yNIYH9041m1JEyRS-N_LNw,aventura-ann-arbor,Aventura,https://s3-media3.fl.yelpcdn.com/bphoto/YeASIZ...,True,False,https://www.yelp.com/biz/aventura-ann-arbor?ad...,17343693153,(734) 369-3153,857,...,48104,US,MI,"[216 E Washington St, Ann Arbor, MI 48104]",,42.280283,-83.746494,,,
0,Fv2VLzVj9ATLcTbFehTDjg,savas-ann-arbor,Sava's,https://s3-media1.fl.yelpcdn.com/bphoto/FZHW89...,True,False,https://www.yelp.com/biz/savas-ann-arbor?adjus...,17346232233,(734) 623-2233,1281,...,48104,US,MI,"[216 S State St, Ann Arbor, MI 48104]",,42.279605,-83.740965,,,
0,tZmiwMg9Hdc8PcsKSuWQOg,blue-llama-jazz-club-ann-arbor,Blue LLama Jazz Club,https://s3-media2.fl.yelpcdn.com/bphoto/vcn9Z9...,True,False,https://www.yelp.com/biz/blue-llama-jazz-club-...,17343723200,(734) 372-3200,103,...,48104,US,MI,"[314 S Main St, Ann Arbor, MI 48104]",,42.279019,-83.749017,,,
0,4REtzXpQYy8dVev8RjWbSQ,mani-osteria-and-bar-ann-arbor,Mani Osteria & Bar,https://s3-media3.fl.yelpcdn.com/bphoto/LJscB4...,True,False,https://www.yelp.com/biz/mani-osteria-and-bar-...,17347696700,(734) 769-6700,956,...,48104,US,MI,"[341 E Liberty St, Ann Arbor, MI 48104]",,42.279579,-83.744245,,,


In [120]:
# Define a function to help as detect NaN value
def isNaN(num):
    return num != num

# Get all rows that did not get accurate information due to "too many requests" error
# I need to replace these rows with error messages to rows with accurate business details information
aa_nan = []
for i in range(len(list(aa_details["name"]))):
    if isNaN(list(aa_details["name"])[i]):
        aa_nan.append(i)

In [122]:
# in case I mess up with something later, save a copy of the list
aa_nan_copy = aa_nan.copy()

In [123]:
aa_nan

[]

In [103]:
""" # Get the accurate information of the rows with problems that I located above
# please note that you will not neccessarily get empty rows if you run the code again
# also the index of empty rows will be different each time
# so please check aa_nan first before running anything
endpoint_url = "https://api.yelp.com/v3/businesses/"
aa_nan_detail_holder = pd.DataFrame()
for num in aa_nan:
    id_num = aa_locations.iloc[num]["id"]
    endpoint_url_detail = endpoint_url + id_num
    res = requests.get(url=endpoint_url_detail,
                            headers={"Authorization":API_KEY}
                            )
    aa_nan_detail = json_normalize(res.json())
    aa_nan_detail_holder = pd.concat([aa_nan_detail_holder, aa_nan_detail])

    # Reset index and drop the blank line
    # aa_details = aa_details.reset_index()
    aa_details = aa_details.drop(num)

# Merge the row with the accurate info I missed before to the original dataset
aa_details = aa_details.iloc[:,1:]
aa_details_final = pd.concat([aa_details, aa_nan_detail_holder])
aa_details_final = aa_details_final.reset_index()

# Clean unused column
aa_details_final = aa_details_final.iloc[:,1:] """

KeyError: '[2179] not found in axis'

In [127]:
# display max columns
pd.set_option('display.max_columns', None)
aa_details

Unnamed: 0,id,alias,name,image_url,is_claimed,is_closed,url,phone,display_phone,review_count,categories,rating,photos,price,hours,transactions,location.address1,location.address2,location.address3,location.city,location.zip_code,location.country,location.state,location.display_address,location.cross_streets,coordinates.latitude,coordinates.longitude,special_hours,messaging.url,messaging.use_case_text
0,uTMqhmpgfpDMLN3W3YvMeQ,frita-batidos-ann-arbor,Frita Batidos,https://s3-media3.fl.yelpcdn.com/bphoto/ZbbAL5...,True,False,https://www.yelp.com/biz/frita-batidos-ann-arb...,+17347612882,(734) 761-2882,2145,"[{'alias': 'cuban', 'title': 'Cuban'}, {'alias...",4.5,[https://s3-media3.fl.yelpcdn.com/bphoto/ZbbAL...,$$,"[{'open': [{'is_overnight': False, 'start': '1...","[delivery, pickup]",117 W Washington St,,,Ann Arbor,48104,US,MI,"[117 W Washington St, Ann Arbor, MI 48104]",,42.280365,-83.749153,,,
0,yNIYH9041m1JEyRS-N_LNw,aventura-ann-arbor,Aventura,https://s3-media3.fl.yelpcdn.com/bphoto/YeASIZ...,True,False,https://www.yelp.com/biz/aventura-ann-arbor?ad...,+17343693153,(734) 369-3153,857,"[{'alias': 'tapas', 'title': 'Tapas Bars'}, {'...",4.0,[https://s3-media3.fl.yelpcdn.com/bphoto/YeASI...,$$$,"[{'open': [{'is_overnight': False, 'start': '1...",[delivery],216 E Washington St,,,Ann Arbor,48104,US,MI,"[216 E Washington St, Ann Arbor, MI 48104]",,42.280283,-83.746494,,,
0,Fv2VLzVj9ATLcTbFehTDjg,savas-ann-arbor,Sava's,https://s3-media1.fl.yelpcdn.com/bphoto/FZHW89...,True,False,https://www.yelp.com/biz/savas-ann-arbor?adjus...,+17346232233,(734) 623-2233,1281,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.0,[https://s3-media1.fl.yelpcdn.com/bphoto/FZHW8...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",[delivery],216 S State St,,,Ann Arbor,48104,US,MI,"[216 S State St, Ann Arbor, MI 48104]",,42.279605,-83.740965,,,
0,tZmiwMg9Hdc8PcsKSuWQOg,blue-llama-jazz-club-ann-arbor,Blue LLama Jazz Club,https://s3-media2.fl.yelpcdn.com/bphoto/vcn9Z9...,True,False,https://www.yelp.com/biz/blue-llama-jazz-club-...,+17343723200,(734) 372-3200,103,"[{'alias': 'jazzandblues', 'title': 'Jazz & Bl...",4.5,[https://s3-media2.fl.yelpcdn.com/bphoto/vcn9Z...,$$$,"[{'open': [{'is_overnight': False, 'start': '1...","[delivery, pickup]",314 S Main St,,,Ann Arbor,48104,US,MI,"[314 S Main St, Ann Arbor, MI 48104]",,42.279019,-83.749017,,,
0,4REtzXpQYy8dVev8RjWbSQ,mani-osteria-and-bar-ann-arbor,Mani Osteria & Bar,https://s3-media3.fl.yelpcdn.com/bphoto/LJscB4...,True,False,https://www.yelp.com/biz/mani-osteria-and-bar-...,+17347696700,(734) 769-6700,956,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,[https://s3-media3.fl.yelpcdn.com/bphoto/LJscB...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",[delivery],341 E Liberty St,,,Ann Arbor,48104,US,MI,"[341 E Liberty St, Ann Arbor, MI 48104]",,42.279579,-83.744245,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,4ZtSjxdv7lXd_RTCkAbawg,polo-fields-golf-and-country-clubs-ypsilanti,Polo Fields Golf & Country Clubs,https://s3-media3.fl.yelpcdn.com/bphoto/dzsLMt...,True,False,https://www.yelp.com/biz/polo-fields-golf-and-...,+17349981555,(734) 998-1555,2,"[{'alias': 'golf', 'title': 'Golf'}, {'alias':...",4.0,[https://s3-media3.fl.yelpcdn.com/bphoto/dzsLM...,,"[{'open': [{'is_overnight': False, 'start': '0...",[],2955 Packard Rd,,,Ypsilanti,48197,US,MI,"[2955 Packard Rd, Ypsilanti, MI 48197]",,42.242075,-83.655158,,,
0,O7pfu3-HWLHuI8MNZFGOzg,glencoe-lakes-nature-trail-ann-arbor,Glencoe Lakes Nature Trail,https://s3-media1.fl.yelpcdn.com/bphoto/l0BQu2...,False,False,https://www.yelp.com/biz/glencoe-lakes-nature-...,,,1,"[{'alias': 'parks', 'title': 'Parks'}]",3.0,[https://s3-media1.fl.yelpcdn.com/bphoto/l0BQu...,,,[],4741 Washtenaw Ave,,,Ann Arbor,48108,US,MI,"[4741 Washtenaw Ave, Ann Arbor, MI 48108]",,42.251346,-83.667178,,,
0,4WD8MHDHpl_JFzbmduodyQ,little-mac-shack-ann-arbor-2,Little Mac Shack,https://s3-media2.fl.yelpcdn.com/bphoto/QKtLUM...,False,False,https://www.yelp.com/biz/little-mac-shack-ann-...,+17349975399,(734) 997-5399,1,"[{'alias': 'fooddeliveryservices', 'title': 'F...",1.0,[https://s3-media2.fl.yelpcdn.com/bphoto/QKtLU...,,,[delivery],,,,Ann Arbor,48104,US,MI,"[Ann Arbor, MI 48104]",,42.267006,-83.727127,,,
0,PU93OtU8LnfW-lB-oBjSDg,speedway-whitmore-lake,Speedway,https://s3-media4.fl.yelpcdn.com/bphoto/slYlJ7...,True,False,https://www.yelp.com/biz/speedway-whitmore-lak...,+17344498990,(734) 449-8990,2,"[{'alias': 'servicestations', 'title': 'Gas St...",3.5,[https://s3-media3.fl.yelpcdn.com/bphoto/7mP8j...,,"[{'open': [{'is_overnight': True, 'start': '00...",[delivery],300 Six Mile Road,,,Whitmore Lake,48115,US,MI,"[300 Six Mile Road, Whitmore Lake, MI 48115]",,42.393310,-83.761600,,,


In [128]:
## Clean unused column
aa_details_final = aa_details.iloc[:,:-3]

In [129]:
aa_details_final

Unnamed: 0,id,alias,name,image_url,is_claimed,is_closed,url,phone,display_phone,review_count,categories,rating,photos,price,hours,transactions,location.address1,location.address2,location.address3,location.city,location.zip_code,location.country,location.state,location.display_address,location.cross_streets,coordinates.latitude,coordinates.longitude
0,uTMqhmpgfpDMLN3W3YvMeQ,frita-batidos-ann-arbor,Frita Batidos,https://s3-media3.fl.yelpcdn.com/bphoto/ZbbAL5...,True,False,https://www.yelp.com/biz/frita-batidos-ann-arb...,+17347612882,(734) 761-2882,2145,"[{'alias': 'cuban', 'title': 'Cuban'}, {'alias...",4.5,[https://s3-media3.fl.yelpcdn.com/bphoto/ZbbAL...,$$,"[{'open': [{'is_overnight': False, 'start': '1...","[delivery, pickup]",117 W Washington St,,,Ann Arbor,48104,US,MI,"[117 W Washington St, Ann Arbor, MI 48104]",,42.280365,-83.749153
0,yNIYH9041m1JEyRS-N_LNw,aventura-ann-arbor,Aventura,https://s3-media3.fl.yelpcdn.com/bphoto/YeASIZ...,True,False,https://www.yelp.com/biz/aventura-ann-arbor?ad...,+17343693153,(734) 369-3153,857,"[{'alias': 'tapas', 'title': 'Tapas Bars'}, {'...",4.0,[https://s3-media3.fl.yelpcdn.com/bphoto/YeASI...,$$$,"[{'open': [{'is_overnight': False, 'start': '1...",[delivery],216 E Washington St,,,Ann Arbor,48104,US,MI,"[216 E Washington St, Ann Arbor, MI 48104]",,42.280283,-83.746494
0,Fv2VLzVj9ATLcTbFehTDjg,savas-ann-arbor,Sava's,https://s3-media1.fl.yelpcdn.com/bphoto/FZHW89...,True,False,https://www.yelp.com/biz/savas-ann-arbor?adjus...,+17346232233,(734) 623-2233,1281,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.0,[https://s3-media1.fl.yelpcdn.com/bphoto/FZHW8...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",[delivery],216 S State St,,,Ann Arbor,48104,US,MI,"[216 S State St, Ann Arbor, MI 48104]",,42.279605,-83.740965
0,tZmiwMg9Hdc8PcsKSuWQOg,blue-llama-jazz-club-ann-arbor,Blue LLama Jazz Club,https://s3-media2.fl.yelpcdn.com/bphoto/vcn9Z9...,True,False,https://www.yelp.com/biz/blue-llama-jazz-club-...,+17343723200,(734) 372-3200,103,"[{'alias': 'jazzandblues', 'title': 'Jazz & Bl...",4.5,[https://s3-media2.fl.yelpcdn.com/bphoto/vcn9Z...,$$$,"[{'open': [{'is_overnight': False, 'start': '1...","[delivery, pickup]",314 S Main St,,,Ann Arbor,48104,US,MI,"[314 S Main St, Ann Arbor, MI 48104]",,42.279019,-83.749017
0,4REtzXpQYy8dVev8RjWbSQ,mani-osteria-and-bar-ann-arbor,Mani Osteria & Bar,https://s3-media3.fl.yelpcdn.com/bphoto/LJscB4...,True,False,https://www.yelp.com/biz/mani-osteria-and-bar-...,+17347696700,(734) 769-6700,956,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,[https://s3-media3.fl.yelpcdn.com/bphoto/LJscB...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",[delivery],341 E Liberty St,,,Ann Arbor,48104,US,MI,"[341 E Liberty St, Ann Arbor, MI 48104]",,42.279579,-83.744245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,4ZtSjxdv7lXd_RTCkAbawg,polo-fields-golf-and-country-clubs-ypsilanti,Polo Fields Golf & Country Clubs,https://s3-media3.fl.yelpcdn.com/bphoto/dzsLMt...,True,False,https://www.yelp.com/biz/polo-fields-golf-and-...,+17349981555,(734) 998-1555,2,"[{'alias': 'golf', 'title': 'Golf'}, {'alias':...",4.0,[https://s3-media3.fl.yelpcdn.com/bphoto/dzsLM...,,"[{'open': [{'is_overnight': False, 'start': '0...",[],2955 Packard Rd,,,Ypsilanti,48197,US,MI,"[2955 Packard Rd, Ypsilanti, MI 48197]",,42.242075,-83.655158
0,O7pfu3-HWLHuI8MNZFGOzg,glencoe-lakes-nature-trail-ann-arbor,Glencoe Lakes Nature Trail,https://s3-media1.fl.yelpcdn.com/bphoto/l0BQu2...,False,False,https://www.yelp.com/biz/glencoe-lakes-nature-...,,,1,"[{'alias': 'parks', 'title': 'Parks'}]",3.0,[https://s3-media1.fl.yelpcdn.com/bphoto/l0BQu...,,,[],4741 Washtenaw Ave,,,Ann Arbor,48108,US,MI,"[4741 Washtenaw Ave, Ann Arbor, MI 48108]",,42.251346,-83.667178
0,4WD8MHDHpl_JFzbmduodyQ,little-mac-shack-ann-arbor-2,Little Mac Shack,https://s3-media2.fl.yelpcdn.com/bphoto/QKtLUM...,False,False,https://www.yelp.com/biz/little-mac-shack-ann-...,+17349975399,(734) 997-5399,1,"[{'alias': 'fooddeliveryservices', 'title': 'F...",1.0,[https://s3-media2.fl.yelpcdn.com/bphoto/QKtLU...,,,[delivery],,,,Ann Arbor,48104,US,MI,"[Ann Arbor, MI 48104]",,42.267006,-83.727127
0,PU93OtU8LnfW-lB-oBjSDg,speedway-whitmore-lake,Speedway,https://s3-media4.fl.yelpcdn.com/bphoto/slYlJ7...,True,False,https://www.yelp.com/biz/speedway-whitmore-lak...,+17344498990,(734) 449-8990,2,"[{'alias': 'servicestations', 'title': 'Gas St...",3.5,[https://s3-media3.fl.yelpcdn.com/bphoto/7mP8j...,,"[{'open': [{'is_overnight': True, 'start': '00...",[delivery],300 Six Mile Road,,,Whitmore Lake,48115,US,MI,"[300 Six Mile Road, Whitmore Lake, MI 48115]",,42.393310,-83.761600


In [130]:
# Clean datasets. The "hours" column was messy and not in a string form, so I extracted information from the column and transited it to
# string form. Similar for the "transactions" column
aa_open = []
aa_hour_type = []
aa_is_open_now = []
aa_transactions = []
for i in range(aa_details_final.shape[0]):
    if not isNaN(aa_details_final["hours"].iloc[i]):
        aa_open.append(aa_details_final["hours"].iloc[i][0]["open"])
        aa_hour_type.append(aa_details_final["hours"].iloc[i][0]["hours_type"])
        aa_is_open_now.append(aa_details_final["hours"].iloc[i][0]["is_open_now"])
    else:
        aa_open.append("")
        aa_hour_type.append("")
        aa_is_open_now.append("")
    if (aa_details_final["transactions"].iloc[i] != []) and (not isNaN(aa_details_final["transactions"].iloc[i])):
        aa_transactions.append(aa_details_final["transactions"].iloc[i][0])
    else:
        aa_transactions.append("")

# I stored the info I extracted from "hours" and "transactions" form to new columns of the dataset
aa_details_final["open"] = aa_open
aa_details_final["hour_type"] = aa_hour_type
aa_details_final["is_open_now"] = aa_is_open_now
aa_details_final["transactions"] = aa_transactions
aa_details_final["full_address"] = full_address_aa
aa_details_final.reset_index()


Unnamed: 0,index,id,alias,name,image_url,is_claimed,is_closed,url,phone,display_phone,review_count,categories,rating,photos,price,hours,transactions,location.address1,location.address2,location.address3,location.city,location.zip_code,location.country,location.state,location.display_address,location.cross_streets,coordinates.latitude,coordinates.longitude,open,hour_type,is_open_now,full_address
0,0,uTMqhmpgfpDMLN3W3YvMeQ,frita-batidos-ann-arbor,Frita Batidos,https://s3-media3.fl.yelpcdn.com/bphoto/ZbbAL5...,True,False,https://www.yelp.com/biz/frita-batidos-ann-arb...,+17347612882,(734) 761-2882,2145,"[{'alias': 'cuban', 'title': 'Cuban'}, {'alias...",4.5,[https://s3-media3.fl.yelpcdn.com/bphoto/ZbbAL...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",delivery,117 W Washington St,,,Ann Arbor,48104,US,MI,"[117 W Washington St, Ann Arbor, MI 48104]",,42.280365,-83.749153,"[{'is_overnight': False, 'start': '1100', 'end...",REGULAR,False,"117 W Washington St Ann Arbor, MI 48104"
1,0,yNIYH9041m1JEyRS-N_LNw,aventura-ann-arbor,Aventura,https://s3-media3.fl.yelpcdn.com/bphoto/YeASIZ...,True,False,https://www.yelp.com/biz/aventura-ann-arbor?ad...,+17343693153,(734) 369-3153,857,"[{'alias': 'tapas', 'title': 'Tapas Bars'}, {'...",4.0,[https://s3-media3.fl.yelpcdn.com/bphoto/YeASI...,$$$,"[{'open': [{'is_overnight': False, 'start': '1...",delivery,216 E Washington St,,,Ann Arbor,48104,US,MI,"[216 E Washington St, Ann Arbor, MI 48104]",,42.280283,-83.746494,"[{'is_overnight': False, 'start': '1500', 'end...",REGULAR,False,"216 E Washington St Ann Arbor, MI 48104"
2,0,Fv2VLzVj9ATLcTbFehTDjg,savas-ann-arbor,Sava's,https://s3-media1.fl.yelpcdn.com/bphoto/FZHW89...,True,False,https://www.yelp.com/biz/savas-ann-arbor?adjus...,+17346232233,(734) 623-2233,1281,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.0,[https://s3-media1.fl.yelpcdn.com/bphoto/FZHW8...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",delivery,216 S State St,,,Ann Arbor,48104,US,MI,"[216 S State St, Ann Arbor, MI 48104]",,42.279605,-83.740965,"[{'is_overnight': False, 'start': '1100', 'end...",REGULAR,False,"216 S State St Ann Arbor, MI 48104"
3,0,tZmiwMg9Hdc8PcsKSuWQOg,blue-llama-jazz-club-ann-arbor,Blue LLama Jazz Club,https://s3-media2.fl.yelpcdn.com/bphoto/vcn9Z9...,True,False,https://www.yelp.com/biz/blue-llama-jazz-club-...,+17343723200,(734) 372-3200,103,"[{'alias': 'jazzandblues', 'title': 'Jazz & Bl...",4.5,[https://s3-media2.fl.yelpcdn.com/bphoto/vcn9Z...,$$$,"[{'open': [{'is_overnight': False, 'start': '1...",delivery,314 S Main St,,,Ann Arbor,48104,US,MI,"[314 S Main St, Ann Arbor, MI 48104]",,42.279019,-83.749017,"[{'is_overnight': False, 'start': '1800', 'end...",REGULAR,False,"314 S Main St Ann Arbor, MI 48104"
4,0,4REtzXpQYy8dVev8RjWbSQ,mani-osteria-and-bar-ann-arbor,Mani Osteria & Bar,https://s3-media3.fl.yelpcdn.com/bphoto/LJscB4...,True,False,https://www.yelp.com/biz/mani-osteria-and-bar-...,+17347696700,(734) 769-6700,956,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,[https://s3-media3.fl.yelpcdn.com/bphoto/LJscB...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",delivery,341 E Liberty St,,,Ann Arbor,48104,US,MI,"[341 E Liberty St, Ann Arbor, MI 48104]",,42.279579,-83.744245,"[{'is_overnight': False, 'start': '1600', 'end...",REGULAR,False,"341 E Liberty St Ann Arbor, MI 48104"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,4ZtSjxdv7lXd_RTCkAbawg,polo-fields-golf-and-country-clubs-ypsilanti,Polo Fields Golf & Country Clubs,https://s3-media3.fl.yelpcdn.com/bphoto/dzsLMt...,True,False,https://www.yelp.com/biz/polo-fields-golf-and-...,+17349981555,(734) 998-1555,2,"[{'alias': 'golf', 'title': 'Golf'}, {'alias':...",4.0,[https://s3-media3.fl.yelpcdn.com/bphoto/dzsLM...,,"[{'open': [{'is_overnight': False, 'start': '0...",,2955 Packard Rd,,,Ypsilanti,48197,US,MI,"[2955 Packard Rd, Ypsilanti, MI 48197]",,42.242075,-83.655158,"[{'is_overnight': False, 'start': '0730', 'end...",REGULAR,False,"2955 Packard Rd Ypsilanti, MI 48197"
996,0,O7pfu3-HWLHuI8MNZFGOzg,glencoe-lakes-nature-trail-ann-arbor,Glencoe Lakes Nature Trail,https://s3-media1.fl.yelpcdn.com/bphoto/l0BQu2...,False,False,https://www.yelp.com/biz/glencoe-lakes-nature-...,,,1,"[{'alias': 'parks', 'title': 'Parks'}]",3.0,[https://s3-media1.fl.yelpcdn.com/bphoto/l0BQu...,,,,4741 Washtenaw Ave,,,Ann Arbor,48108,US,MI,"[4741 Washtenaw Ave, Ann Arbor, MI 48108]",,42.251346,-83.667178,,,,"4741 Washtenaw Ave Ann Arbor, MI 48108"
997,0,4WD8MHDHpl_JFzbmduodyQ,little-mac-shack-ann-arbor-2,Little Mac Shack,https://s3-media2.fl.yelpcdn.com/bphoto/QKtLUM...,False,False,https://www.yelp.com/biz/little-mac-shack-ann-...,+17349975399,(734) 997-5399,1,"[{'alias': 'fooddeliveryservices', 'title': 'F...",1.0,[https://s3-media2.fl.yelpcdn.com/bphoto/QKtLU...,,,delivery,,,,Ann Arbor,48104,US,MI,"[Ann Arbor, MI 48104]",,42.267006,-83.727127,,,,"Ann Arbor, MI 48104"
998,0,PU93OtU8LnfW-lB-oBjSDg,speedway-whitmore-lake,Speedway,https://s3-media4.fl.yelpcdn.com/bphoto/slYlJ7...,True,False,https://www.yelp.com/biz/speedway-whitmore-lak...,+17344498990,(734) 449-8990,2,"[{'alias': 'servicestations', 'title': 'Gas St...",3.5,[https://s3-media3.fl.yelpcdn.com/bphoto/7mP8j...,,"[{'open': [{'is_overnight': True, 'start': '00...",delivery,300 Six Mile Road,,,Whitmore Lake,48115,US,MI,"[300 Six Mile Road, Whitmore Lake, MI 48115]",,42.393310,-83.761600,"[{'is_overnight': True, 'start': '0000', 'end'...",REGULAR,True,"300 Six Mile Road Whitmore Lake, MI 48115"


In [131]:
# Transform the dataframe to a csv
aa_details_final.to_csv("AA_Yelp_details.csv")

In [133]:
from fuzzywuzzy import process
import warnings
warnings.filterwarnings('ignore')

In [156]:
# importing the data

aa_yelp = pd.read_csv("AA_Yelp_details.csv", comment="#")
aa_clct_common = pd.read_csv("aa_clct_common.csv", comment="#")
aa_clct_uncommon = pd.read_csv("aa_clct_uncommon.csv", comment="#")
aa_resid = pd.read_csv("aa_resid.csv", comment="#")
aa_common = pd.read_csv("aa_common.csv", comment="#")
aa_uncommon = pd.read_csv("aa_uncommon.csv", comment="#")

In [157]:
# define all the neccessary functions in the data cleaning and merging process

def merge_dataframes(*args):
    concat = pd.concat(args)
    
    return concat

def remove_standard_duplicates(concat):
    merged = concat.drop_duplicates(subset=['Name', 'Address'])
    merged = merged.drop(columns=['Unnamed: 0'])
    #merged = merged[[merged.columns[0], merged.columns[1], merged.columns[2]]]
    
    return merged


def remove_fuzzy_duplicates(merged):
    merged['Name + Address'] = merged['Name'] + merged['Address']
    merged = merged.astype(str)
    unique_strings = list(merged['Name + Address'].unique())
    merged['second_best_match'] = merged['Name + Address'].apply(lambda x: process.extract(x, unique_strings, limit=2)[1][0])
    merged['second_best_score'] = merged['Name + Address'].apply(lambda x: process.extract(x, unique_strings, limit=2)[1][1])
    potential_matches = merged.loc[merged['second_best_score'] >= 92]
    potential_matches = potential_matches.sort_values('Name')
    potential_matches = potential_matches.drop(potential_matches.index[::2])
    complete = merged.loc[merged['second_best_score'] < 92]
    complete = complete.append(potential_matches)
    
    return complete

# creating two new columns for latitude and longitude
def lat_lng(df):
    lat = []
    long = []
    for i in range(len(df)):
        if isNaN(df['Coordinates'][i]):
            df['Coordinates'][i] = "{'lat': None, 'lng': None}"
        df['Coordinates'][i] = eval(df['Coordinates'][i])
        try:
            lat.append(df['Coordinates'][i]['lat'])
            long.append(df['Coordinates'][i]['lng'])
        except:
            lat.append(df['Coordinates'][i]['latitude'])
            long.append(df['Coordinates'][i]['longitude'])
    df['Latitude'] = lat
    df['Longitude'] = long

In [158]:
# recode all coordinates
lat_lng(aa_common)
lat_lng(aa_uncommon)
lat_lng(aa_clct_common)
lat_lng(aa_clct_uncommon)
lat_lng(aa_resid)

In [159]:
# recode yelp location type
for i in range(len(aa_yelp)):
    lst = eval(aa_yelp['categories'][i])
    type_list = []
    for j in range(len(lst)):
        type_list.append(lst[j]['alias'])
    aa_yelp['categories'][i] = type_list

In [161]:
# rename columns for consistency
aa_yelp = aa_yelp.rename(columns={
    'name': 'Name',
    'coordinates.latitude': 'Latitude',
    'coordinates.longitude': 'Longitude',
    'categories': 'category',
    'full_address': 'Address',
    'is_closed': 'business_status',
    'review_count': 'user_ratings_total',
    'price': 'price_level'
})

In [166]:
# as the first step of merging,
# merge and drop standard duplicates
aa_merged = merge_dataframes(aa_yelp, aa_clct_common, aa_clct_uncommon, aa_resid, aa_common, aa_uncommon)
aa_merged_clean = remove_standard_duplicates(aa_merged)

In [167]:
# drop blank columns
aa_merged_clean = aa_merged_clean.drop(
    ['alias', 'is_claimed', 'phone', 'transactions', 'location.address1', 
        'location.address2', 'location.address3', 'location.city', 'location.zip_code',
        'location.country', 'location.state', 'location.display_address', 'location.cross_streets', 
        'open', 'is_open_now', 'delivery', 'dine_in', 'takeout', 'serves_breakfast',
        'serves_brunch', 'serves_lunch', 'serves_dinner', 'serves_vegetarian_food', 'serves_beer', 'serves_wine',
        'types', 'Coordinates'
    ]
    , axis=1)

In [169]:
# fuzzywuzzy
# note this cell takes a long time to run
aa_merged_clean = remove_fuzzy_duplicates(aa_merged_clean)

In [171]:
aa_merged_clean.head()

Unnamed: 0,id,Name,image_url,business_status,url,display_phone,user_ratings_total,category,rating,photos,price_level,hours,Latitude,Longitude,hour_type,Address,place_id,business_hours,business_hours_text,Name + Address,second_best_match,second_best_score
0,uTMqhmpgfpDMLN3W3YvMeQ,Frita Batidos,https://s3-media3.fl.yelpcdn.com/bphoto/ZbbAL5...,False,https://www.yelp.com/biz/frita-batidos-ann-arb...,(734) 761-2882,2145.0,"['cuban', 'burgers']",4.5,['https://s3-media3.fl.yelpcdn.com/bphoto/ZbbA...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",42.2803651,-83.7491532,REGULAR,"117 W Washington St Ann Arbor, MI 48104",,,,"Frita Batidos117 W Washington St Ann Arbor, MI...",University of Michigan Museum of Natural Histo...,86
1,yNIYH9041m1JEyRS-N_LNw,Aventura,https://s3-media3.fl.yelpcdn.com/bphoto/YeASIZ...,False,https://www.yelp.com/biz/aventura-ann-arbor?ad...,(734) 369-3153,857.0,"['tapas', 'spanish', 'bars']",4.0,['https://s3-media3.fl.yelpcdn.com/bphoto/YeAS...,$$$,"[{'open': [{'is_overnight': False, 'start': '1...",42.280283,-83.746494,REGULAR,"216 E Washington St Ann Arbor, MI 48104",,,,"Aventura216 E Washington St Ann Arbor, MI 48104","Mash211 E Washington St Ann Arbor, MI 48104",87
2,Fv2VLzVj9ATLcTbFehTDjg,Sava's,https://s3-media1.fl.yelpcdn.com/bphoto/FZHW89...,False,https://www.yelp.com/biz/savas-ann-arbor?adjus...,(734) 623-2233,1281.0,"['bars', 'breakfast_brunch', 'tradamerican']",4.0,['https://s3-media1.fl.yelpcdn.com/bphoto/FZHW...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",42.279605,-83.7409649,REGULAR,"216 S State St Ann Arbor, MI 48104",,,,"Sava's216 S State St Ann Arbor, MI 48104","aMa Bistro215 S State St Ann Arbor, MI 48104",88
3,tZmiwMg9Hdc8PcsKSuWQOg,Blue LLama Jazz Club,https://s3-media2.fl.yelpcdn.com/bphoto/vcn9Z9...,False,https://www.yelp.com/biz/blue-llama-jazz-club-...,(734) 372-3200,103.0,"['jazzandblues', 'musicvenues', 'newamerican']",4.5,['https://s3-media2.fl.yelpcdn.com/bphoto/vcn9...,$$$,"[{'open': [{'is_overnight': False, 'start': '1...",42.279019,-83.749017,REGULAR,"314 S Main St Ann Arbor, MI 48104",,,,"Blue LLama Jazz Club314 S Main St Ann Arbor, M...",University of Michigan Museum of Natural Histo...,86
4,4REtzXpQYy8dVev8RjWbSQ,Mani Osteria & Bar,https://s3-media3.fl.yelpcdn.com/bphoto/LJscB4...,False,https://www.yelp.com/biz/mani-osteria-and-bar-...,(734) 769-6700,956.0,"['italian', 'pizza', 'cocktailbars']",4.0,['https://s3-media3.fl.yelpcdn.com/bphoto/LJsc...,$$,"[{'open': [{'is_overnight': False, 'start': '1...",42.2795793749763,-83.744244702611,REGULAR,"341 E Liberty St Ann Arbor, MI 48104",,,,"Mani Osteria & Bar341 E Liberty St Ann Arbor, ...","Eat407 N 5th Ave Ann Arbor, MI 48104",86


In [172]:
aa_sites = aa_merged_clean.copy()
# for 'FALSE'/'False' in business_status, change it to 'OPERATIONAL'
aa_sites['business_status'] = aa_sites['business_status'].replace('FALSE', 'OPERATIONAL')
aa_sites['business_status'] = aa_sites['business_status'].replace('False', 'OPERATIONAL')
# rename columns so that we can distinguish google id and yelp id
aa_sites.rename(columns={'place_id': 'google_id', 'id': 'yelp_id'}, inplace=True)

In [174]:
name_counts = pd.DataFrame(aa_sites['Name'].value_counts())
name_counts[name_counts['Name'] > 2]
# look for chains that might be local that I'd be unfamiliar with (do a quick google search to see what they look like)
# note that this is just for my own exploration
# this code do not have any impact on the final dataset

Unnamed: 0,Name
Starbucks,22
Zipcar,19
Subway,18
McDonald's,16
Jimmy John's,14
...,...
Noodles & Company,3
Blue Café,3
Detroit Wing Company,3
Red Robin Gourmet Burgers and Brews,3


In [182]:
# Ellie wants a FUE-only dataset, so here we do an extra step of filtering
aa_FUEs = aa_sites[(
    (aa_sites['category'] == 'bar') | (aa_sites['category'] == 'cafe') | 
    (aa_sites['category'] == 'food') | (aa_sites['category'] == 'restaurant') |
    (aa_sites['category'] == 'bakery') | (aa_sites['category'] == 'meal_delivery') |
    (aa_sites['category'] == 'meal_takeaway') | (aa_sites['category'] == 'convenience_store') |
    (aa_sites['category'] == 'school') | (aa_sites['category'] == 'grocery_or_supermarket') |
    (aa_sites['category'] == 'university') | (aa_sites['category'] == 'supermarket') |
    (aa_sites['category'] == 'secondary_school') | (aa_sites['category'] == 'primary_school') |
    (aa_sites['category'] == 'airport') | (aa_sites['category'] == 'amusement_park') |
    (aa_sites['category'] == 'aquarium') | (aa_sites['category'] == 'movie_theater') )]

In [183]:
# final step!
# save everything!
aa_FUEs.to_csv('aa_FUEs.csv')
aa_sites.to_csv('aa_complete.csv')

In [184]:
len(aa_sites)
# this is the length of aa_sites with duplicates

3180

In [185]:
# save a copy of aa_sites in case i mess up with something later
aa_sites_copy = aa_sites.copy()

In [193]:
# now we do some exploration to avoid the manual checking and removing duplicates process
# since 1 degree longtitude is 111 km, we know that 0.00001 degree is about 1 meter
# we try rounding the coordinates and use that as a standard to drop duplicates

# first transform the coordinates to float
aa_sites['Latitude'] = aa_sites['Latitude'].astype(float)
aa_sites['Longitude'] = aa_sites['Longitude'].astype(float)

# then round the coordinates
aa_sites['Latitude'] = aa_sites['Latitude'].round(5)
aa_sites['Longitude'] = aa_sites['Longitude'].round(5)

In [194]:
# then drop duplicates
aa_sites.drop_duplicates(subset=['Latitude', 'Longitude'], inplace=True)

In [196]:
len(aa_sites)

# note that our manual dropping left us with 2650 rows
# here this method left us with 2849 rows
# which means that rounding the coordinates to 5 digits as a criteria is actually good

2849

# Below is the extra processing I did after manual cleaning

In [6]:
aa_sites = pd.read_csv('../data/aa_complete.csv', comment="#")

In [7]:
aa_sites.shape

(2650, 24)

In [11]:
# display max columns
pd.set_option('display.max_columns', None)
aa_sites.head()

Unnamed: 0.1,Unnamed: 0,yelp_id,Name,image_url,business_status,url,display_phone,user_ratings_total,category,rating,photos,price_level,hours,Latitude,Longitude,hour_type,Address,google_id,business_hours,business_hours_text,Name + Address,second_best_match,second_best_score,hours_open
0,201,C8IeolsUWU0MWFY0aCEdRg,1923,https://s3-media3.fl.yelpcdn.com/bphoto/chingD...,OPERATIONAL,https://www.yelp.com/biz/1923-ann-arbor?adjust...,(734) 934-0969,8.0,"['tacos', 'foodtrucks']",5.0,['https://s3-media3.fl.yelpcdn.com/bphoto/chin...,,"[{'open': [{'is_overnight': False, 'start': '1...",42.28348,-83.75104,REGULAR,"320 Miller Ave Ann Arbor, MI 48103",,,,"1923320 Miller Ave Ann Arbor, MI 48103",Vinology Restaurant & Event Space110 S Main St...,86,
1,989,xE2I0C6uWcx2mstg-j45yA,¡Whoa! Tacos,https://s3-media3.fl.yelpcdn.com/bphoto/t0TidF...,OPERATIONAL,https://www.yelp.com/biz/whoa-tacos-metro-detr...,(734) 716-2646,3.0,"['foodtrucks', 'catering', 'tacos']",3.5,['https://s3-media3.fl.yelpcdn.com/bphoto/t0Ti...,,"[{'open': [{'is_overnight': False, 'start': '0...",42.358002,-83.214996,REGULAR,"Metro Detroit, MI 48393",,,,"¡Whoa! TacosMetro Detroit, MI 48393","Frita Batidos117 W Washington St Ann Arbor, MI...",86,
2,93,,100 E Hoover Ave Parking,,OPERATIONAL,,,,parking,,,,,42.269023,-83.748427,,"140E East Hoover Avenue, Ann Arbor",ChIJ1dF5bTGuPIgRiV9otUqoNdQ,,,100 E Hoover Ave Parking140E East Hoover Avenu...,"eat1906 Packard St Ann Arbor, MI 48104",86,
3,27,,100-198 E Ann St Parking,,OPERATIONAL,,,,parking,,,,,42.281999,-83.747725,,"100-198 E Ann St, Ann Arbor",ChIJjWxe7T2uPIgRmH_UgIrnwLk,,,"100-198 E Ann St Parking100-198 E Ann St, Ann ...","100-198 S Maple Rd Parking100-198 S Maple Rd, ...",87,
4,7,,100-198 S Maple Rd Parking,,OPERATIONAL,,,,parking,,,,,42.279602,-83.78181,,"100-198 S Maple Rd, Ann Arbor",ChIJl0oozcCxPIgRJKsvEdXECeI,,,"100-198 S Maple Rd Parking100-198 S Maple Rd, ...","100-198 E Ann St Parking100-198 E Ann St, Ann ...",87,


In [14]:
eval(aa_sites['category'][0])

['tacos', 'foodtrucks']

In [17]:
# we loop through the category column and extract the first element of each list
# if there is only a single element, we leave it as it is
category_list = []
for i in aa_sites['category']:
    try:
        values = eval(i)
        category_list.append(values[0])
    except:
        category_list.append(i)
        continue

In [20]:
aa_sites['main_category'] = category_list
aa_sites.to_csv('aa_complete.csv')

# Below is the extra processing for a FUE-only CSV file

In [3]:
import pandas as pd
aa_full_biz = pd.read_csv('../data/complete_data_files/aa_complete.csv', comment="#")

In [11]:
t = aa_full_biz['main_category'].values
t = t.tolist()
t = list(set(t))
t

[nan,
 'skate_parks',
 'vegan',
 'shoe_store',
 'car_dealer',
 'grocery',
 'supermarket',
 'breweries',
 'gourmet',
 'icecream',
 'beer_and_wine',
 'gym',
 'fooddeliveryservices',
 'szechuan',
 'jewelry_store',
 'veterinary_care',
 'hotdog',
 'general_contractor',
 'dog_parks',
 'burgers',
 'golf',
 'halal',
 'grocery_or_supermarket',
 'korean',
 'bicycle_store',
 'seafood',
 'pizza',
 'cafeteria',
 'localflavor',
 'landmarks',
 'hardware_store',
 'tradamerican',
 'health',
 'hiking',
 'bars',
 'yelpevents',
 'mexican',
 'secondary_school',
 'bus_station',
 'bakery',
 'neighborhood',
 'courthouse',
 'vinyl_records',
 'vegetarian',
 'physiotherapist',
 'coffee',
 'farmersmarket',
 'german',
 'cemetery',
 'tourist_attraction',
 'pubs',
 'furniture_store',
 'gas_station',
 'food',
 'cuban',
 'dogwalkers',
 'liquor_store',
 'tobaccoshops',
 'taxi_stand',
 'parks',
 'church',
 'insurance_agency',
 'buffets',
 'store',
 'educationservices',
 'pet_store',
 'diners',
 'atm',
 'dentist',
 'venu

In [12]:
non_FUE = ['skate_parks', 'shoe_store', 'car_dealer', 'gym', 'jewelry_store',
           'veterinary_care', 'general_contractor', 'dog_parks', 'golf', 'bicycle_store',
           'landmarks', 'hardware_store', 'health', 'hiking', 'yelpevents', 'bus_station',
            'neighborhood',  'courthouse', 'vinyl_records', 'physiotherapist', 'cemetery',
            'tourist_attraction',  'furniture_store', 'gas_station', 'dogwalkers', 'tobaccoshops',
            'taxi_stand', 'parks', 'church', 'insurance_agency', 'store', 'educationservices', 
            'pet_store', 'atm', 'dentist', 'venues', 'mosque', 'publicart', 'parking', 'finance',
            'library', 'bowling_alley', 'lawyer', 'zoos', 'fire_station', 'plumber', 'transit_station',
            'lodging', 'synagogue', 'place_of_worship', 'boating', 'stadium', 'servicestations', 'divebars',
            'indpak', 'drugstore', 'electronics_store', 'car_repair', 'gardens', 'premise', 
            'accounting', 'point_of_interest', 'painter', 'museums', 'post_office', 'funeral_home',
            'locality', 'clothing_store', 'pharmacy', 'poolhalls', 'laundry', 'playgrounds', 'local_government_office',
            'hair_care', 'doctor', 'book_store', 'movie_rental', 'moving_company', 'park', 'electrician',
             'storage', 'real_estate_agency', 'department_store', 'beauty_salon', 'bank'
           ]

In [13]:
# filter all FUEs
# they are businesses whose main_category is not in the non_FUE list
aa_FUEs = aa_full_biz[~aa_full_biz['main_category'].isin(non_FUE)]

In [15]:
aa_FUEs.reset_index(drop=True, inplace=True)

In [17]:
# save as a csv
aa_FUEs.to_csv('../data/FUE_only_data_files/aa_FUEs.csv')