In [12]:
#imports
import requests
import json
from configparser import ConfigParser
import csv 

In [13]:
#load the API key from a file a few directories up so that it is not stored in the git repo.
SECRET_CONFIG_PATH = '../../secret.cfg'
config = ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed, so an empty result means the secret file is missing or unreadable.
# Fail here with a clear message instead of a confusing "No section: 'auth'" later.
if not config.read(SECRET_CONFIG_PATH):
    raise FileNotFoundError('Could not read API credentials from {}'.format(SECRET_CONFIG_PATH))
# the file is expected to contain an [auth] section with a "token" option
token = config.get('auth', 'token')

#set it as the header (sent with every request to the API)
headers = {'Authorization': 'Bearer %s' % token}

In [14]:
#set the url to the yelp API
url = 'https://api.yelp.com/v3/businesses/search'

#From the yelp API docs, there are a bunch of categories.  I went through and found ones that I believed would
#be of interest to college students.
# Each alias below is searched separately, so every entry must be a single alias and
# should appear only once:
#   * "piadina" and "poke" were previously fused into the single string "piadinapoke"
#     (a missing separator), which is not a usable alias.
#   * "wineries" and "musicvenues" were each listed twice, which repeated the same
#     search and duplicated the resulting rows.
interestingCategories = [
    "atvrentals", "airsoft", "amateursportsteams", "amusementparks", "aquariums", "archery",
    "axethrowing", "badminton", "baseballfields", "beaches", "bicyclepaths", "bikerentals",
    "bikeparking", "boating", "bobsledding", "bocceball", "bowling", "bungeejumping",
    "climbing", "daycamps", "discgolf", "diving", "escapegames", "fencing", "fishing",
    "fitness", "flyboarding", "gokarts", "golf", "gun_ranges", "hiking", "horsebackriding",
    "jetskis", "lasertag", "mini_golf", "mountainbiking", "paddleboarding", "paintball",
    "parks", "playgrounds", "races", "rafting", "recreation", "sailing", "football",
    "swimmingpools", "tennis", "skatingrinks",
    "arcades", "galleries", "bingo", "movietheaters", "eatertainment", "festivals",
    "musicvenues", "theater", "social_clubs", "stadiumsarenas", "ticketsales", "wineries",
    "acnetreatment", "barbers", "spas", "massage", "bicycles",
    "acaibowls", "backshop", "bagels", "bakeries", "beer_and_wine", "bento",
    "beverage_stores", "breweries", "bubbletea", "butcher", "csa", "chimneycakes",
    "churros", "cideries", "coffee", "coffeeteasupplies", "coffeeroasteries", "convenience",
    "cupcakes", "customcakes", "delicatessen", "desserts", "distilleries", "diyfood",
    "donairs", "donuts", "empanadas", "farmersmarket", "fishmonger", "fooddeliveryservices",
    "foodtrucks", "friterie", "gelato", "grocery", "hawkercentre", "honey", "icecream",
    "importedfood", "intlgrocery", "internetcafe", "jpsweets", "juicebars", "kiosk",
    "kombucha", "meaderies", "milkshakebars", "gluhwein", "nasilemak", "organic_stores",
    "panzerotti", "eltern_cafes", "cakeshop", "piadina", "poke", "pretzels", "shavedice",
    "shavedsnow", "smokehouse", "gourmet", "streetvendors", "sugarshacks", "tea",
    "tortillas",
    "adultentertainment", "barcrawl", "bars", "beergardens", "clubcrawl", "coffeeshops",
    "comedyclubs", "danceclubs", "dancerestaurants", "karaoke", "pianobars", "poolhalls",
    "petstore", "restaurants", "shopping",
]


In [15]:
businesses = []  #start with empty array of businesses to build on
FIFTEEN_MILES = 24140 #this is 15 miles, measured in meters
MAX_RETURN_SIZE = 50 #the max yelp will return is 50 businesses at a time
MAX_OFFSET = 1000 #stop paging at 1000 - this is the limit set by yelp
SEARCH_LOCATION = '1000 Hilltop Circle Baltimore, MD 21250' #centre point of every search

# Page through the search results of every category and collect them in `businesses`.
# A single loop issues both the first and the follow-up requests, so every page is
# sent with the same parameters ('categories', radius, page size) and is checked the
# same way.
for category in interestingCategories:  #for each category of interest...
    offset = 0       #number of results already collected for this category
    numToGet = None  #total reported by the API; unknown until the first page arrives
    while numToGet is None or offset < min(numToGet, MAX_OFFSET):
        params = {
            'categories': category,
            'location': SEARCH_LOCATION,
            'radius': FIFTEEN_MILES,
            # never ask for results beyond the paging window
            # NOTE(review): assumes the API wants offset + limit <= MAX_OFFSET - confirm in the docs
            'limit': min(MAX_RETURN_SIZE, MAX_OFFSET - offset),
            'sort_by': 'rating',
        }
        if offset:
            params['offset'] = offset  #only follow-up pages carry an offset
        req = requests.get(url, params=params, headers=headers)

        #check the status code of the response
        print('The status code is {}'.format(req.status_code))
        if req.status_code != 200:
            break  #if it failed... give up on this category without breaking the process

        resp = json.loads(req.text) #load the text of the response to json
        page = resp.get('businesses', [])
        if not page:
            break  #an empty page means nothing more can be fetched; avoids looping forever
        numToGet = resp.get('total', 0)
        for business in page:  #create column of category for each business to save what it was found on
            business['category'] = category
        businesses.extend(page) #extend the business array with the businesses
        offset += len(page)  #advance by what was actually received


The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 500
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status code is 200
The status 

In [11]:
#parse the results and write to a csv file
# newline='' hands line-ending control to the csv module (without it, blank rows can
# appear between records on Windows); the explicit utf-8 encoding keeps non-ASCII
# business names from depending on the platform's default encoding.
with open('../data/scrapedData.csv', mode='w', newline='', encoding='utf-8') as csv_file: #open the csv file...
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) #create the writer...
    #write the first row to be the headers
    writer.writerow(['id', 'alias', 'name', 'image_url', 'is_closed', 'url', 'review_count', 'categories', 'rating', 'latitude', 'longitude', 'distance', 'address', 'price', 'is_delivery', 'is_pickup', 'category'])
    for business in businesses: #loop through and write a row per business
        #price represented as $, $$, $$$, $$$$ - convert to numeric to be used with sorting
        #(left empty when the business has no price or the value is null/blank)
        raw_price = business.get('price')
        price = len(raw_price) if raw_price else None

        #save the category titles in one string; stays empty when the key is absent
        categories = None
        if 'categories' in business:
            categories = ", ".join(entry['title'] for entry in business['categories'])

        #a missing or null transactions field is treated as "no delivery, no pickup"
        transactions = business.get('transactions') or []
        isDelivery = 'delivery' in transactions
        isPickup = 'pickup' in transactions

        #business["category"] is the search category this business was found under
        writer.writerow([business["id"], business["alias"], business["name"], business["image_url"], business["is_closed"], business["url"], business["review_count"], categories, business["rating"], business["coordinates"]["latitude"], business["coordinates"]["longitude"], business["distance"], " ".join(business["location"]["display_address"]), price, isDelivery, isPickup, business["category"]])
        