# Python Script for Yelp

### This Python script retrieves data from Yelp and stores it in MongoDB

#### We use yelpapi to connect to the Yelp servers
#### We use the pymongo driver to connect to MongoDB

##### Yelp business details are being fetched every hour
##### Yelp event details are being fetched on a daily basis

In [1]:
# Install dependencies

!pip install pymongo
!pip install yelpapi
!pip install pandas



In [2]:
# Import dependencies

from pymongo import MongoClient
from yelpapi import YelpAPI
from pprint import pprint
import pandas as pd
import time
from threading import Thread
import requests

In [3]:
# Connect to mongodb

import os

# The connection string can be overridden through the MONGODB_URI environment
# variable, so credentials for other deployments do not have to be edited into
# the notebook. When the variable is unset, the original URI is used.
client = MongoClient(os.environ.get("MONGODB_URI", "mongodb://lookup:lookup@mongodb:27017/"))

In [4]:
# Sanity-check the connection by printing every database the server reports

database_infos = client.list_databases()
for database_info in database_infos:
    print(database_info)

{'name': 'admin', 'sizeOnDisk': 102400, 'empty': False}
{'name': 'config', 'sizeOnDisk': 110592, 'empty': False}
{'name': 'local', 'sizeOnDisk': 81920, 'empty': False}
{'name': 'lookup', 'sizeOnDisk': 2318336, 'empty': False}


In [5]:
import os

# Yelp API keys are read from the environment so that credentials are not
# stored in the notebook itself.
# API1 is not referenced anywhere else in the visible code, so it is optional.
API1 = os.environ.get("YELP_API_KEY_1")

# API2 is the key handed to the YelpAPI client below, so it is required.
API2 = os.environ.get("YELP_API_KEY_2")
if not API2:
    raise RuntimeError("Set the YELP_API_KEY_2 environment variable to a valid Yelp API key")

yelp_api = YelpAPI(API2)

In [6]:
# Empty accumulators for the business data collected further down:
# a list of raw business documents and a (currently empty) DataFrame.
list__businesses_documents = []

df__businesses = pd.DataFrame()

In [7]:
# Define Locations

# (display name, latitude, longitude) for every city that is searched.
_CITY_COORDINATES = (
    ("Chicago", 41.881832, -87.623177),
    ("New York", 40.73061, -73.935242),
    ("San Francisco", 37.773972, -122.431297),
    ("Seattle", 47.608013, -122.335167),
)

# Each location carries a snake_case "key", the display "value" that is passed
# to the Yelp queries as the location, and its "coordinates".
locations = [
    {
        "key": city.lower().replace(" ", "_"),
        "value": city,
        "coordinates": {"lat": lat, "lng": lng},
    }
    for city, lat, lng in _CITY_COORDINATES
]

# Yelp category aliases used to filter the business searches.
alias = [
    "carpenters", "electricians", "homecleaning", "painters",
    "plumbing", "hvac", "waterheaterinstallrepair", "blinds",
]

In [None]:
business_ids = set()

def fetch_businesses_hourly():
    """Refresh the ``lookup.yelp_businesses`` collection once an hour, forever.

    Each cycle:
      1. searches every entry of ``locations`` (6 pages of 40 results each,
         filtered by the ``alias`` categories) and adds the ids found to the
         module-level ``business_ids`` set;
      2. fetches the detail document of every known id, retrying once after a
         short pause and skipping the id if the retry fails as well;
      3. replaces the contents of the collection with the fresh documents;
      4. sleeps for 3600 seconds.

    The function never returns, so it is meant to be run in its own thread.
    Errors raised by the search queries are not caught and end the loop.
    """
    while True:
        for location in locations:
            for page in range(6):
                response = yelp_api.search_query(categories=alias, location=location['value'], limit=40, offset=(page*40))

                for business in response['businesses']:
                    business_ids.add(business["id"])

        # Build a fresh list every cycle: reusing one list across cycles would
        # re-insert the documents of all earlier cycles and duplicate every
        # business in the collection.
        documents = []
        for b_id in business_ids:
            try:
                documents.append(yelp_api.business_query(id=b_id))
            except Exception:
                # Back off briefly and retry once before giving up on this id.
                time.sleep(0.5)
                try:
                    documents.append(yelp_api.business_query(id=b_id))
                except Exception as e:
                    print("Skipping business {}: {}".format(b_id, e))

        # Keep the module-level list in sync with the latest snapshot (in place,
        # so no ``global`` statement is needed).
        list__businesses_documents[:] = documents

        # insert_many rejects an empty list, and wiping the collection when
        # nothing was fetched would only lose data, so skip the write then.
        if documents:
            collection = client["lookup"]["yelp_businesses"]
            collection.delete_many({})
            collection.insert_many(documents)

        time.sleep(3600)

In [9]:
# Start from empty containers so re-running this cell does not accumulate data.
business_ids = set()
list__businesses_documents = []

# Step 1: collect business ids, 6 pages of 40 results for every location.
for location in locations:
    for offset in range(0, 6 * 40, 40):
        response = yelp_api.search_query(categories=alias, location=location['value'], limit=40, offset=offset)
        business_ids.update(business["id"] for business in response['businesses'])

# Step 2: fetch the detail document of every id.
for b_id in business_ids:
    try:
        details = yelp_api.business_query(id=b_id)
    except Exception:
        # One retry after a short pause; a second failure propagates.
        time.sleep(0.5)
        details = yelp_api.business_query(id=b_id)
    list__businesses_documents.append(details)

In [10]:
# Replace the contents of lookup.yelp_businesses with the fetched business documents
lookup_db = client.get_database("lookup")
collection = lookup_db.get_collection("yelp_businesses")
collection.delete_many({})
collection.insert_many(list__businesses_documents)

<pymongo.results.InsertManyResult at 0x7f9b7976d1c0>

In [11]:
# Fetch business deals

# Same paged search as for the businesses (6 pages of 40 per location), but
# with the 'deals' attribute filter; only the ids are collected here.
business_ids_deals = set()
for location in locations:
    for offset in range(0, 6 * 40, 40):
        response = yelp_api.search_query(categories=alias, attributes=['deals'], location=location['value'], limit=40, offset=offset)
        business_ids_deals.update(business["id"] for business in response['businesses'])

In [12]:
# Detail documents for every business id found by the deals search.
list__businesses_deals_documents = []

for b_id in business_ids_deals:
    try:
        details = yelp_api.business_query(id=b_id)
    except Exception:
        # Pause briefly and try once more; a second failure propagates.
        time.sleep(0.5)
        details = yelp_api.business_query(id=b_id)
    list__businesses_deals_documents.append(details)

In [13]:
# Update MongoDB

# Replace the contents of lookup.yelp_deals with the fetched deal documents
lookup_db = client.get_database("lookup")
collection = lookup_db.get_collection("yelp_deals")
collection.delete_many({})
collection.insert_many(list__businesses_deals_documents)

<pymongo.results.InsertManyResult at 0x7f9b7974cfc0>

In [None]:
# Fetch yelp reviews

list__review_documents = []
review_frames = []

for business in list__businesses_documents:
    business_id = business["id"]

    # Only the API call sits inside the try block, so a failure while
    # processing the response can never trigger a second, redundant request.
    try:
        response = yelp_api.reviews_query(id=business_id)
    except Exception:
        # Back off briefly and retry once before giving up on this business.
        time.sleep(0.5)
        try:
            response = yelp_api.reviews_query(id=business_id)
        except Exception as e:
            print("Skipping reviews for business {}: {}".format(business_id, e))
            continue

    reviews = response['reviews']
    review_frames.append(pd.DataFrame(reviews))

    # Tag each review with the id of the business it belongs to.
    list__review_documents.extend({**review, 'business_id': business_id} for review in reviews)

# DataFrame.append was removed in pandas 2.0; concatenate the per-business
# frames once instead (pd.concat needs at least one frame).
df__reviews = pd.concat(review_frames) if review_frames else pd.DataFrame()

In [None]:
# Update MongoDB

# Replace the contents of the yelp_reviews collection with the fetched reviews
collection = lookup_db.get_collection("yelp_reviews")
collection.delete_many({})
collection.insert_many(list__review_documents)

In [None]:
# Fetch events daily

def fetch_events_daily():
    """Refresh the ``lookup.yelp_events`` collection once a day, forever.

    Each cycle searches every entry of ``locations`` for events in the
    "music", "performing-arts" and "sports-active-life" categories that start
    at or after the current time (6 pages of 40 results, sorted on
    popularity), replaces the contents of the collection with the result and
    then sleeps for 86400 seconds.

    The function never returns, so it is meant to be run in its own thread.
    Errors raised by the event queries are not caught and end the loop.
    """
    while True:
        list__events_documents = []

        for location in locations:
            for page in range(6):
                response = yelp_api.event_search_query(location=location['value'], sort_on="popularity", start_date=int(time.time()), limit=40, offset=(page*40), categories=["music", "performing-arts", "sports-active-life"])
                list__events_documents.extend(response['events'])

        # insert_many rejects an empty list, and wiping the collection when
        # nothing was fetched would only lose data, so skip the write then.
        if list__events_documents:
            collection = client["lookup"]["yelp_events"]
            collection.delete_many({})
            collection.insert_many(list__events_documents)

        time.sleep(86400)

In [None]:
# Fetch different categories
headers = {'Authorization': 'Bearer {}'.format(API2)}
categories_api_url = 'https://api.yelp.com/v3/categories'

response = requests.get(categories_api_url, headers=headers, timeout=5)
# Fail with the HTTP error itself instead of a KeyError on the JSON below.
response.raise_for_status()
jsonData = response.json()

# Keep only the categories whose alias is one of the aliases searched above.
wanted_aliases = set(alias)
list__category_documents = [
    category
    for category in jsonData['categories']
    if category['alias'] in wanted_aliases
]

In [None]:
# Update MongoDB

# Replace the contents of the yelp_categories collection with the matched categories
collection = lookup_db.get_collection("yelp_categories")
collection.delete_many({})
collection.insert_many(list__category_documents)

In [None]:
# Create threads to run both methods simultaneously

# PS: Running both loops at once can exhaust the API's daily request limit, after which the API calls raise errors

# Thread(target = fetch_businesses_hourly).start()
# Thread(target = fetch_events_daily).start()