***
<font size=6 color='blue'>API Data Collection - Yelp</font>   
***  

**Notebook Scope:**  
This notebook includes code to retrieve ratings data from the [Yelp](https://yelp.com/) site's API. Data validation and transformation are also performed.

**Output:**  
The resulting data is saved to an Excel file for further analysis.
***  


# Notebook Setup
***

In [1]:
# Import libraries
import json
import os

import pandas as pd
import requests

In [2]:
# Cap displayed column text at 30 characters so wide text columns stay readable.
# The fully qualified option key is used instead of the abbreviated form.
pd.set_option('display.max_colwidth', 30)

***  
# Submit Search
***

In [3]:
# First page (limit=50) of the Yelp business search for Milwaukee, WI takeout restaurants.
url = 'https://api.yelp.com/v3/businesses/search?location=Milwaukee%2C%20WI&attributes=restaurants_takeout&sort_by=best_match&limit=50'
# The API key is read from the YELP_API_KEY environment variable so it is never
# stored in (or shared with) the notebook; a missing variable raises KeyError here.
headers = {'accept': 'application/json',
           'Authorization': 'Bearer ' + os.environ['YELP_API_KEY']}

# requests has no default timeout; set one so a stalled connection cannot hang the kernel
response = requests.get(url, headers=headers, timeout=30)

In [4]:
# HTTP status of the search request; 200 means the first page was returned successfully
response.status_code

200

***
# Process Response
***

In [5]:
# Load the results as JSON
# response.json() decodes straight from the response bytes and matches how the
# address-validation cell later in this notebook parses its responses.
results_json = response.json()

In [6]:
# Sanity-check the payload: pull the first business and show its name
first_business = results_json['businesses'][0]
first_business['name']

'St Paul Fish Company'

In [7]:
# Empty frame with one column per field collected from each Yelp business
result_columns = ['name', 'yelp_url', 'review_count', 'rating', 'price', 'categories', 'address']
rest_df = pd.DataFrame(columns=result_columns)

In [8]:
# Find out how many total results are available for retrieval
# (the value the API reports under the 'total' key of the search response)
total_results = results_json['total']
print(total_results)

2600


In [9]:
# Process first 50 rows
# Build every record first and create the frame in one step instead of growing it
# cell-by-cell with .at. Assigning (rather than appending) also makes this cell
# safe to re-run: it always yields exactly the first page of results.
first_page_records = [
    {
        'name': item['name'],
        'yelp_url': item['url'],
        'review_count': item['review_count'],
        'rating': item['rating'],
        'price': item.get('price', ''),  # 'price' may be absent from a record; store '' then
        'categories': [x['title'] for x in item['categories']],
        'address': ' '.join(item['location']['display_address']),
    }
    for item in results_json['businesses']
]
# dtype=object matches the empty frame created earlier; numeric conversion happens in Data Prep
rest_df = pd.DataFrame(first_page_records, columns=rest_df.columns, dtype=object)

In [10]:
# Preview dataframe (first five rows) after loading the first page of results
rest_df.head()

Unnamed: 0,name,yelp_url,review_count,rating,price,categories,address
0,St Paul Fish Company,https://www.yelp.com/biz/s...,1360,4.4,$$,"[Seafood, Seafood Markets]","400 N Water St Milwaukee, ..."
1,Odd Duck,https://www.yelp.com/biz/o...,949,4.5,$$,"[New American, Vegetarian,...","939 S 2nd St Milwaukee, WI..."
2,Blue's Egg,https://www.yelp.com/biz/b...,1964,4.5,$$,"[Breakfast & Brunch, Ameri...","317 N 76th St Milwaukee, W..."
3,Swingin' Door Exchange,https://www.yelp.com/biz/s...,1098,4.5,$$,"[New American, Pubs]",219 E Michigan St Milwauke...
4,La Merenda,https://www.yelp.com/biz/l...,756,4.4,$$,"[Tapas Bars, Tapas/Small P...",125 E National Ave Milwauk...


In [11]:
# Repeat process for the rest of the results
# Pages are requested 50 rows at a time, starting at offset 50 (the first page is
# already in rest_df). Note: re-running this cell on its own appends the pages again.
base_url = 'https://api.yelp.com/v3/businesses/search?location=Milwaukee%2C%20WI&attributes=restaurants_takeout&sort_by=best_match&limit=50'
page_size = 50
api_max = 200  # paging ceiling used by this notebook — presumably an API/plan limit, TODO confirm
additional_records = []

# Never page past the number of results the API reported as available
for offset in range(page_size, min(api_max, total_results), page_size):
    url = base_url + '&offset=' + str(offset)
    response = requests.get(url, headers=headers, timeout=30)
    # Fail with a clear HTTP error instead of a KeyError on an error payload
    response.raise_for_status()
    results_json = response.json()

    businesses = results_json.get('businesses', [])
    if not businesses:
        # An empty page means there is nothing further to retrieve
        break

    for item in businesses:
        additional_records.append({
            'name': item['name'],
            'yelp_url': item['url'],
            'review_count': item['review_count'],
            'rating': item['rating'],
            'price': item.get('price', ''),  # 'price' may be absent from a record; store '' then
            'categories': [x['title'] for x in item['categories']],
            'address': ' '.join(item['location']['display_address']),
        })

# Append all new rows in a single concat rather than enlarging the frame row by row
if additional_records:
    additional_df = pd.DataFrame(additional_records, columns=rest_df.columns, dtype=object)
    rest_df = pd.concat([rest_df, additional_df], ignore_index=True)

***
# Data Prep
***

In [12]:
# Preview data (first five rows) now that all retrieved pages are in rest_df
rest_df.head()

Unnamed: 0,name,yelp_url,review_count,rating,price,categories,address
0,St Paul Fish Company,https://www.yelp.com/biz/s...,1360,4.4,$$,"[Seafood, Seafood Markets]","400 N Water St Milwaukee, ..."
1,Odd Duck,https://www.yelp.com/biz/o...,949,4.5,$$,"[New American, Vegetarian,...","939 S 2nd St Milwaukee, WI..."
2,Blue's Egg,https://www.yelp.com/biz/b...,1964,4.5,$$,"[Breakfast & Brunch, Ameri...","317 N 76th St Milwaukee, W..."
3,Swingin' Door Exchange,https://www.yelp.com/biz/s...,1098,4.5,$$,"[New American, Pubs]",219 E Michigan St Milwauke...
4,La Merenda,https://www.yelp.com/biz/l...,756,4.4,$$,"[Tapas Bars, Tapas/Small P...",125 E National Ave Milwauk...


In [13]:
# List the columns that contain at least one NaN value.
# Missing prices were stored as '' during collection, so they will not appear here.
has_nan_by_column = rest_df.isna().any()
rest_df.columns[has_nan_by_column].to_list()

[]

In [14]:
# Verify all URLs start with https://www.yelp.com/biz/ by showing any rows that do not
# (an empty result means every URL has the expected prefix)
expected_prefix = 'https://www.yelp.com/biz/'
has_expected_prefix = rest_df['yelp_url'].str.startswith(expected_prefix)
rest_df[has_expected_prefix.eq(False)]

Unnamed: 0,name,yelp_url,review_count,rating,price,categories,address


In [15]:
# Convert review_count to int and check range of values
rest_df['review_count'] = rest_df['review_count'].astype(int)
# Double-quoted f-strings: reusing the outer quote character inside {...} is a
# SyntaxError on Python versions before 3.12.
print(f"Lowest review count: {rest_df['review_count'].min()}")
print(f"Highest review count: {rest_df['review_count'].max()}")

Lowest review count: 27
Highest review count: 1964


In [16]:
# Convert rating to float and verify values are between 0 and 5
rest_df['rating'] = rest_df['rating'].astype(float)
# Enforce the stated range instead of relying on someone reading the printed values
assert rest_df['rating'].between(0, 5).all(), 'rating outside the expected 0-5 range'
# Double-quoted f-strings: reusing the outer quote character inside {...} is a
# SyntaxError on Python versions before 3.12.
print(f"Lowest rating: {rest_df['rating'].min()}")
print(f"Highest rating: {rest_df['rating'].max()}")

Lowest rating: 3.5
Highest rating: 4.8


In [17]:
# Convert categories to a string
# Only values that are still lists are joined, so re-running this cell leaves
# already-joined strings untouched (.str.join on a string would insert ', '
# between every character).
rest_df['categories'] = rest_df['categories'].map(
    lambda cats: ', '.join(cats) if isinstance(cats, list) else cats
)

***
# Address Validation
***

In [18]:
# Validate address using smarty address validation api
# Credentials come from the SMARTY_AUTH_ID / SMARTY_AUTH_TOKEN environment variables
# so they are never stored in the notebook; a missing variable raises KeyError here.
base_url = 'https://us-street.api.smarty.com/street-address'
auth_id = os.environ['SMARTY_AUTH_ID']
auth_token = os.environ['SMARTY_AUTH_TOKEN']
inv_count = 0

# NOTE(review): only the first five rows (.head()) are checked, so the summary message
# below refers to that sample — iterate over the full column to validate every address.
for address in rest_df['address'].head():
    # Let requests build the query string so commas, '#', '&' etc. in an address are
    # URL-encoded correctly instead of only replacing spaces by hand.
    query = {'street': address, 'auth-id': auth_id, 'auth-token': auth_token}
    response = requests.get(base_url, params=query, timeout=30)
    # Surface auth/quota failures instead of mistaking an error payload for a match
    response.raise_for_status()
    candidates = response.json()
    # An empty candidate list is treated as "address not recognised"
    if candidates == []:
        print(address)
        inv_count += 1

if inv_count == 0:
    print('All addresses are valid')

All addresses are valid


***
# Write to Excel
***

In [19]:
# Persist the cleaned table for downstream analysis; the positional index is not written
rest_df.to_excel(excel_writer='Restaurants.xlsx', index=False)

***
**End**
***