In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import yelp_fusion_api as yfa
%load_ext autoreload
%autoreload 2

# Helper Functions

In [3]:
# Obtain the bearer token through the project-local yelp_fusion_api helper;
# it is passed to the scraping call later in the notebook.
# NOTE(review): presumably this performs a network request — confirm in yfa.
bearer_token = yfa.obtain_bearer_token(yfa.API_HOST, yfa.TOKEN_PATH)

In [39]:
def emit_relevant_yelp_fields(json):
    """Yield one flat record per entry in ``json['businesses']``.

    Parameters
    ----------
    json : dict
        Mapping with a ``'businesses'`` list. Each business must provide
        ``'location'`` (with ``'display_address'`` and ``'zip_code'``),
        ``'name'``, ``'coordinates'`` (with ``'latitude'`` and
        ``'longitude'``) and ``'categories'`` (dicts carrying ``'alias'``).

    Yields
    ------
    dict
        Keys ``address``, ``name``, ``zipcode``, ``categories``,
        ``latitude``, ``longitude``. ``address`` is the display-address
        parts joined with ``', '``; ``categories`` is the set of aliases.

    Raises
    ------
    KeyError
        If any of the keys listed above is missing.
    """
    for biz in json['businesses']:
        where = biz['location']
        street, postal = ', '.join(where['display_address']), where['zip_code']
        title = biz['name']
        point = biz['coordinates']
        lat = point['latitude']
        lng = point['longitude']
        # A set collapses repeated aliases for the same business.
        aliases = {entry['alias'] for entry in biz['categories']}
        yield dict(
            address=street,
            name=title,
            zipcode=postal,
            categories=aliases,
            latitude=lat,
            longitude=lng,
        )

In [40]:
def emit_all_yelp_fields(bearer_token, zipcodes, max_results=150):
    """Yield flattened business records for every response from ``yfa.emit_all``.

    ``bearer_token``, ``zipcodes`` and ``max_results`` are forwarded to
    ``yfa.emit_all`` unchanged; each response it produces is expanded with
    ``emit_relevant_yelp_fields``.
    """
    responses = yfa.emit_all(bearer_token, zipcodes, max_results=max_results)
    for response in responses:
        yield from emit_relevant_yelp_fields(response)

# Generate List of Zip Codes

In [41]:
# Zip-code table (tab-separated); joined on its 'Zip Code' column below.
df_zip = pd.read_csv('manhattan_zip_codes.tsv', sep='\t')
# Population table. The header is replaced positionally, so this assumes the
# CSV has exactly two columns in (zip, population) order — TODO confirm.
df_pop = pd.read_csv('census_pop.csv')
df_pop.columns = ['Zip Code', 'Population']
# Inner join: keep only the zip codes that appear in both tables.
df_populated_zip = df_pop.merge(df_zip, on='Zip Code', how='inner')

# Scrape

In [93]:
# NOTE(review): in the visible notebook df_yelp is first assigned in the scrape
# cell further down, and this cell has a later execution count (In [93] vs
# In [43]). On a fresh top-to-bottom run this line raises NameError; run it
# after the scrape.
df_yelp.columns

Index(['address', 'categories', 'latitude', 'longitude', 'name', 'zipcode'], dtype='object')

In [43]:
# Scrape: build one row per business record yielded for the zip codes in
# df_populated_zip. The recorded output shows repeated queries against the
# Yelp search endpoint, so expect this cell to be slow to re-run.
df_yelp = pd.DataFrame(emit_all_yelp_fields(bearer_token, df_populated_zip['Zip Code'], max_results=150))

Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying https://api.yelp.com/v3/businesses/search ...
Querying h

In [49]:
# Persist the raw scrape. to_csv writes the row index as an extra leading
# column by default, and the 'categories' sets are written as their string repr.
df_yelp.to_csv('df_yelp.csv')

# Cleanup

In [110]:
def cleanup(df_yelp, df_zip):
    """Reduce scraped rows to de-duplicated grocery stores in known zip codes.

    Steps, in order:

    1. drop rows that repeat an ``address`` (addresses are assumed to be
       unique per store; the first occurrence is kept),
    2. keep rows whose ``categories`` value contains ``'grocery'``,
    3. drop rows whose ``zipcode`` is the empty string or null,
    4. add an integer ``'Zip Code'`` column and inner-merge with ``df_zip``,
       which removes every row whose zip code is not listed there.

    Parameters
    ----------
    df_yelp : pandas.DataFrame
        Must contain ``address``, ``categories`` and ``zipcode`` columns.
        It is not modified.
    df_zip : pandas.DataFrame
        Must contain a ``'Zip Code'`` column; its other columns are carried
        into the result by the merge.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with the extra ``'Zip Code'`` column plus the
        columns of ``df_zip``.

    Raises
    ------
    ValueError
        If a remaining ``zipcode`` value cannot be converted to ``int``.
    """
    # eliminate duplicates, assume addresses are unique
    # (drop_duplicates returns a new frame, so no explicit copy is needed)
    df = df_yelp.drop_duplicates(subset='address')

    # eliminate non-groceries; astype(bool) keeps this a boolean mask even
    # when the frame is empty and apply() hands back an object-dtype Series
    is_grocery = df['categories'].apply(lambda cats: 'grocery' in cats).astype(bool)
    df = df[is_grocery]

    # eliminate stores with no zipcode
    missing_zip = (df['zipcode'] == '') | df['zipcode'].isnull()
    df = df[~missing_zip]

    # eliminate zip codes that are not listed in df_zip.
    # assign() builds a new frame instead of writing a column into the
    # filtered slice, which is what triggered SettingWithCopyWarning.
    df = df.assign(**{'Zip Code': df['zipcode'].astype(int)})
    return df.merge(df_zip, how='inner', on='Zip Code')

In [115]:
# Apply the cleanup rules to the raw scrape; df_yelp itself is not modified
# by cleanup, so this cell can be re-run safely.
df = cleanup(df_yelp, df_zip)

In [118]:
# Persist the cleaned table (the row index is written as well by default).
df.to_csv('df_yelp_cleaned_up.csv')

In [126]:
# Spot check: show every cleaned row whose name contains 'Associated'.
# str.contains treats the pattern as a regex by default; this pattern has no
# special characters, so it behaves as a plain substring match.
df[df['name'].str.contains('Associated')]

Unnamed: 0,address,categories,latitude,longitude,name,zipcode,Zip Code,Neighborhood
120,"917 9th Ave, New York, NY 10019",{grocery},40.768842,-73.985245,Associated Supermarkets,10019,10019,West Midtown
157,"409 E 14th St, New York, NY 10009","{markets, grocery}",40.73139,-73.981937,Associated Supermarkets,10009,10009,East Greenwich Village
163,"123 Ave C, New York, NY 10009",{grocery},40.724588,-73.978674,Associated Supermarkets,10009,10009,East Greenwich Village
398,"755 Amsterdam Ave, New York, NY 10025",{grocery},40.794407,-73.969911,Associated Supermarket,10025,10025,Upper West Side
402,"13 W 100th St, New York, NY 10025",{grocery},40.794678,-73.963348,Associated Supermarket,10025,10025,Upper West Side
444,"2296 Frederick Douglass Boulevard, New York, N...",{grocery},40.809091,-73.951892,Associated,10027,10027,Morningside Heights
451,"2212 3rd Ave, New York, NY 10035",{grocery},40.801048,-73.938163,Associated Supermarket,10035,10035,East Harlem
483,"3871 Broadway, New York, NY 10032",{grocery},40.83697,-73.942923,Associated Supermarket,10032,10032,Washington Heights
488,"448 Malcolm X Blvd, New York, NY 10037",{grocery},40.812362,-73.941987,Associated Supermarket,10037,10037,Harlem
510,"592 Fort Washington Ave, New York, NY 10033",{grocery},40.85489,-73.93664,Associated Supermarket,10033,10033,Washington Heights
