# Imports and functions

In [1]:
import json

import googlemaps
import numpy as np
import pandas as pd

In [2]:
# Read the Google API key from a local file (kept out of the notebook itself).
# Surrounding whitespace is stripped because a key file saved by an editor
# usually ends with a newline, which would otherwise become part of the key
# sent with every request.
with open('key.data', 'r') as key_file:
    api_key = key_file.read().strip()

# Client used for the reverse-geocoding requests below.
gmaps = googlemaps.Client(key=api_key)

# Get neighborhood

In [3]:
# Search term that selects which data set is processed: it is substituted into
# the CSV paths read and written by this notebook (yelp_<term>.csv and
# yelp_<term>_address.csv).
# Other terms: restaurant, attraction, hotel
term = 'restaurant'

In [4]:
# Load the business records for the selected term.
csv_path = './../../data/yelp_{}.csv'.format(term)
df_csv = pd.read_csv(csv_path)

In [5]:
# Report how many business rows were loaded.
total_records = len(df_csv)
print('Total records: {}'.format(total_records))

Total records: 6064


In [6]:
df_addresses = None  # comment out to continue from last business id
start_again = True  # Set to False to continue from last business id

# Reverse-geocode every business that has coordinates and collect the address
# components of each result, one row per component, tagged with the business id.
# The per-result frames are gathered in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies everything accumulated so far on
# every iteration, which gets slower and slower as the table grows.
address_frames = [] if df_addresses is None else [df_addresses]
try:
    for i, row_series in df_csv.iterrows():
        row_id = row_series['id']
        if not start_again:
            # Resume mode: skip rows up to and including this business id,
            # then process everything after it.
            if row_id == '3dLJkahF7G_zhgqjGMbDJw':
                start_again = True
        else:
            row_lat = row_series['coordinates_latitude']
            row_long = row_series['coordinates_longitude']

            # Nothing to look up without both coordinates.
            if pd.isnull(row_lat) or pd.isnull(row_long):
                continue

            coordinate = {'latitude': row_lat, 'longitude': row_long}
            response = gmaps.reverse_geocode(latlng=coordinate, language='en')

            # Every result in the response contributes its own set of components.
            for r in response:
                df_address = pd.json_normalize(r['address_components'])
                df_address.insert(0, 'id', row_id)
                address_frames.append(df_address)

        if i % 10 == 0:
            print('Processing business #{}'.format(i + 1))
finally:
    # Runs on normal completion and when the loop is interrupted or a request
    # fails, so whatever was fetched so far is available in df_addresses for a
    # resumed run. df_addresses stays None when nothing was collected.
    if address_frames:
        df_addresses = pd.concat(address_frames, sort=False, ignore_index=True)

Processing business #1
Processing business #11
Processing business #21
Processing business #31
Processing business #41
Processing business #51
Processing business #61
Processing business #71
Processing business #81
Processing business #91
Processing business #101
Processing business #111
Processing business #121
Processing business #131
Processing business #141
Processing business #151
Processing business #161
Processing business #171
Processing business #181
Processing business #191
Processing business #201
Processing business #211
Processing business #221
Processing business #231
Processing business #241
Processing business #251
Processing business #261
Processing business #271
Processing business #281
Processing business #291
Processing business #301
Processing business #311
Processing business #321
Processing business #331
Processing business #341
Processing business #351
Processing business #361
Processing business #371
Processing business #381
Processing business #391
Processing 

Processing business #4901
Processing business #4911
Processing business #4921
Processing business #4931
Processing business #4941
Processing business #4951
Processing business #4961
Processing business #4971
Processing business #4981
Processing business #4991
Processing business #5001
Processing business #5011
Processing business #5021
Processing business #5031
Processing business #5041
Processing business #5051
Processing business #5061
Processing business #5071
Processing business #5081
Processing business #5091
Processing business #5101
Processing business #5111
Processing business #5121
Processing business #5131
Processing business #5141
Processing business #5151
Processing business #5161
Processing business #5171
Processing business #5181
Processing business #5191
Processing business #5201
Processing business #5211
Processing business #5221
Processing business #5231
Processing business #5241
Processing business #5251
Processing business #5261
Processing business #5271
Processing b

# Attach additional place data

In [27]:
# Create the columns that the next cell fills from the address components.
# They are created with object dtype because they will hold place names
# (strings): a plain `= np.nan` makes float64 columns, and assigning a string
# into a float64 column through .loc is an incompatible-dtype assignment that
# recent pandas versions warn about.
for place_column in ['neighborhood', 'city', 'county', 'state', 'postal_code', 'country']:
    df_csv[place_column] = pd.Series(np.nan, index=df_csv.index, dtype=object)

In [28]:
# (address component type, df_csv column) pairs, checked in this order.
# A component is routed to the first pair whose type appears in its `types`.
component_columns = [
    ('neighborhood', 'neighborhood'),
    ('locality', 'city'),
    ('administrative_area_level_2', 'county'),
    ('administrative_area_level_1', 'state'),
    ('postal_code', 'postal_code'),
    ('country', 'country'),
]

# Number of times each column received a value.
filled = {column: 0 for _, column in component_columns}

for _, component in df_addresses.iterrows():
    business_id = component['id']
    component_types = component['types']
    component_name = component['long_name']

    # Pick the target column for this component, if it is one we care about.
    target_column = next(
        (column for type_name, column in component_columns if type_name in component_types),
        None,
    )
    if target_column is None:
        continue

    # Only fill the column while it is still empty for this business, so the
    # first matching component encountered wins.
    same_business = df_csv['id'] == business_id
    if pd.isnull(df_csv.loc[same_business, target_column]).any():
        filled[target_column] += 1
        df_csv.loc[same_business, target_column] = component_name

# Keep the individual counters available under their usual names.
c_neighborhood = filled['neighborhood']
c_city = filled['city']
c_county = filled['county']
c_state = filled['state']
c_postal_code = filled['postal_code']
c_country = filled['country']

print('neighborhood: {}'.format(c_neighborhood))
print('city: {}'.format(c_city))
print('county: {}'.format(c_county))
print('state: {}'.format(c_state))
print('postal_code: {}'.format(c_postal_code))
print('country: {}'.format(c_country))
# NOTE(review): every other lookup in this notebook uses the 'id' column;
# confirm that df_csv really has a 'place_id' column.
print('unique record: {}'.format(df_csv['place_id'].nunique()))

neighborhood: 4290
city: 6059
county: 6059
state: 6059
postal_code: 6051
country: 6059


In [40]:
# Compare the number of businesses with the number of address-component rows.
total_records = len(df_csv)
total_addresses = len(df_addresses)
print('Total records: {}'.format(total_records))
print('Total adddresses scrapped: {}'.format(total_addresses))

Total records: 6064
Total adddresses scrapped: 354099


In [32]:
# Save the enriched business table. Note that this overwrites the same CSV the
# table was loaded from.
# The path is passed straight to to_csv so pandas opens the file itself; a text
# handle from open(..., 'w') without newline='' can produce doubled line
# endings on Windows and uses the platform's default encoding.
df_csv.to_csv('./../../data/yelp_{}.csv'.format(term), index=False)

In [20]:
# Save the collected address components (one row per component per business).
# The path is passed straight to to_csv so pandas opens the file itself; a text
# handle from open(..., 'w') without newline='' can produce doubled line
# endings on Windows and uses the platform's default encoding.
df_addresses.to_csv('./../../data/yelp_{}_address.csv'.format(term), index=False)