In [60]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import ast

### Reviews YELP

In [3]:
file_path = './review_yelp.json'
# Specify the number of rows to read per batch (e.g., 250000 rows)
batch_size = 250000

# Create an empty list to store the loaded DataFrames
dfs = []

# Read the JSON file in batches using read_json with chunksize
for chunk in pd.read_json(file_path, lines=True, chunksize=batch_size):
    dfs.append(chunk)

# Concatenate the list of DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

In [4]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 480.0+ MB


In [54]:
df['date'] = pd.to_datetime(df['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 480.0+ MB


In [55]:
# Comments

df['comment'] = ''
df.loc[(df['funny'] == 1), 'comment'] = 'Funny'
df.loc[(df['useful'] == 1), 'comment'] = 'Useful'
df.loc[(df['cool'] == 1), 'comment'] = 'Cool'
df.loc[(df['funny'] == 1) & (df['useful'] == 1), 'comment'] = 'Funny Useful'
df.loc[(df['funny'] == 1) & (df['cool'] == 1), 'comment'] = 'Funny Cool'
df.loc[(df['useful'] == 1) & (df['cool'] == 1), 'comment'] = 'Useful Cool'
df.loc[(df['funny'] == 1) & (df['useful'] == 1) & (df['cool'] == 1), 'comment'] = 'Funny Useful Cool'


In [56]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,comment
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,Useful Cool
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Useful Cool
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Useful Cool


In [59]:
# Drop columns

columns_to_drop = ['useful','funny','cool']

df = df.drop(columns_to_drop, axis=1)

df.head()

Unnamed: 0,review_id,user_id,business_id,stars,text,date,comment
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,Useful Cool
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Useful Cool
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Useful Cool


In [62]:
df = df.rename(columns={"stars": "rating"})
df.head()

Unnamed: 0,review_id,user_id,business_id,rating,text,date,comment
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,Useful Cool
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Useful Cool
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Useful Cool


### Business Table

In [18]:
# Read the pickle file using read_pickle
df_stores = pd.read_pickle('business.pkl')

In [8]:
df_stores.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,state.1,postal_code.1,latitude.1,longitude.1,stars.1,review_count.1,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,...,,,,,,,,,,
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,...,,,,,,,,,,
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,...,,,,,,,,,,
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,...,,,,,,,,,,
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,...,,,,,,,,,,


In [11]:
df_stores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150346 entries, 0 to 150345
Data columns (total 28 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   business_id   150346 non-null  object
 1   name          150346 non-null  object
 2   address       150346 non-null  object
 3   city          150346 non-null  object
 4   state         150343 non-null  object
 5   postal_code   150346 non-null  object
 6   latitude      150346 non-null  object
 7   longitude     150346 non-null  object
 8   stars         150346 non-null  object
 9   review_count  150346 non-null  object
 10  is_open       150346 non-null  object
 11  attributes    136602 non-null  object
 12  categories    150243 non-null  object
 13  hours         127123 non-null  object
 14  business_id   5 non-null       object
 15  name          5 non-null       object
 16  address       5 non-null       object
 17  city          5 non-null       object
 18  state         5 non-null     

In [19]:
# Assuming your DataFrame is named 'df'
df_stores = df_stores.iloc[:, :14]  # Select the first 15 columns using iloc

In [16]:
df_stores.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [23]:
# Filtering the States of interest
states = ['NV','TX','CA','FL','NY']

df_stores = df_stores[df_stores['state'].isin(states)]

In [24]:
df_stores.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,34639,28.190459,-82.45738,3.5,6,1,"{'RestaurantsPriceRange2': '2', 'BikeParking':...","Department Stores, Shopping, Fashion","{'Monday': '9:30-21:30', 'Tuesday': '9:30-21:3..."
13,jaxMSoInw8Poo3XeMJt8lQ,Adams Dental,15 N Missouri Ave,Clearwater,FL,33755,27.966235,-82.787412,5.0,10,1,{'ByAppointmentOnly': 'True'},"General Dentistry, Dentists, Health & Medical,...","{'Monday': '7:30-15:30', 'Tuesday': '7:30-15:3..."
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
16,rBmpy_Y1UbBx8ggHlyb7hA,Arizona Truck Outfitters,625 N Stone Ave,Tucson,FL,85705,32.229872,-110.972342,4.5,10,1,"{'DriveThru': 'False', 'BusinessAcceptsCreditC...","Automotive, Auto Parts & Supplies, Auto Custom...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-17:0', '..."


In [31]:
df_stores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39228 entries, 3 to 150333
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   business_id   39228 non-null  object
 1   name          39228 non-null  object
 2   address       39228 non-null  object
 3   city          39228 non-null  object
 4   state         39228 non-null  object
 5   postal_code   39228 non-null  object
 6   latitude      39228 non-null  object
 7   longitude     39228 non-null  object
 8   stars         39228 non-null  object
 9   review_count  39228 non-null  object
 10  is_open       39228 non-null  object
 11  attributes    35613 non-null  object
 12  categories    39228 non-null  object
 13  hours         33103 non-null  object
dtypes: object(14)
memory usage: 4.5+ MB


In [28]:
df_stores = df_stores.dropna(subset=['categories'])

In [43]:
# Define the list of gastronomic tags to filter on
gastronomic_tags = ['food', 'restaurant', 'cafe', 'bar', 'Restaurant', 'Bar','Food']  # replace with your actual list of tags


# Use a nested loop to check if any of the gastronomic tags are in the category list for each row
is_gastronomic = []

df_stores['categories'] = df_stores['categories'].apply(lambda x: x.replace(', ', ',').split(','))


for categories in df_stores['categories']:
    found_tag = False
    for tag in categories:
        if tag.lower() in gastronomic_tags:
            found_tag = True
            break
    is_gastronomic.append(found_tag)

# Filter the DataFrame based on this condition
df_stores_food = df_stores[is_gastronomic]


In [42]:
df_stores['categories'].iloc[1].replace(', ',',').split(',')

['Department Stores', 'Shopping', 'Fashion']

In [44]:
df_stores_food.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7204 entries, 3 to 150321
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   business_id   7204 non-null   object
 1   name          7204 non-null   object
 2   address       7204 non-null   object
 3   city          7204 non-null   object
 4   state         7204 non-null   object
 5   postal_code   7204 non-null   object
 6   latitude      7204 non-null   object
 7   longitude     7204 non-null   object
 8   stars         7204 non-null   object
 9   review_count  7204 non-null   object
 10  is_open       7204 non-null   object
 11  attributes    7097 non-null   object
 12  categories    7204 non-null   object
 13  hours         6258 non-null   object
dtypes: object(14)
memory usage: 844.2+ KB


In [45]:
df_stores_food.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","[Food, Delis, Italian, Bakeries, Restaurants]","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
53,cVBxfMC4lp3DnocjYA3FHQ,Paws The Cat Cafe,10588 109 Street,Edmonton,FL,T5H 3B2,53.549633,-113.50878,5.0,20,0,"{'RestaurantsAttire': ''casual'', 'Restaurants...","[Coffee & Tea, Cafes, Pets, Restaurants, Pet A...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-21:0', ..."
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,FL,19104,39.954573,-75.194894,3.0,56,1,"{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...","[Restaurants, Automotive, Delis, Gas Stations,...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
88,LhpPSrulqVeTyJeK2xydvQ,Fresh Fruits & Salads,114 N 3rd St,Camden,CA,08102,39.94669,-75.123327,4.5,6,1,"{'BusinessParking': '{'garage': False, 'street...","[Juice Bars & Smoothies, Restaurants, Fruits &...","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ..."


In [49]:
df_stores_food['address_complete'] = (
    df_stores_food['address'].astype(str) + ', ' +
    df_stores_food['city'].astype(str) + ', ' +
    df_stores_food['state'].astype(str) + ' ' +
    df_stores_food['postal_code'].astype(str)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stores_food['address_complete'] = (


In [50]:
df_stores_food

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,address_complete
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","935 Race St, Philadelphia, CA 19107"
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","[Food, Delis, Italian, Bakeries, Restaurants]","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...","2575 E Bay Dr, Largo, FL 33771"
53,cVBxfMC4lp3DnocjYA3FHQ,Paws The Cat Cafe,10588 109 Street,Edmonton,FL,T5H 3B2,53.549633,-113.50878,5.0,20,0,"{'RestaurantsAttire': ''casual'', 'Restaurants...","[Coffee & Tea, Cafes, Pets, Restaurants, Pet A...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-21:0', ...","10588 109 Street, Edmonton, FL T5H 3B2"
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,FL,19104,39.954573,-75.194894,3.0,56,1,"{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...","[Restaurants, Automotive, Delis, Gas Stations,...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","3604 Chestnut St, Philadelphia, FL 19104"
88,LhpPSrulqVeTyJeK2xydvQ,Fresh Fruits & Salads,114 N 3rd St,Camden,CA,08102,39.94669,-75.123327,4.5,6,1,"{'BusinessParking': '{'garage': False, 'street...","[Juice Bars & Smoothies, Restaurants, Fruits &...","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ...","114 N 3rd St, Camden, CA 08102"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150247,uakBDIQ5uDIG9QsVSv3aHA,King of Tarts,"1414 E Old York Rd, Ste E",Warminster,FL,18974,40.228769,-75.094705,4.5,20,0,"{'RestaurantsPriceRange2': '3', 'BusinessAccep...","[Bakeries, Desserts, Food]","{'Tuesday': '8:0-15:0', 'Wednesday': '8:0-15:0...","1414 E Old York Rd, Ste E, Warminster, FL 18974"
150283,Y5gyxnQt44B3axgzrjttlw,Don Cruz Snowballs,4213 Williams Blvd,Kenner,FL,70065,30.033566,-90.23852,4.5,6,1,"{'RestaurantsTakeOut': 'True', 'DogsAllowed': ...","[Mexican, Shaved Ice, Restaurants, Food, Food ...","{'Monday': '14:0-21:0', 'Tuesday': '14:0-21:0'...","4213 Williams Blvd, Kenner, FL 70065"
150293,0UqeZTDBdV0uY3wesbLvYQ,ampm,1701 Victorian Ave,Sparks,CA,89431,39.534947,-119.765698,4.0,5,1,{'BusinessAcceptsCreditCards': 'True'},"[Convenience Stores, Food, Automotive, Gas Sta...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","1701 Victorian Ave, Sparks, CA 89431"
150306,wVxXRFf10zTTAs11nr4xeA,PrimoHoagies,6024 Ridge Ave,Philadelphia,CA,19128,40.032483,-75.21443,3.0,55,1,"{'NoiseLevel': 'u'average'', 'RestaurantsTakeO...","[Restaurants, Specialty Food, Food, Sandwiches...","{'Monday': '10:0-21:0', 'Tuesday': '10:0-21:0'...","6024 Ridge Ave, Philadelphia, CA 19128"


In [52]:
# Drop columns

columns_to_drop = ['address','city','postal_code','is_open']

df_stores_food = df_stores_food.drop(columns_to_drop, axis=1)

df_stores_food.head()

Unnamed: 0,business_id,name,state,latitude,longitude,stars,review_count,attributes,categories,hours,address_complete
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,CA,39.955505,-75.155564,4.0,80,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","935 Race St, Philadelphia, CA 19107"
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,FL,27.916116,-82.760461,4.5,100,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","[Food, Delis, Italian, Bakeries, Restaurants]","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...","2575 E Bay Dr, Largo, FL 33771"
53,cVBxfMC4lp3DnocjYA3FHQ,Paws The Cat Cafe,FL,53.549633,-113.50878,5.0,20,"{'RestaurantsAttire': ''casual'', 'Restaurants...","[Coffee & Tea, Cafes, Pets, Restaurants, Pet A...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-21:0', ...","10588 109 Street, Edmonton, FL T5H 3B2"
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,FL,39.954573,-75.194894,3.0,56,"{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...","[Restaurants, Automotive, Delis, Gas Stations,...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","3604 Chestnut St, Philadelphia, FL 19104"
88,LhpPSrulqVeTyJeK2xydvQ,Fresh Fruits & Salads,CA,39.94669,-75.123327,4.5,6,"{'BusinessParking': '{'garage': False, 'street...","[Juice Bars & Smoothies, Restaurants, Fruits &...","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ...","114 N 3rd St, Camden, CA 08102"


### Formating Yelp API responses

Mi idea seria ir consultando a Cassandra o MySQL para ir barriendo la tabla de locales y, asi, ir actualizando sus reviews.

Por ejemplo, si puedo hacer 400 consultas diarias. Tomar 40 locales y traer las 10 reviews mas nuevas. Luego al dia siguiente traer los siguientes 40 locales.

PD Aca uso un ejemplo del Json que devuelve YelpAPI para no gastar creditos.

In [67]:

import json

# Specify the path to your JSON file
example_file = './review_example.json'


# Load the JSON file into a dictionary
with open(example_file, 'r') as file:
    response = json.load(file)

reviews = response['reviews']

In [68]:
reviews

[{'id': 'zY1hO4VhjYj6ApRk3FOywA',
  'rating': 5,
  'text': "This place has such unique flavors and they were all pretty good. My sis and I couldn't decide on just like 1-2 flavors so we ended up getting like 5...",
  'time_created': '2023-08-07 16:17:57',
  'url': 'https://www.yelp.com/biz/merry-monarch-creamery-austin?adjust_creative=9XlpwIp1qnfNvM0eFgnFHw&hrid=zY1hO4VhjYj6ApRk3FOywA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_reviews&utm_source=9XlpwIp1qnfNvM0eFgnFHw',
  'user': {'id': 'xQqXx9jGf-YtGg43-BBndA',
   'image_url': 'https://s3-media2.fl.yelpcdn.com/photo/0OF2rVmME_B62WXLBGCs_g/o.jpg',
   'name': 'Dolly N.',
   'profile_url': 'https://www.yelp.com/user_details?userid=xQqXx9jGf-YtGg43-BBndA'}},
 {'id': 'gUsoPKPYqpwDpcZj_91EgQ',
  'rating': 5,
  'text': 'This place was wonderful. Came here after eating at another truck in the lot for dinner. \n The gal that helped us let me try two different types of ice...',
  'time_created': '2023-08-03 18:40:56',
  'url': 'https:/

In [72]:
dictionary = {'review_id' : [],
              'user_id': [],
              'business_name': [],
              'business_id':[],
              'Platform':[],
              'rating':[],
              'date': [],
              'text':[],
              'comment':[],
}

for review in reviews:
    dictionary['review_id'].append(review['id'])
    dictionary['user_id'].append(review['user']['id'])
    dictionary['business_name'].append('I have to get it from the request')
    dictionary['business_id'].append('I have to get it from the request')
    dictionary['date'].append(review['time_created'])
    dictionary['Platform'].append('Yelp')
    dictionary['rating'].append(review['rating'])
    dictionary['text'].append(review['text'])
    dictionary['comment'].append('')


df_yelp_api = pd.DataFrame(dictionary)


						

In [73]:
df_yelp_api.head()

Unnamed: 0,review_id,user_id,business_name,business_id,Platform,rating,date,text,comment
0,zY1hO4VhjYj6ApRk3FOywA,xQqXx9jGf-YtGg43-BBndA,I have to get it from the request,I have to get it from the request,Yelp,5,2023-08-07 16:17:57,This place has such unique flavors and they we...,
1,gUsoPKPYqpwDpcZj_91EgQ,324wpRmJv5zXNSqEjRR4rQ,I have to get it from the request,I have to get it from the request,Yelp,5,2023-08-03 18:40:56,This place was wonderful. Came here after eati...,
2,IcE46kt0n_UTKSWY5vBdqw,GZeRcprW27wmkIDZBeP74A,I have to get it from the request,I have to get it from the request,Yelp,5,2023-08-03 17:34:22,The Oreo miso lives up to the hype. A super cr...,
