In [32]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import ast

### Reviews YELP

In [3]:
file_path = './review_yelp.json'
# Specify the number of rows to read per batch (e.g., 250000 rows)
batch_size = 250000

# Create an empty list to store the loaded DataFrames
dfs = []

# Read the JSON file in batches using read_json with chunksize
for chunk in pd.read_json(file_path, lines=True, chunksize=batch_size):
    dfs.append(chunk)

# Concatenate the list of DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

In [4]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 480.0+ MB


In [54]:
df['date'] = pd.to_datetime(df['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 480.0+ MB


In [55]:
# Comments

df['comment'] = ''
df.loc[(df['funny'] == 1), 'comment'] = 'Funny'
df.loc[(df['useful'] == 1), 'comment'] = 'Useful'
df.loc[(df['cool'] == 1), 'comment'] = 'Cool'
df.loc[(df['funny'] == 1) & (df['useful'] == 1), 'comment'] = 'Funny Useful'
df.loc[(df['funny'] == 1) & (df['cool'] == 1), 'comment'] = 'Funny Cool'
df.loc[(df['useful'] == 1) & (df['cool'] == 1), 'comment'] = 'Useful Cool'
df.loc[(df['funny'] == 1) & (df['useful'] == 1) & (df['cool'] == 1), 'comment'] = 'Funny Useful Cool'


In [56]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,comment
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,Useful Cool
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Useful Cool
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Useful Cool


In [59]:
# Drop columns

columns_to_drop = ['useful','funny','cool']

df = df.drop(columns_to_drop, axis=1)

df.head()

Unnamed: 0,review_id,user_id,business_id,stars,text,date,comment
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,Useful Cool
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Useful Cool
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Useful Cool


In [62]:
df = df.rename(columns={"stars": "rating"})
df.head()

Unnamed: 0,review_id,user_id,business_id,rating,text,date,comment
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,Useful Cool
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,Useful Cool
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,Useful Cool


### Business Table

In [2]:
# Read the pickle file using read_pickle
df_stores = pd.read_pickle('business.pkl')

In [8]:
df_stores.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,state.1,postal_code.1,latitude.1,longitude.1,stars.1,review_count.1,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,...,,,,,,,,,,
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,...,,,,,,,,,,
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,...,,,,,,,,,,
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,...,,,,,,,,,,
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,...,,,,,,,,,,


In [11]:
df_stores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150346 entries, 0 to 150345
Data columns (total 28 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   business_id   150346 non-null  object
 1   name          150346 non-null  object
 2   address       150346 non-null  object
 3   city          150346 non-null  object
 4   state         150343 non-null  object
 5   postal_code   150346 non-null  object
 6   latitude      150346 non-null  object
 7   longitude     150346 non-null  object
 8   stars         150346 non-null  object
 9   review_count  150346 non-null  object
 10  is_open       150346 non-null  object
 11  attributes    136602 non-null  object
 12  categories    150243 non-null  object
 13  hours         127123 non-null  object
 14  business_id   5 non-null       object
 15  name          5 non-null       object
 16  address       5 non-null       object
 17  city          5 non-null       object
 18  state         5 non-null     

In [3]:
# Assuming your DataFrame is named 'df'
df_stores = df_stores.iloc[:, :14]  # Select the first 15 columns using iloc

In [16]:
df_stores.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [4]:
# Filtering the States of interest
states = ['NV','TX','CA','FL','NY']

df_stores = df_stores[df_stores['state'].isin(states)]

In [24]:
df_stores.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,34639,28.190459,-82.45738,3.5,6,1,"{'RestaurantsPriceRange2': '2', 'BikeParking':...","Department Stores, Shopping, Fashion","{'Monday': '9:30-21:30', 'Tuesday': '9:30-21:3..."
13,jaxMSoInw8Poo3XeMJt8lQ,Adams Dental,15 N Missouri Ave,Clearwater,FL,33755,27.966235,-82.787412,5.0,10,1,{'ByAppointmentOnly': 'True'},"General Dentistry, Dentists, Health & Medical,...","{'Monday': '7:30-15:30', 'Tuesday': '7:30-15:3..."
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
16,rBmpy_Y1UbBx8ggHlyb7hA,Arizona Truck Outfitters,625 N Stone Ave,Tucson,FL,85705,32.229872,-110.972342,4.5,10,1,"{'DriveThru': 'False', 'BusinessAcceptsCreditC...","Automotive, Auto Parts & Supplies, Auto Custom...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-17:0', '..."


In [31]:
df_stores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39228 entries, 3 to 150333
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   business_id   39228 non-null  object
 1   name          39228 non-null  object
 2   address       39228 non-null  object
 3   city          39228 non-null  object
 4   state         39228 non-null  object
 5   postal_code   39228 non-null  object
 6   latitude      39228 non-null  object
 7   longitude     39228 non-null  object
 8   stars         39228 non-null  object
 9   review_count  39228 non-null  object
 10  is_open       39228 non-null  object
 11  attributes    35613 non-null  object
 12  categories    39228 non-null  object
 13  hours         33103 non-null  object
dtypes: object(14)
memory usage: 4.5+ MB


In [5]:
df_stores = df_stores.dropna(subset=['categories'])

In [6]:
# Define the list of gastronomic tags to filter on
gastronomic_tags = ['food', 'restaurant', 'cafe', 'bar', 'Restaurant', 'Bar','Food']  # replace with your actual list of tags


# Use a nested loop to check if any of the gastronomic tags are in the category list for each row
is_gastronomic = []

df_stores['categories'] = df_stores['categories'].apply(lambda x: x.replace(', ', ',').split(','))


for categories in df_stores['categories']:
    found_tag = False
    for tag in categories:
        if tag.lower() in gastronomic_tags:
            found_tag = True
            break
    is_gastronomic.append(found_tag)

# Filter the DataFrame based on this condition
df_stores_food = df_stores[is_gastronomic]


In [7]:
df_stores['categories'].iloc[1].replace(', ',',').split(',')

AttributeError: 'list' object has no attribute 'replace'

In [44]:
df_stores_food.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7204 entries, 3 to 150321
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   business_id   7204 non-null   object
 1   name          7204 non-null   object
 2   address       7204 non-null   object
 3   city          7204 non-null   object
 4   state         7204 non-null   object
 5   postal_code   7204 non-null   object
 6   latitude      7204 non-null   object
 7   longitude     7204 non-null   object
 8   stars         7204 non-null   object
 9   review_count  7204 non-null   object
 10  is_open       7204 non-null   object
 11  attributes    7097 non-null   object
 12  categories    7204 non-null   object
 13  hours         6258 non-null   object
dtypes: object(14)
memory usage: 844.2+ KB


In [8]:
df_stores_food.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","[Food, Delis, Italian, Bakeries, Restaurants]","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
53,cVBxfMC4lp3DnocjYA3FHQ,Paws The Cat Cafe,10588 109 Street,Edmonton,FL,T5H 3B2,53.549633,-113.50878,5.0,20,0,"{'RestaurantsAttire': ''casual'', 'Restaurants...","[Coffee & Tea, Cafes, Pets, Restaurants, Pet A...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-21:0', ..."
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,FL,19104,39.954573,-75.194894,3.0,56,1,"{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...","[Restaurants, Automotive, Delis, Gas Stations,...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
88,LhpPSrulqVeTyJeK2xydvQ,Fresh Fruits & Salads,114 N 3rd St,Camden,CA,08102,39.94669,-75.123327,4.5,6,1,"{'BusinessParking': '{'garage': False, 'street...","[Juice Bars & Smoothies, Restaurants, Fruits &...","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ..."


In [9]:
df_stores_food['address_complete'] = (
    df_stores_food['address'].astype(str) + ', ' +
    df_stores_food['city'].astype(str) + ', ' +
    df_stores_food['state'].astype(str) + ' ' +
    df_stores_food['postal_code'].astype(str)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stores_food['address_complete'] = (


In [50]:
df_stores_food

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,address_complete
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","935 Race St, Philadelphia, CA 19107"
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","[Food, Delis, Italian, Bakeries, Restaurants]","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...","2575 E Bay Dr, Largo, FL 33771"
53,cVBxfMC4lp3DnocjYA3FHQ,Paws The Cat Cafe,10588 109 Street,Edmonton,FL,T5H 3B2,53.549633,-113.50878,5.0,20,0,"{'RestaurantsAttire': ''casual'', 'Restaurants...","[Coffee & Tea, Cafes, Pets, Restaurants, Pet A...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-21:0', ...","10588 109 Street, Edmonton, FL T5H 3B2"
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,FL,19104,39.954573,-75.194894,3.0,56,1,"{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...","[Restaurants, Automotive, Delis, Gas Stations,...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","3604 Chestnut St, Philadelphia, FL 19104"
88,LhpPSrulqVeTyJeK2xydvQ,Fresh Fruits & Salads,114 N 3rd St,Camden,CA,08102,39.94669,-75.123327,4.5,6,1,"{'BusinessParking': '{'garage': False, 'street...","[Juice Bars & Smoothies, Restaurants, Fruits &...","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ...","114 N 3rd St, Camden, CA 08102"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150247,uakBDIQ5uDIG9QsVSv3aHA,King of Tarts,"1414 E Old York Rd, Ste E",Warminster,FL,18974,40.228769,-75.094705,4.5,20,0,"{'RestaurantsPriceRange2': '3', 'BusinessAccep...","[Bakeries, Desserts, Food]","{'Tuesday': '8:0-15:0', 'Wednesday': '8:0-15:0...","1414 E Old York Rd, Ste E, Warminster, FL 18974"
150283,Y5gyxnQt44B3axgzrjttlw,Don Cruz Snowballs,4213 Williams Blvd,Kenner,FL,70065,30.033566,-90.23852,4.5,6,1,"{'RestaurantsTakeOut': 'True', 'DogsAllowed': ...","[Mexican, Shaved Ice, Restaurants, Food, Food ...","{'Monday': '14:0-21:0', 'Tuesday': '14:0-21:0'...","4213 Williams Blvd, Kenner, FL 70065"
150293,0UqeZTDBdV0uY3wesbLvYQ,ampm,1701 Victorian Ave,Sparks,CA,89431,39.534947,-119.765698,4.0,5,1,{'BusinessAcceptsCreditCards': 'True'},"[Convenience Stores, Food, Automotive, Gas Sta...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","1701 Victorian Ave, Sparks, CA 89431"
150306,wVxXRFf10zTTAs11nr4xeA,PrimoHoagies,6024 Ridge Ave,Philadelphia,CA,19128,40.032483,-75.21443,3.0,55,1,"{'NoiseLevel': 'u'average'', 'RestaurantsTakeO...","[Restaurants, Specialty Food, Food, Sandwiches...","{'Monday': '10:0-21:0', 'Tuesday': '10:0-21:0'...","6024 Ridge Ave, Philadelphia, CA 19128"


In [52]:
# Drop columns

columns_to_drop = ['address','city','postal_code','is_open']

df_stores_food = df_stores_food.drop(columns_to_drop, axis=1)

df_stores_food.head()

Unnamed: 0,business_id,name,state,latitude,longitude,stars,review_count,attributes,categories,hours,address_complete
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,CA,39.955505,-75.155564,4.0,80,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","935 Race St, Philadelphia, CA 19107"
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,FL,27.916116,-82.760461,4.5,100,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","[Food, Delis, Italian, Bakeries, Restaurants]","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...","2575 E Bay Dr, Largo, FL 33771"
53,cVBxfMC4lp3DnocjYA3FHQ,Paws The Cat Cafe,FL,53.549633,-113.50878,5.0,20,"{'RestaurantsAttire': ''casual'', 'Restaurants...","[Coffee & Tea, Cafes, Pets, Restaurants, Pet A...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-21:0', ...","10588 109 Street, Edmonton, FL T5H 3B2"
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,FL,39.954573,-75.194894,3.0,56,"{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...","[Restaurants, Automotive, Delis, Gas Stations,...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","3604 Chestnut St, Philadelphia, FL 19104"
88,LhpPSrulqVeTyJeK2xydvQ,Fresh Fruits & Salads,CA,39.94669,-75.123327,4.5,6,"{'BusinessParking': '{'garage': False, 'street...","[Juice Bars & Smoothies, Restaurants, Fruits &...","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ...","114 N 3rd St, Camden, CA 08102"


### Formating Yelp API responses

Mi idea seria ir consultando a Cassandra o MySQL para ir barriendo la tabla de locales y, asi, ir actualizando sus reviews.

Por ejemplo, si puedo hacer 400 consultas diarias. Tomar 40 locales y traer las 10 reviews mas nuevas. Luego al dia siguiente traer los siguientes 40 locales.

PD Aca uso un ejemplo del Json que devuelve YelpAPI para no gastar creditos.

In [90]:

import json

# Specify the path to your JSON file
example_file = './review_example2.json'


# Load the JSON file into a dictionary
with open(example_file, 'r') as file:
    response = json.load(file)

reviews = response['reviews']

In [91]:
reviews

[{'id': 'VCg79Gi1UwHWhcdegm7faw',
  'rating': 5,
  'text': 'Go to Italian spot for takeout. Every meal they sell easily feeds two. I especially recommend the lasagna which is obviously homemade top to bottom and...',
  'time_created': '2018-10-20 12:15:23',
  'url': 'https://www.yelp.com/biz/zios-italian-market-largo-2?adjust_creative=9XlpwIp1qnfNvM0eFgnFHw&hrid=VCg79Gi1UwHWhcdegm7faw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_reviews&utm_source=9XlpwIp1qnfNvM0eFgnFHw',
  'user': {'id': 'p-T8R_IaiRZX_EMKdtZDrA',
   'image_url': 'https://s3-media2.fl.yelpcdn.com/photo/3T11jLZXGRXXMDLbINaDVg/o.jpg',
   'name': 'Matt -.',
   'profile_url': 'https://www.yelp.com/user_details?userid=p-T8R_IaiRZX_EMKdtZDrA'}},
 {'id': '2chTllwXOd6SmltvIynzGw',
  'rating': 1,
  'text': "Sadly it seems Zio's Italian market has closed for good. A good example of if it's not broken dont fix it",
  'time_created': '2019-01-19 10:05:47',
  'url': 'https://www.yelp.com/biz/zios-italian-market-largo-2?adjus

In [94]:
# From the Request
business_id = '0bPLkL0QhhPO5kt1_EXmNQ'
business_name = "Zio's Italian Market"


# Create dict
dictionary = {'review_id' : [],
              'user_id': [],
              'business_name': [],
              'business_id':[],
              'Platform':[],
              'rating':[],
              'date': [],
              'text':[],
              'comment':[],
}

for review in reviews:
    dictionary['review_id'].append(review['id'])
    dictionary['user_id'].append(review['user']['id'])
    dictionary['business_name'].append(business_id)
    dictionary['business_id'].append(business_name)
    dictionary['date'].append(review['time_created'])
    dictionary['Platform'].append('Yelp')
    dictionary['rating'].append(review['rating'])
    dictionary['text'].append(review['text'])
    dictionary['comment'].append('')


df_yelp_api = pd.DataFrame(dictionary)


						

In [95]:
df_yelp_api.head()

Unnamed: 0,review_id,user_id,business_name,business_id,Platform,rating,date,text,comment
0,VCg79Gi1UwHWhcdegm7faw,p-T8R_IaiRZX_EMKdtZDrA,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,Yelp,5,2018-10-20 12:15:23,Go to Italian spot for takeout. Every meal the...,
1,2chTllwXOd6SmltvIynzGw,RQjtzwpJOU3-_g_KxsbgUw,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,Yelp,1,2019-01-19 10:05:47,Sadly it seems Zio's Italian market has closed...,
2,2ws7mwZIMzjIeHT_Uxdakg,4ePfnwupsJplRb_TEkkhdg,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,Yelp,5,2017-02-13 12:12:28,I am here today from Atlanta. I was kinda bumm...,


That request was made with the following store: 'merry-monarch-creamery-austin'

In [86]:
# Read the pickle file using read_pickle
df_search = pd.read_pickle('business.pkl')
# Assuming your DataFrame is named 'df'
df_search = df_search.iloc[:, :14]  # Select the first 15 columns using iloc

In [89]:
stores_name = "Zio's Italian Market"
stores_matching_name = df_search[df_search['name'] == stores_name]
stores_matching_name.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."


### Getting Updated Reviews

The process would be as follows:

+ As there is a finite number of request we have available everyday. The idea would be to have some kind of flag indicating if the stores has already been updated. In this context, Updated means that new reviews from the local has been gotten from the Yelp API
+ So, we get a store with the flag updated in false, then, we request new reviews for the store to the API.
+ Then, we store the new reviews for the store.
+ Finally, we change the state of the flag updated

It may be possible that after a query we didn't get all the new reviews for a store. However, that seems unlikely as most locals have up to 100 rev

In [13]:
!pip install yelpapi

Collecting yelpapi
  Downloading yelpapi-2.5.0-py3-none-any.whl (7.4 kB)
Installing collected packages: yelpapi
Successfully installed yelpapi-2.5.0


You should consider upgrading via the 'C:\Users\Marco\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [14]:
from yelpapi import YelpAPI
import argparse
from pprint import pprint

In [11]:
df_stores_food['updated'] = False

df_stores_food.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stores_food['updated'] = False


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,address_complete,updated
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","935 Race St, Philadelphia, CA 19107",False
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","[Food, Delis, Italian, Bakeries, Restaurants]","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...","2575 E Bay Dr, Largo, FL 33771",False
53,cVBxfMC4lp3DnocjYA3FHQ,Paws The Cat Cafe,10588 109 Street,Edmonton,FL,T5H 3B2,53.549633,-113.50878,5.0,20,0,"{'RestaurantsAttire': ''casual'', 'Restaurants...","[Coffee & Tea, Cafes, Pets, Restaurants, Pet A...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-21:0', ...","10588 109 Street, Edmonton, FL T5H 3B2",False
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,FL,19104,39.954573,-75.194894,3.0,56,1,"{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...","[Restaurants, Automotive, Delis, Gas Stations,...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","3604 Chestnut St, Philadelphia, FL 19104",False
88,LhpPSrulqVeTyJeK2xydvQ,Fresh Fruits & Salads,114 N 3rd St,Camden,CA,08102,39.94669,-75.123327,4.5,6,1,"{'BusinessParking': '{'garage': False, 'street...","[Juice Bars & Smoothies, Restaurants, Fruits &...","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ...","114 N 3rd St, Camden, CA 08102",False


In [12]:
# Get the row to update

store_to_update = df_stores_food.loc[df_stores_food['updated'] == False].iloc[0]

store_to_update

business_id                                    MTSW4McQd7CbVtyjqoe9mw
name                                               St Honore Pastries
address                                                   935 Race St
city                                                     Philadelphia
state                                                              CA
postal_code                                                     19107
latitude                                                    39.955505
longitude                                                  -75.155564
stars                                                             4.0
review_count                                                       80
is_open                                                             1
attributes          {'RestaurantsDelivery': 'False', 'OutdoorSeati...
categories          [Restaurants, Food, Bubble Tea, Coffee & Tea, ...
hours               {'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...
address_complete    

In [15]:
# Get the Store ID 
store_id = store_to_update['business_id']

# Create Request
API_Key = 'Jmt0VGJfwKMUOHBlMQ4BzARUG7uqi8T_XkKps0v5KPNuxu4mz-5vwp9TQ8mMFa26ibZRjCxC0Ga39co9i9BbhzUBhC2MCdG9jBxQEaD7WGOrKMvTZo83GLXL_YraZHYx'
yelp_api = YelpAPI(API_Key)
response = yelp_api.reviews_query(id=store_id, limit = 50)
pprint(response)

{'possible_languages': ['en'],
 'reviews': [{'id': '2l02e-2QAYZnF0tbZz5a-A',
              'rating': 5,
              'text': 'Made a quick trip to Philly to get some eats! Of '
                      "course, knew we'd have to pick up some of the paper "
                      'wrapped sponge cakes from St. Honore. We called '
                      'ahead...',
              'time_created': '2022-07-25 09:43:43',
              'url': 'https://www.yelp.com/biz/st-honore-pastries-philadelphia?adjust_creative=9XlpwIp1qnfNvM0eFgnFHw&hrid=2l02e-2QAYZnF0tbZz5a-A&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_reviews&utm_source=9XlpwIp1qnfNvM0eFgnFHw',
              'user': {'id': 'CKBLVYZi8len88s1meKIoA',
                       'image_url': 'https://s3-media1.fl.yelpcdn.com/photo/fa8I6yUxyp6B4AOR1ABGLA/o.jpg',
                       'name': 'Jo M.',
                       'profile_url': 'https://www.yelp.com/user_details?userid=CKBLVYZi8len88s1meKIoA'}},
             {'id': 'hr4C7vsahxkie

In [16]:
# Parse Response
reviews = response['reviews']


# From the Request
business_id = store_to_update['business_id']
business_name = store_to_update['name']

# Create dict
dictionary = {'review_id' : [],
              'user_id': [],
              'name': [],
              'business_id':[],
              'Platform':[],
              'rating':[],
              'date': [],
              'text':[],
              'comment':[],
}

for review in reviews:
    dictionary['review_id'].append(review['id'])
    dictionary['user_id'].append(review['user']['id'])
    dictionary['name'].append(business_name)
    dictionary['business_id'].append(business_id)
    dictionary['date'].append(review['time_created'])
    dictionary['Platform'].append('Yelp')
    dictionary['rating'].append(review['rating'])
    dictionary['text'].append(review['text'])
    dictionary['comment'].append('')


df_yelp_api = pd.DataFrame(dictionary)

In [17]:
df_yelp_api.head()

Unnamed: 0,review_id,user_id,name,business_id,Platform,rating,date,text,comment
0,2l02e-2QAYZnF0tbZz5a-A,CKBLVYZi8len88s1meKIoA,St Honore Pastries,MTSW4McQd7CbVtyjqoe9mw,Yelp,5,2022-07-25 09:43:43,Made a quick trip to Philly to get some eats! ...,
1,hr4C7vsahxkieDJ9tqtm0A,-6GY04bTPM2Zo4z0GN4a1A,St Honore Pastries,MTSW4McQd7CbVtyjqoe9mw,Yelp,5,2021-11-01 11:22:07,The crispy roast pork is SO GOOD and lowkey it...,
2,khVt8RKpraoAwJg_fMjhIw,UHyquwvf_mI98eNsbIZbng,St Honore Pastries,MTSW4McQd7CbVtyjqoe9mw,Yelp,5,2021-09-26 11:36:55,The breads were SUPER SOFT. The egg custard in...,


REMAINS TO UPDATE THE CASSANDRA ROW!

In [1]:
!pip install cassandra-driver

Collecting cassandra-driver

You should consider upgrading via the 'C:\Users\Marco\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.



  Downloading cassandra_driver-3.28.0-cp310-cp310-win_amd64.whl (2.8 MB)
     ---------------------------------------- 2.8/2.8 MB 3.3 MB/s eta 0:00:00
Collecting geomet<0.3,>=0.1
  Downloading geomet-0.2.1.post1-py3-none-any.whl (18 kB)
Installing collected packages: geomet, cassandra-driver
Successfully installed cassandra-driver-3.28.0 geomet-0.2.1.post1


In [1]:
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement, dict_factory

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

cluster = Cluster(['186.87.6.161'], port='9042', protocol_version = 5) #IP del servidor y el puerto estandar de cassandra 9042
session = cluster.connect('henry')
session.row_factory = dict_factory

In [9]:
# Create historical Tracking Table

session.execute(
    """
    CREATE TABLE IF NOT EXISTS stores_Updates (
        business_id text,
        date_update text,
        updated boolean,
        PRIMARY KEY ((business_id))
        );
    """
)

<cassandra.cluster.ResultSet at 0x201c9caead0>

In [10]:
for row in session.execute("describe tables;"):
  print(row)

{'keyspace_name': 'henry', 'type': 'table', 'name': 'business'}
{'keyspace_name': 'henry', 'type': 'table', 'name': 'reviews'}
{'keyspace_name': 'henry', 'type': 'table', 'name': 'reviews_yelp'}
{'keyspace_name': 'henry', 'type': 'table', 'name': 'stores'}
{'keyspace_name': 'henry', 'type': 'table', 'name': 'stores2'}
{'keyspace_name': 'henry', 'type': 'table', 'name': 'stores_updates'}
{'keyspace_name': 'henry', 'type': 'table', 'name': 'tips'}


In [73]:
# Load some data into stores_updates
id_list = []
states = ['NV','TX','CA','FL','NY']
query = """ 
        SELECT * 
        FROM business
        LIMIT 20
        """
for row in session.execute(query):
  if row['state'] in states:
        print(row['categories'])
        id_list.append(row['business_id'])

id_list

SortedSet([' ', "'", ',', 'A', 'C', 'F', 'J', 'S', 'W', 'a', 'c', 'e', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'w', 'y'])
SortedSet([' ', ',', 'F', 'M', 'R', 'S', 'a', 'c', 'd', 'e', 'f', 'i', 'k', 'l', 'n', 'o', 'p', 'r', 's', 't', 'u', 'y'])
SortedSet([' ', ',', 'B', 'N', 'P', 'a', 'b', 'e', 'f', 'g', 'h', 'i', 'l', 'r', 's', 't', 'u'])


['nx8a-0rWqmkQDtYtXK0nhg', 'y0C_5eo2xI5vwVfeAIK8fg', 'pllHVuJtx8aM_RhUZ1MpNg']

In [17]:
# Load the three ids into stores_update

from datetime import date

current_date = date.today()
current_date = str(current_date)

statement = "INSERT INTO stores_updates(business_id, date_update, updated) VALUES (?,?,?)"
insert = session.prepare(statement)


for id_item in id_list:
    session.execute(insert, (id_item, current_date, True))

In [18]:
# Checking the Insert
for row in session.execute("SELECT * FROM stores_updates",):
  print(row)

{'business_id': 'nx8a-0rWqmkQDtYtXK0nhg', 'date_update': '2023-08-23', 'updated': True}
{'business_id': 'y0C_5eo2xI5vwVfeAIK8fg', 'date_update': '2023-08-23', 'updated': True}
{'business_id': 'pllHVuJtx8aM_RhUZ1MpNg', 'date_update': '2023-08-23', 'updated': True}


In [69]:
"""
Now I have to test that I can make a query getting Business_ID that:
1. Are not inserted in stores_updates
2. Are Inserted but have a False condition on 'Updated'
"""
# Initialize id_list
id_list = []

# Query the Table business
states = ['NV','TX','CA','FL','NY']

# Construct the query with the updated id_list
query = f""" 
        SELECT business_id,state 
        FROM business
        """

statement = SimpleStatement(query, fetch_size = 5000)
business_table_ids = session.execute(statement, timeout=None)

# Get information from stores_updates

dictionary = {'business_id' : [], 'updated':[] }
query = f""" 
        SELECT business_id,updated 
        FROM stores_updates
        """

statement = SimpleStatement(query, fetch_size = 5000)
updates_table = session.execute(statement, timeout=None)

for row in updates_table:
    dictionary['business_id'].append(row['business_id'])
    dictionary['updated'].append(row['updated'])

df_updates = pd.DataFrame(dictionary)


for row in business_table_ids:
    if row['state'] in states:
        # Check in stores_updates
        if row['business_id'] not in list(df_updates['business_id']):
            id_list.append(row['business_id'])
        else:
            # Check if the store is updated
            cond = df_updates[df_updates['business_id'] == str(row['business_id'])]['updated']
            try:
                if (list(cond)[0]) == False:
                        id_list.append(row['business_id'])
            except:
                print((list(cond)[0]))

        
    if len(id_list) >= 10:
        break


['AiFYkNiVKa98FQidK0REyQ', 'p_HeJssTSDwiT4K261H_kw', 'ndUuRwcL2wYBOXXgLTJyEQ', 'rlnPCCbQOwGeO_fipq68TA', 'eC4QcCTdKtIT16Fh14jnfw', 'ui9XXukBp4SAjVnCEuiUFw', '0CLCzdedGT2DPjkYM52Tqg', 'QyuF5F7cj02WXIF2nOgzBg', '_GM64s10iPSFiieLrIEhXA', 'aENSv8fq4oFZ92Ysbt08aA']


In [70]:
def load_update_reviews():
    """
    Now I have to query to get Business_ID that:
    1. Are not inserted in stores_updates
    2. Are Inserted but have a False condition on 'Updated'
    """
    # Initialize id_list
    id_list = []

    # Query the Table business
    states = ['NV','TX','CA','FL','NY']

    # Construct the query with the updated id_list
    query = f""" 
            SELECT business_id,state 
            FROM business
            """

    statement = SimpleStatement(query, fetch_size = 5000)
    business_table_ids = session.execute(statement, timeout=None)

    # Get information from stores_updates

    dictionary = {'business_id' : [], 'updated':[] }
    query = f""" 
            SELECT business_id,updated 
            FROM stores_updates
            """

    statement = SimpleStatement(query, fetch_size = 5000)
    updates_table = session.execute(statement, timeout=None)

    for row in updates_table:
        dictionary['business_id'].append(row['business_id'])
        dictionary['updated'].append(row['updated'])

    df_updates = pd.DataFrame(dictionary)


    for row in business_table_ids:
        if row['state'] in states:
            # Check in stores_updates
            if row['business_id'] not in list(df_updates['business_id']):
                id_list.append(row['business_id'])
            else:
                # Check if the store is updated
                cond = df_updates[df_updates['business_id'] == str(row['business_id'])]['updated']
                try:
                    if (list(cond)[0]) == False:
                            id_list.append(row['business_id'])
                except:
                    print((list(cond)[0]))

            
        if len(id_list) >= 10:
            break


    credits = 10
    for i in range(credits):
        print(id_list[i])
        #get_new_reviews()

    return None

In [71]:
load_update_reviews()

AiFYkNiVKa98FQidK0REyQ
p_HeJssTSDwiT4K261H_kw
ndUuRwcL2wYBOXXgLTJyEQ
rlnPCCbQOwGeO_fipq68TA
eC4QcCTdKtIT16Fh14jnfw
ui9XXukBp4SAjVnCEuiUFw
0CLCzdedGT2DPjkYM52Tqg
QyuF5F7cj02WXIF2nOgzBg
_GM64s10iPSFiieLrIEhXA
aENSv8fq4oFZ92Ysbt08aA


In [72]:
from credentials import yelp_api_key 

#def get_new_reviews(API_Key = yelp_api_key, id_to_query):

API_Key = yelp_api_key
id_to_query = 'AiFYkNiVKa98FQidK0REyQ'
# 1. Get the business name to update
query = f""" 
            SELECT name 
            FROM business
            WHERE business_id = '{id_to_query}'
            """

statement = SimpleStatement(query, fetch_size = 5000)
answer = session.execute(statement, timeout=None)

for row in answer:
    business_name = row['name']
    print(business_name)
"""
# 2. Create Request
# Get the Store ID 
business_id = store_to_update['business_id']
business_name = store_to_update['name']

# Create Request
yelp_api = YelpAPI(API_Key)
response = yelp_api.reviews_query(id=business_id, limit = 50)

# 3. Create DF
# Parse Response
reviews = response['reviews']

# Create dict
dictionary = {'review_id' : [],
            'user_id': [],
            'name': [],
            'business_id':[],
            'Platform':[],
            'rating':[],
            'date': [],
            'text':[],
            'comment':[],
}

for review in reviews:
    dictionary['review_id'].append(review['id'])
    dictionary['user_id'].append(review['user']['id'])
    dictionary['name'].append(business_name)
    dictionary['business_id'].append(business_id)
    dictionary['date'].append(review['time_created'])
    dictionary['Platform'].append('Yelp')
    dictionary['rating'].append(review['rating'])
    dictionary['text'].append(review['text'])
    dictionary['comment'].append('')


df_yelp_api = pd.DataFrame(dictionary)

# 4. LOAD df_yelp_api into CASSANDRA

# 5. UPDATE row in CASSANDRA


return 
"""

Verizon Authorized Retailer - GoWireless


"\n# 2. Create Request\n# Get the Store ID \nbusiness_id = store_to_update['business_id']\nbusiness_name = store_to_update['name']\n\n# Create Request\nyelp_api = YelpAPI(API_Key)\nresponse = yelp_api.reviews_query(id=business_id, limit = 50)\n\n# 3. Create DF\n# Parse Response\nreviews = response['reviews']\n\n# Create dict\ndictionary = {'review_id' : [],\n            'user_id': [],\n            'name': [],\n            'business_id':[],\n            'Platform':[],\n            'rating':[],\n            'date': [],\n            'text':[],\n            'comment':[],\n}\n\nfor review in reviews:\n    dictionary['review_id'].append(review['id'])\n    dictionary['user_id'].append(review['user']['id'])\n    dictionary['name'].append(business_name)\n    dictionary['business_id'].append(business_id)\n    dictionary['date'].append(review['time_created'])\n    dictionary['Platform'].append('Yelp')\n    dictionary['rating'].append(review['rating'])\n    dictionary['text'].append(review['text']

In [None]:
# All in one

from credentials import yelp_api_key 

def get_new_reviews(API_Key = yelp_api_key):

    # 1. Get the row to update

    # In Pandas
    #store_to_update = df_stores_food.loc[df_stores_food['updated'] == False].iloc[0]

    # From Cassandra query
    #...


    # 2. Create Request
    # Get the Store ID 
    business_id = store_to_update['business_id']
    business_name = store_to_update['name']

    # Create Request
    yelp_api = YelpAPI(API_Key)
    response = yelp_api.reviews_query(id=business_id, limit = 50)

    # 3. Create DF
    # Parse Response
    reviews = response['reviews']

    # Create dict
    dictionary = {'review_id' : [],
                'user_id': [],
                'name': [],
                'business_id':[],
                'Platform':[],
                'rating':[],
                'date': [],
                'text':[],
                'comment':[],
    }

    for review in reviews:
        dictionary['review_id'].append(review['id'])
        dictionary['user_id'].append(review['user']['id'])
        dictionary['name'].append(business_name)
        dictionary['business_id'].append(business_id)
        dictionary['date'].append(review['time_created'])
        dictionary['Platform'].append('Yelp')
        dictionary['rating'].append(review['rating'])
        dictionary['text'].append(review['text'])
        dictionary['comment'].append('')


    df_yelp_api = pd.DataFrame(dictionary)

    # 4. LOAD df_yelp_api into CASSANDRA

    # 5. UPDATE row in CASSANDRA


    return 


# We can even iterate the above method several times

def load_update_reviews():

    credits = 0

    for i in range(credits):

        get_new_reviews()

    return

### Generate TRENDY table

My idea is to generate a New Table related to Stores, where we store a historical data related to the trendy stores. 
A super basic information regarding this table would be to store the Store ID (poiting to the Store row) and the date when the store was found trendy.



In [21]:
response = yelp_api.search_query(term='restaurant,cafe',
                                 location='fl',
                                 attributes = 'hot_and_new',
                                 sort_by='rating',
                                 limit=1)

In [22]:
response

{'businesses': [{'id': 'xLzSMWRk37mVOk49QnQ4kA',
   'alias': 'sweet-paris-crêperie-and-café-coral-gables',
   'name': 'Sweet Paris Crêperie & Café',
   'image_url': 'https://s3-media2.fl.yelpcdn.com/bphoto/1W9zHekaQce5nyrSr6BUxg/o.jpg',
   'is_closed': False,
   'url': 'https://www.yelp.com/biz/sweet-paris-cr%C3%AAperie-and-caf%C3%A9-coral-gables?adjust_creative=9XlpwIp1qnfNvM0eFgnFHw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9XlpwIp1qnfNvM0eFgnFHw',
   'review_count': 9,
   'categories': [{'alias': 'creperies', 'title': 'Creperies'},
    {'alias': 'waffles', 'title': 'Waffles'},
    {'alias': 'cafes', 'title': 'Cafes'}],
   'rating': 5.0,
   'coordinates': {'latitude': 25.7450994, 'longitude': -80.2581458},
   'transactions': [],
   'price': '$$',
   'location': {'address1': '3005 Ponce De Leon',
    'address2': 'Ste 142',
    'address3': '',
    'city': 'Coral Gables',
    'zip_code': '33134',
    'country': 'US',
    'state': 'FL',
    'display_address': 

In [23]:
trendy_stores = response['businesses']

trendy_stores

[{'id': 'xLzSMWRk37mVOk49QnQ4kA',
  'alias': 'sweet-paris-crêperie-and-café-coral-gables',
  'name': 'Sweet Paris Crêperie & Café',
  'image_url': 'https://s3-media2.fl.yelpcdn.com/bphoto/1W9zHekaQce5nyrSr6BUxg/o.jpg',
  'is_closed': False,
  'url': 'https://www.yelp.com/biz/sweet-paris-cr%C3%AAperie-and-caf%C3%A9-coral-gables?adjust_creative=9XlpwIp1qnfNvM0eFgnFHw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9XlpwIp1qnfNvM0eFgnFHw',
  'review_count': 9,
  'categories': [{'alias': 'creperies', 'title': 'Creperies'},
   {'alias': 'waffles', 'title': 'Waffles'},
   {'alias': 'cafes', 'title': 'Cafes'}],
  'rating': 5.0,
  'coordinates': {'latitude': 25.7450994, 'longitude': -80.2581458},
  'transactions': [],
  'price': '$$',
  'location': {'address1': '3005 Ponce De Leon',
   'address2': 'Ste 142',
   'address3': '',
   'city': 'Coral Gables',
   'zip_code': '33134',
   'country': 'US',
   'state': 'FL',
   'display_address': ['3005 Ponce De Leon',
    'Ste 142'

In [24]:
# Checking if the store already exists in the db or it's new
value_to_check = trendy_stores[0]['id']
column_name = 'business_id'

value_exists = value_to_check in df_stores_food[column_name].values
if value_exists:
    print("The value exists in the DataFrame.")
else:
    print(" It's a new store.")

 It's a new store.


If the store is new we have to insert it in the DB

In [28]:
# Get the business info



# Create dict
dictionary = {'business_id' : [],
            'name': [],
            'state': [],
            'latitude':[],
            'longitude':[],
            'rating':[],
            'review_count': [],
            'attributes':[],
            'categories':[],
            'hours':[],
            'address_complete':[],
            'updated':[],

}

for store in trendy_stores:
    dictionary['business_id'].append(store['id'])
    dictionary['name'].append(store['name'])
    dictionary['state'].append(store['location']['state'])
    dictionary['latitude'].append(store['coordinates']['latitude'])
    dictionary['longitude'].append(store['coordinates']['longitude'])
    dictionary['rating'].append(store['rating'])
    dictionary['review_count'].append(store['review_count'])
    dictionary['attributes'].append(store['transactions'])
    dictionary['categories'].append(store['categories'])
    dictionary['hours'].append('')
    dictionary['updated'].append(False)

    address_complete =  store['location']['address1'] + ', ' + store['location']['city'] + ', ' + store['location']['state'] + ' ' + store['location']['zip_code']
    dictionary['address_complete'].append(address_complete)



df_trendy = pd.DataFrame(dictionary)	

df_trendy

Unnamed: 0,business_id,name,state,latitude,longitude,rating,review_count,attributes,categories,hours,address_complete,updated
0,xLzSMWRk37mVOk49QnQ4kA,Sweet Paris Crêperie & Café,FL,25.745099,-80.258146,5.0,9,[],"[{'alias': 'creperies', 'title': 'Creperies'},...",,"3005 Ponce De Leon, Coral Gables, FL 33134",False


In [29]:
# Now create the Table of Historical trends

from datetime import date

current_date = date.today()

dictionary = {  'business_id' : [],
                'date': []
            }

for store in trendy_stores:
    dictionary['business_id'].append(store['id'])
    dictionary['date'].append(current_date)


df_trend_history = pd.DataFrame(dictionary)

df_trend_history

Unnamed: 0,business_id,date
0,xLzSMWRk37mVOk49QnQ4kA,2023-08-21


In [None]:
# All in one

from credentials import yelp_api_key 
from datetime import date

current_date = date.today()
states = ['NV','TX','CA','FL','NY']

def get_trends(API_Key = yelp_api_key):
    
    # Create Request
    yelp_api = YelpAPI(API_Key)

    for state in states:

        response = yelp_api.search_query(   term='restaurant,cafe',
                                            location=state,
                                            attributes = 'hot_and_new',
                                            sort_by='rating',
                                            )
    
        trendy_stores = response['businesses']
        
        # Check Business in DB

        # Create dict
        dictionary_new = {  'business_id' : [],
                            'name': [],
                            'state': [],
                            'latitude':[],
                            'longitude':[],
                            'rating':[],
                            'review_count': [],
                            'attributes':[],
                            'categories':[],
                            'hours':[],
                            'address_complete':[],
                            'updated':[],
                            }
        
        dictionary_trend = {    'business_id' : [],
                                'date': []
                            }

        # Iterate the responses
        for store in trendy_stores:
            value_to_check = store['id']
            column_name = 'business_id'

            # CHECK WITH CASSANDRA
            #value_exists = value_to_check in df_stores_food[column_name].values
            if value_exists == False:
                
                
                dictionary_new['business_id'].append(store['id'])
                dictionary_new['name'].append(store['name'])
                dictionary_new['state'].append(store['location']['state'])
                dictionary_new['latitude'].append(store['coordinates']['latitude'])
                dictionary_new['longitude'].append(store['coordinates']['longitude'])
                dictionary_new['rating'].append(store['rating'])
                dictionary_new['review_count'].append(store['review_count'])
                dictionary_new['attributes'].append(store['transactions'])
                dictionary_new['categories'].append(store['categories'])
                dictionary_new['hours'].append('')
                dictionary_new['updated'].append(False)
                address_complete =  store['location']['address1'] + ', ' + store['location']['city'] + ', ' + store['location']['state'] + ' ' + store['location']['zip_code']
                dictionary_new['address_complete'].append(address_complete)

            
            dictionary_trend['business_id'].append(store['id'])
            dictionary_trend['date'].append(current_date)



        # Load data 
        df_trendy = pd.DataFrame(dictionary_new)
        df_trend_history = pd.DataFrame(dictionary_trend)

        # Load into CASSANDRA


        # end for states

    return



import credentials