In [33]:
# !pip install -U textblob

In [38]:
# For EDA now, will add more libraries as we progress
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from textblob import TextBlob

# Set nice style for plots
sns.set_theme(style='darkgrid')
sns.dark_palette("#69d", reverse=True, as_cmap=True)
sns.set_context("paper")
import json



# 1. Read Data

Our data was taken from this database of [Google Local Data 2021](https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/#subsets). We previously used a smaller dataset of Google Restaurant reviews available [here](https://www.kaggle.com/datasets/denizbilginn/google-maps-restaurant-reviews), with only 1100 reviews, but as per our TF's recommendation, we will be using this larger dataset with over 10m reviews for Massachusetts alone. To speed up our EDA, we will be using a sample of 100,000 reviews.

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

Our main dataset, stored in the `df` variable, contains 100,000 reviews. Our columns of interest are `rating`, which is the rating given by the user, `text`, which is the review text, and `gmap_id`, which is the Google Maps ID of the business. 

We also have a metadata dataset, stored in the `df_meta` variable, which contains information about each business in Massachusetts. The columns of interest are `gmap_id`, allowing us to join this dataset with the main one, `name`, which is the name of the business, and `description`, which is a brief description of the business. There is also a variable `category` that organizes the businesses into sectors such as non-profits, gyms, restaurants, etc. 

In [39]:
# Load the data
# Convert to dataframe
data_list = []
with open('data/review-Massachusetts.json', 'r') as f:
    for line in f:
        try:
            json_obj = json.loads(line)
            data_list.append(json_obj)
        except json.JSONDecodeError:
            print(f"Error decoding the following JSON line: {line}")

df = pd.DataFrame(data_list)

In [40]:
# Examine df
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10447007 entries, 0 to 10447006
Data columns (total 8 columns):
 #   Column   Dtype  
---  ------   -----  
 0   user_id  object 
 1   name     object 
 2   time     int64  
 3   rating   float64
 4   text     object 
 5   pics     object 
 6   resp     object 
 7   gmap_id  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 637.6+ MB
None
                 user_id                 name           time  rating  \
0  108990132658983842962         Sherri Mayne  1559246854759     5.0   
1  100425369457528701350       Adam Goodspeed  1557586283214     5.0   
2  118197855649932344021  Christopher Sheehan  1568756947956     4.0   
3  106341493635432341037       Katie Sullivan  1627234799114     5.0   
4  102472657465934208615   Victoria Henderson  1619237055452     5.0   

                                                text  pics  resp  \
0  I love the people that live there, they ate th...  None  None   
1  Stop parking in the resid

In [41]:
# Also load the whole meta-Massachusetts.json file. This contains metadata about the businesses, 
# since the reviews only contain the business id.
with open('data/meta-Massachusetts.json', 'r') as f:
    data_meta = f.readlines()

data_meta_str = "[" + ','.join(data_meta) + "]"
df_meta = pd.read_json(StringIO(data_meta_str))

# Filter dataframe so that it only contains restaurants
df_meta = df_meta[df_meta['category'].apply(lambda x: isinstance(x, list) and any('restaurant' in category.lower() for category in x) if x is not None else False)]

In [42]:
# Examine df
print(df_meta.info())
print(df_meta.head())

<class 'pandas.core.frame.DataFrame'>
Index: 16079 entries, 14 to 92514
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              16079 non-null  object 
 1   address           16055 non-null  object 
 2   gmap_id           16079 non-null  object 
 3   description       9341 non-null   object 
 4   latitude          16079 non-null  float64
 5   longitude         16079 non-null  float64
 6   category          16079 non-null  object 
 7   avg_rating        16079 non-null  float64
 8   num_of_reviews    16079 non-null  int64  
 9   price             11987 non-null  object 
 10  hours             15127 non-null  object 
 11  MISC              15994 non-null  object 
 12  state             11728 non-null  object 
 13  relative_results  15090 non-null  object 
 14  url               16079 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 2.0+ MB
None
                            

In [43]:
# Merging the dataframes on 'gmap_id'
# 'inner' will only include rows that have matching 'gmap_id' in both dataframes
df_combined = pd.merge(df, df_meta, on='gmap_id', how='inner')

In [44]:
# Examine the combined dataframe
print(df_combined.info())
print(df_combined.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4182567 entries, 0 to 4182566
Data columns (total 22 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   name_x            object 
 2   time              int64  
 3   rating            float64
 4   text              object 
 5   pics              object 
 6   resp              object 
 7   gmap_id           object 
 8   name_y            object 
 9   address           object 
 10  description       object 
 11  latitude          float64
 12  longitude         float64
 13  category          object 
 14  avg_rating        float64
 15  num_of_reviews    int64  
 16  price             object 
 17  hours             object 
 18  MISC              object 
 19  state             object 
 20  relative_results  object 
 21  url               object 
dtypes: float64(4), int64(2), object(16)
memory usage: 702.0+ MB
None
                 user_id      name_x           time  rating  \
0  1053245

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

We merged the main dataset and our meta dataset based on `gmap_id` and filtered out the ones of restaurants.

We will clean up the dataframe for our purpose. From below we see that we can probably get rid of these columns: `name_x` (since we don't care about the name of the reviewer, having `user_id` is enough to identify them; `time` (for now we don't care about when the reviewers wrote the reviews); `pics` (we don't care about pictures first, but might use it in the future if time permits); `resp` (most of them are `None`); `gmap_id`(we can identify restaurants based on their names which is easier to understand); `description` (most of them are `None`); `relative_results`; `url`)

We see that there are duplicates in our merged dataset. We will drop the duplicates after we clean up the dataframe to only keep necessary columns.

In [45]:
df_combined.head()

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,...,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,105324587117440371682,Jessica,1515819902193,4.0,What a great experience. I tried the chicken t...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89e3169821e62d4d:0x14ff0683c1ebca0e,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",...,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed,"[0x89e31418f27b6a29:0x2fd2e82a6f96214c, 0x89e3...",https://www.google.com/maps/place//data=!4m2!3...
1,105324587117440371682,Jessica,1515819902193,4.0,What a great experience. I tried the chicken t...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89e3169821e62d4d:0x14ff0683c1ebca0e,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",...,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed,"[0x89e31418f27b6a29:0x2fd2e82a6f96214c, 0x89e3...",https://www.google.com/maps/place//data=!4m2!3...
2,111593292201397736581,Jonah Ford,1526075323967,5.0,I've probably driven past this place a million...,,,0x89e3169821e62d4d:0x14ff0683c1ebca0e,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",...,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed,"[0x89e31418f27b6a29:0x2fd2e82a6f96214c, 0x89e3...",https://www.google.com/maps/place//data=!4m2!3...
3,111593292201397736581,Jonah Ford,1526075323967,5.0,I've probably driven past this place a million...,,,0x89e3169821e62d4d:0x14ff0683c1ebca0e,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",...,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed,"[0x89e31418f27b6a29:0x2fd2e82a6f96214c, 0x89e3...",https://www.google.com/maps/place//data=!4m2!3...
4,104512515767692446991,Zak Bug,1497983845749,5.0,The food was fantastic. The staff was very fr...,,,0x89e3169821e62d4d:0x14ff0683c1ebca0e,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",...,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed,"[0x89e31418f27b6a29:0x2fd2e82a6f96214c, 0x89e3...",https://www.google.com/maps/place//data=!4m2!3...


In [49]:
df_cleaned = df_combined.drop_duplicates(subset=['user_id', 'gmap_id'])

columns_to_drop = ['name_x', 'time', 'pics', 'resp', 'gmap_id', 'description', 'relative_results', 'url']
df_cleaned = df_cleaned.drop(columns=columns_to_drop)

# Drop duplicates based on `user_id` and `gmap_id` to ensure unique reviews per user per restaurant

df_cleaned.head()

Unnamed: 0,user_id,rating,text,name_y,address,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state
0,105324587117440371682,4.0,What a great experience. I tried the chicken t...,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed
2,111593292201397736581,5.0,I've probably driven past this place a million...,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed
4,104512515767692446991,5.0,The food was fantastic. The staff was very fr...,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed
6,108597457197204926945,5.0,"The owner, T, is a wonderful man. Super friend...",Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed
8,118226309970656279914,5.0,Just stopped in for a couple of grilled chicke...,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48,$$,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","{'Service options': ['Takeout', 'Delivery'], '...",Permanently closed


In [50]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4135196 entries, 0 to 4182566
Data columns (total 14 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         object 
 1   rating          float64
 2   text            object 
 3   name_y          object 
 4   address         object 
 5   latitude        float64
 6   longitude       float64
 7   category        object 
 8   avg_rating      float64
 9   num_of_reviews  int64  
 10  price           object 
 11  hours           object 
 12  MISC            object 
 13  state           object 
dtypes: float64(4), int64(1), object(9)
memory usage: 473.2+ MB


<div style="background-color:#3F7FBF; color:white; padding:10px"> 

The `MISC` column contains dictionaries with useful information. We decided to further process this column and extract the attributes out as new columns of our dataframe.

In [None]:
print(df_cleaned.loc[0, 'MISC'])

{'Service options': ['Delivery', 'Takeout', 'Dine-in'], 'Health & safety': ['Mask required'], 'Popular for': ['Lunch', 'Dinner', 'Solo dining'], 'Accessibility': ['Wheelchair accessible entrance'], 'Offerings': ['Comfort food', 'Healthy options', 'Late-night food', 'Quick bite', 'Small plates', 'Vegetarian options'], 'Dining options': ['Dessert'], 'Amenities': ['Good for kids'], 'Atmosphere': ['Casual', 'Cozy'], 'Crowd': ['Groups']}


In [51]:
# First, ensure all entries in 'MISC' are dictionaries; replace None with empty dictionaries
df_cleaned['MISC'] = df_cleaned['MISC'].apply(lambda x: x if isinstance(x, dict) else {})

# Convert the 'MISC' column to a DataFrame where each key in the dictionary becomes a column
misc_expanded = pd.DataFrame(df_cleaned['MISC'].tolist())

In [52]:
# Join the expanded 'MISC' DataFrame with the original 'df_cleaned', excluding the 'MISC' column
df_expanded = pd.concat([df_cleaned.drop(columns=['MISC']), misc_expanded], axis=1)

In [53]:
df_expanded.head()

Unnamed: 0,user_id,rating,text,name_y,address,latitude,longitude,category,avg_rating,num_of_reviews,...,Payments,Highlights,Popular for,Amenities,Atmosphere,Health & safety,Crowd,Planning,From the business,Health and safety
0,105324587117440371682,4.0,What a great experience. I tried the chicken t...,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,...,[Debit cards],,,,,,,,,
2,111593292201397736581,5.0,I've probably driven past this place a million...,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,...,[Debit cards],,,,,,,,,
4,104512515767692446991,5.0,The food was fantastic. The staff was very fr...,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,...,[Debit cards],,,,,,,,,
6,108597457197204926945,5.0,"The owner, T, is a wonderful man. Super friend...",Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,...,[Debit cards],,,,,,,,,
8,118226309970656279914,5.0,Just stopped in for a couple of grilled chicke...,Three Star Pizza,"Three Star Pizza, 409 Cabot St #1, Beverly, MA...",42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,...,[Debit cards],,,,,,,,,


In [54]:
df_expanded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4182567 entries, 0 to 3957456
Data columns (total 27 columns):
 #   Column             Dtype  
---  ------             -----  
 0   user_id            object 
 1   rating             float64
 2   text               object 
 3   name_y             object 
 4   address            object 
 5   latitude           float64
 6   longitude          float64
 7   category           object 
 8   avg_rating         float64
 9   num_of_reviews     float64
 10  price              object 
 11  hours              object 
 12  state              object 
 13  Service options    object 
 14  Accessibility      object 
 15  Offerings          object 
 16  Dining options     object 
 17  Payments           object 
 18  Highlights         object 
 19  Popular for        object 
 20  Amenities          object 
 21  Atmosphere         object 
 22  Health & safety    object 
 23  Crowd              object 
 24  Planning           object 
 25  From the business  obje

In [55]:
columns_of_interest = [
    "Service options", "Accessibility", "Offerings", "Dining options",
    "Payments", "Highlights", "Popular for", "Amenities",
    "Atmosphere", "Health & safety", "Crowd", "Planning",
    "From the business", "Health and safety"
]

# Initialize a dictionary to hold the unique values for each column
unique_values = {}

# Iterate over each column of interest
for column in columns_of_interest:
    # Extract the column's data
    column_data = df_expanded[column]
    
    # Since the data might contain lists, we need to flatten these into a single list before finding unique values
    # We'll use a set to automatically keep only unique items
    flattened_set = set()
    for item in column_data.dropna():  # Drop NA values to avoid errors
        if isinstance(item, list):  # Check if the item is a list
            flattened_set.update(item)  # Add all items in the list to the set
        else:
            flattened_set.add(item)  # Add the item itself if it's not a list
    
    # Store the unique values for this column in our dictionary
    unique_values[column] = list(flattened_set)  # Convert the set back to a list for readability

# Or, to print the unique values for all columns of interest:
for column, values in unique_values.items():
    print(f"{column}: {values}\n")

Service options: ['Dine-in', 'In-store pickup', 'In-store pick-up', 'Outdoor seating', 'In-store shopping', 'Curbside pickup', 'Online appointments', 'Takeaway', 'Delivery', 'Same-day delivery', 'Drive-through', 'Takeout', 'No-contact delivery']

Accessibility: ['Wheelchair accessible elevator', 'Wheelchair-accessible entrance', 'Wheelchair accessible entrance', 'Wheelchair-accessible toilet', 'Wheelchair-accessible seating', 'Wheelchair accessible parking lot', 'Wheelchair-accessible lift', 'Wheelchair accessible restroom', 'Wheelchair accessible seating', 'Assisted listening devices', 'Wheelchair-accessible car park', 'Wheelchair rental']

Offerings: ['Full service gas', 'Ethanol-free gas', 'Cocktails', 'Dancing', 'Braille menu', 'Halal food', 'Hard liquor', 'Spirits', 'Wine', 'Happy hour food', 'Prepared foods', 'Car wash', 'Salad bar', 'Late-night food', 'Vegetarian options', 'Small plates', 'Coffee', 'Organic dishes', 'Alcohol', 'Beer', "Kids' menu", 'Food at bar', 'Service guaran

In [None]:
# Counting NaN values in each column of df_expanded
nan_counts = df_expanded.isna().sum()

# Displaying the count of NaN values per column
print(nan_counts)

user_id                47393
rating                 47393
text                 1939755
gmap_id                47371
name_y                 47371
address                48769
latitude               47371
longitude              47371
category               47371
avg_rating             47371
num_of_reviews         47371
price                 432281
hours                 156138
state                1891583
Service options       148804
Accessibility         412830
Offerings             359532
Dining options        688883
Payments             1605942
Highlights           1359587
Popular for           710651
Amenities             444311
Atmosphere            629354
Health & safety      1675107
Crowd                 666088
Planning             2167329
From the business    4008956
Health and safety    4115234
dtype: int64


<div style="background-color:#3F7FBF; color:white; padding:10px"> 
Since we will focus on reviews, we decided to drop rows with `text` (which is the review column) empty. We will also drop the rows with `rating` empty.

In [56]:
# Drop rows where the 'text' column is NaN
df_expanded_clean = df_expanded.dropna(subset=['text', 'rating'])
df_expanded_clean.isna().sum()

user_id                    0
rating                     0
text                       0
name_y                     0
address                  876
latitude                   0
longitude                  0
category                   0
avg_rating                 0
num_of_reviews             0
price                 228738
hours                  60330
state                 972148
Service options        79723
Accessibility         225697
Offerings             193512
Dining options        373783
Payments              865841
Highlights            733829
Popular for           381422
Amenities             239374
Atmosphere            338562
Health & safety       901748
Crowd                 361884
Planning             1165269
From the business    2150707
Health and safety    2207346
dtype: int64

<div style="background-color:#3F7FBF; color:white; padding:10px"> 
    
Let's further inspect these columns and decide either to drop them or process them.

`Planning`, `From the business`, and `Health and safety` simply just have too many missing values, so we will drop them.

We decided to drop `address` column since we will be able to infer the address based on names of the restaurants and the latitude and longitude.

`state` describes the current states of the restraurants when the dataset authors scraped it. This is not that helpful for our analysis so we will drop this column too.

There are some other variables which are not important for our analysis for now. We will drop them as well and start thinking about how to process the rest (e.g. one-hot encode).

In [None]:
df_expanded_clean['state'].unique()

array(['Permanently closed', nan, 'Closes soon ⋅ 8:30PM ⋅ Opens 11AM Thu',
       'Closed ⋅ Opens 7AM Thu', 'Open ⋅ Closes 10PM',
       'Closes soon ⋅ 4PM ⋅ Opens 8AM Thu', 'Open ⋅ Closes 8PM',
       'Open ⋅ Closes 5PM', 'Closed ⋅ Opens 11AM',
       'Closed ⋅ Opens 11:30AM', 'Closed ⋅ Opens 5PM Thu',
       'Open ⋅ Closes 2AM', 'Closed ⋅ Opens 11AM Wed',
       'Closed ⋅ Opens 8AM Wed', 'Closed ⋅ Opens 8:30AM Wed',
       'Closed ⋅ Opens 6AM Wed', 'Closed ⋅ Opens 7:30AM Wed',
       'Open ⋅ Closes 8AM Wed', 'Opens soon ⋅ 9AM',
       'Closed ⋅ Opens 8:30AM', 'Closed ⋅ Opens 4PM Thu',
       'Closed ⋅ Opens 5AM', 'Closed ⋅ Opens 7:15AM',
       'Closes soon ⋅ 9PM ⋅ Opens 10AM Tue', 'Open 24 hours',
       'Open ⋅ Closes 9PM', 'Closed ⋅ Opens 4PM', 'Closed ⋅ Opens 7AM',
       'Closed ⋅ Opens 7:30AM Mon', 'Closed ⋅ Opens 5:30AM Mon',
       'Closed ⋅ Opens 3PM Tue', 'Closed ⋅ Opens 8:30AM Mon',
       'Closed ⋅ Opens 11AM Mon', 'Closed ⋅ Opens 12PM',
       'Closed ⋅ Opens 9AM', 'Clos

In [None]:
df_expanded_clean['price'].unique()

array(['$$', nan, '$', '$$$', '$$$$', '₩₩', '₩', '₩₩₩'], dtype=object)

In [None]:
df_expanded_clean['hours'].unique()

array(["[['Thursday', '11AM–8PM'], ['Friday', '11AM–10PM'], ['Saturday', '11AM–10PM'], ['Sunday', '4–10PM'], ['Monday', 'Closed'], ['Tuesday', '11AM–8PM'], ['Wednesday', '11AM–8PM']]",
       nan,
       "[['Wednesday', '11AM–8:30PM'], ['Thursday', '11AM–8:30PM'], ['Friday', '11AM–9PM'], ['Saturday', '11AM–9PM'], ['Sunday', '11AM–8:30AM'], ['Monday', '11AM–8:30AM'], ['Tuesday', '11AM–8:30PM']]",
       ...,
       "[['Friday', '11:30AM–8PM'], ['Saturday', '4–8PM'], ['Sunday', '4–8PM'], ['Monday', '11:30AM–8PM'], ['Tuesday', '11:30AM–8PM'], ['Wednesday', '11:30AM–8PM'], ['Thursday', '11:30AM–8PM']]",
       "[['Friday', '7AM–8PM'], ['Saturday', '7AM–8PM'], ['Sunday', '11AM–8PM'], ['Monday', '7AM–8PM'], ['Tuesday', '7AM–8PM'], ['Wednesday', '7AM–8PM'], ['Thursday', '7AM–8PM']]",
       "[['Thursday', 'Closed'], ['Friday', 'Closed'], ['Saturday', 'Closed'], ['Sunday', '3:35–3:36AM'], ['Monday', 'Closed'], ['Tuesday', 'Closed'], ['Wednesday', 'Closed']]"],
      dtype=object)

In [None]:
df_expanded_clean['Service options'].unique()

array(["['Takeout', 'Delivery']", "['Delivery']",
       "['No-contact delivery', 'Delivery', 'Takeout', 'Dine-in']",
       "['Delivery', 'Takeout', 'Dine-in']", "['Takeout', 'Dine-in']",
       "['In-store shopping', 'Takeout', 'Dine-in', 'Delivery']",
       "['Takeout', 'Dine-in', 'Delivery']",
       "['Drive-through', 'Takeout', 'Dine-in', 'Delivery']",
       "['Curbside pickup', 'In-store pickup', 'In-store shopping', 'Takeout', 'Dine-in', 'Delivery']",
       "['Outdoor seating', 'Delivery']", "['Delivery', 'Takeout']",
       "['Curbside pickup', 'Takeout', 'Dine-in', 'Delivery']",
       "['In-store shopping']",
       "['Outdoor seating', 'Delivery', 'Takeout', 'Dine-in']",
       "['Dine-in', 'Delivery']",
       "['Curbside pickup', 'No-contact delivery', 'Delivery', 'Takeout', 'Dine-in']",
       nan, "['In-store shopping', 'Delivery']",
       "['Outdoor seating', 'Curbside pickup', 'No-contact delivery', 'Delivery', 'Takeout', 'Dine-in']",
       "['Curbside pickup', '

In [None]:
df_expanded_clean['Accessibility'].unique()

array(["['Wheelchair accessible entrance']",
       "['Wheelchair accessible entrance', 'Wheelchair accessible restroom']",
       nan,
       "['Wheelchair accessible entrance', 'Wheelchair accessible parking lot', 'Wheelchair accessible seating']",
       "['Wheelchair accessible entrance', 'Wheelchair accessible seating']",
       "['Wheelchair accessible entrance', 'Wheelchair accessible parking lot', 'Wheelchair accessible restroom', 'Wheelchair accessible seating']",
       "['Wheelchair accessible entrance', 'Wheelchair accessible parking lot', 'Wheelchair accessible restroom']",
       "['Wheelchair accessible seating']",
       "['Wheelchair accessible elevator', 'Wheelchair accessible entrance', 'Wheelchair accessible parking lot', 'Wheelchair accessible restroom', 'Wheelchair accessible seating']",
       "['Wheelchair accessible restroom', 'Wheelchair accessible entrance']",
       "['Wheelchair accessible parking lot']",
       "['Wheelchair accessible parking lot', 'Wheel

In [None]:
df_expanded_clean['Offerings'].unique()

array(["['Comfort food', 'Late-night food', 'Vegetarian options']",
       "['Coffee', 'Comfort food', 'Healthy options', 'Organic dishes', 'Quick bite', 'Vegetarian options']",
       "['Coffee', 'Comfort food', 'Healthy options', 'Quick bite', 'Small plates', 'Vegetarian options']",
       ...,
       '[\'Alcohol\', \'All you can eat\', \'Beer\', \'Braille menu\', \'Cocktails\', \'Coffee\', \'Comfort food\', \'Happy hour drinks\', \'Happy hour food\', \'Hard liquor\', "Kids\' menu", \'Late-night food\', \'Organic dishes\', \'Small plates\', \'Vegetarian options\', \'Wine\']',
       '[\'Alcohol\', \'Coffee\', \'Comfort food\', \'Food\', \'Food at bar\', \'Happy hour food\', "Kids\' menu", \'Late-night food\', \'Quick bite\']',
       '[\'Coffee\', \'Halal food\', \'Healthy options\', "Kids\' menu", \'Late-night food\', \'Quick bite\', \'Vegetarian options\']'],
      dtype=object)

In [None]:
df_expanded_clean['Dining options'].unique()

array(["['Lunch', 'Dinner', 'Dessert']",
       "['Breakfast', 'Lunch', 'Dinner', 'Dessert']", "['Dessert']", nan,
       "['Breakfast', 'Lunch', 'Catering', 'Dessert', 'Seating']",
       "['Lunch', 'Dinner', 'Catering', 'Dessert', 'Seating']",
       "['Breakfast', 'Lunch', 'Dinner', 'Dessert', 'Seating']",
       "['Breakfast']", "['Breakfast', 'Lunch', 'Dessert']",
       "['Breakfast', 'Dessert']",
       "['Lunch', 'Dinner', 'Catering', 'Dessert']",
       "['Breakfast', 'Lunch', 'Dinner']", "['Seating']",
       "['Lunch', 'Dinner', 'Seating']",
       "['Breakfast', 'Lunch', 'Dinner', 'Catering', 'Dessert', 'Seating']",
       "['Lunch', 'Dinner']", "['Breakfast', 'Lunch']",
       "['Breakfast', 'Lunch', 'Dinner', 'Counter service', 'Dessert']",
       "['Lunch', 'Dinner', 'Catering']", "['Lunch', 'Catering']",
       "['Lunch', 'Dinner', 'Dessert', 'Seating']",
       "['Lunch', 'Catering', 'Dessert']",
       "['Lunch', 'Dinner', 'Catering', 'Seating']", "['Dinner']",
      

In [None]:
df_expanded_clean['Payments'].unique()

array(["['Debit cards']", nan, "['NFC mobile payments']",
       "['Debit cards', 'Credit cards']",
       "['Debit cards', 'NFC mobile payments', 'Credit cards']",
       "['Debit cards', 'NFC mobile payments']",
       "['Checks', 'Debit cards', 'Credit cards']", "['Credit cards']",
       "['Cash-only']", "['Cash-only', 'Debit cards']",
       "['Cash-only', 'Credit cards']",
       "['Cash-only', 'Debit cards', 'Credit cards']",
       "['Cash-only', 'Checks', 'Debit cards', 'NFC mobile payments']",
       "['Cash-only', 'Checks', 'Debit cards', 'Credit cards']",
       "['NFC mobile payments', 'Credit cards']",
       "['Checks', 'Debit cards']",
       "['Checks', 'Debit cards', 'NFC mobile payments']",
       "['Cash-only', 'NFC mobile payments']", "['Checks']",
       "['Cash-only', 'Debit cards', 'NFC mobile payments']",
       "['Checks', 'Debit cards', 'NFC mobile payments', 'Credit cards']",
       "['Cash-only', 'Debit cards', 'NFC mobile payments', 'Credit cards']",
     

In [None]:
df_expanded_clean['Highlights'].unique()

array([nan, "['Great coffee', 'Great dessert', 'Great tea selection']",
       "['Great coffee']", "['Fast service']",
       "['Great beer selection', 'Great cocktails']",
       "['Fast service', 'Great coffee', 'Great tea selection']",
       "['Great coffee', 'Great dessert']", "['Great dessert']",
       "['LGBTQ friendly', 'Transgender safespace']",
       "['Fast service', 'Great coffee']", "['LGBTQ friendly']",
       "['Fireplace', 'Great coffee']",
       "['Great beer selection', 'Great cocktails', 'LGBTQ friendly']",
       "['Great coffee', 'Great wine list']", "['Great cocktails']",
       "['Fireplace', 'Live music']",
       "['Great beer selection', 'Great cocktails', 'Great wine list']",
       "['Live music']", "['Great dessert', 'LGBTQ friendly']",
       "['Great cocktails', 'Live music']",
       "['Fast service', 'Great beer selection', 'Great coffee', 'Great dessert', 'Great wine list']",
       "['Great cocktails', 'Great wine list']",
       "['Great beer sele

In [None]:
df_expanded_clean['Popular for'].unique()

array([nan, "['Breakfast', 'Lunch', 'Dinner', 'Solo dining']",
       "['Lunch', 'Dinner', 'Solo dining']",
       "['Breakfast', 'Lunch', 'Solo dining']", "['Solo dining']",
       "['Dinner', 'Solo dining']", "['Lunch', 'Solo dining']",
       "['Breakfast', 'Solo dining']", "['Lunch', 'Dinner']",
       "['Breakfast', 'Dinner', 'Solo dining']",
       "['Breakfast', 'Lunch', 'Dinner']", "['Lunch']",
       "['Breakfast', 'Lunch']", "['Breakfast']", "['Dinner']",
       "['Breakfast', 'Solo dining', 'Good for working on laptop']",
       "['Breakfast', 'Lunch', 'Solo dining', 'Good for working on laptop']",
       "['Breakfast', 'Lunch', 'Dinner', 'Solo dining', 'Good for working on laptop']",
       "['Solo dining', 'Good for working on laptop']",
       "['Lunch', 'Dinner', 'Solo dining', 'Good for working on laptop']",
       "['Lunch', 'Solo dining', 'Good for working on laptop']"],
      dtype=object)

In [None]:
df_expanded_clean['Amenities'].unique()

array([nan, "['Good for kids', 'Restroom']", "['Good for kids']",
       "['Good for kids', 'High chairs', 'Restroom', 'Wi-Fi']",
       "['Gender-neutral restroom', 'Good for kids', 'Restroom']",
       "['Good for kids', 'High chairs']", "['Bar onsite', 'Restroom']",
       "['Good for kids', 'Restroom', 'Wi-Fi']",
       "['Bar onsite', 'Good for kids', 'High chairs']",
       "['Restroom', 'Wi-Fi']", "['Bar onsite']",
       "['Good for kids', 'High chairs', 'Restroom']",
       "['Public restroom', 'Restroom']",
       "['Gender-neutral restroom', 'Good for kids']",
       "['Bar onsite', 'Good for kids']",
       "['Good for kids', 'High chairs', 'Wi-Fi']",
       "['Bar onsite', 'High chairs', 'Restroom', 'Wi-Fi']", "['Wi-Fi']",
       "['Good for kids', 'Wi-Fi']",
       "['Gender-neutral restroom', 'Good for kids', 'High chairs', 'Restroom']",
       "['Bar onsite', 'High chairs']",
       "['Gender-neutral restroom', 'Good for kids', 'High chairs', 'Wi-Fi']",
       "['Bar on

In [None]:
df_expanded_clean['Atmosphere'].unique()

array([nan, "['Casual', 'Cozy']", "['Casual']",
       "['Casual', 'Cozy', 'Quiet']", "['Cozy']",
       "['Casual', 'Romantic']", "['Casual', 'Historic']",
       "['Casual', 'Trending']", "['Trending']", "['Casual', 'Quiet']",
       "['Cozy', 'Romantic']", "['Quiet', 'Romantic']", "['Romantic']",
       "['Cozy', 'Historic']", "['Casual', 'Cozy', 'Romantic']",
       "['Cozy', 'Romantic', 'Upscale']", "['Upscale']",
       "['Romantic', 'Upscale']", "['Casual', 'Cozy', 'Upscale']",
       "['Casual', 'Cozy', 'Historic']", "['Casual', 'Cosy']",
       "['Cosy', 'Romantic', 'Upmarket']",
       "['Cosy', 'Historic', 'Romantic', 'Upmarket']",
       "['Casual', 'Cozy', 'Trending']",
       "['Casual', 'Cozy', 'Romantic', 'Upscale']",
       "['Casual', 'Cosy', 'Historic']", "['Romantic', 'Upmarket']",
       "['Casual', 'Romantic', 'Trending', 'Upscale']",
       "['Casual', 'Cosy', 'Upmarket']",
       "['Cozy', 'Historic', 'Romantic', 'Upscale']",
       "['Casual', 'Cozy', 'Historic

In [None]:
df_expanded_clean['Crowd'].unique()

array([nan, "['Groups', 'Tourists']", "['Family-friendly']",
       "['College students', 'Family-friendly']", "['Groups', 'Locals']",
       "['College students', 'Groups']", "['Groups']",
       "['College students', 'Family-friendly', 'Tourists']",
       "['Tourists']", "['Locals']", "['Family-friendly', 'Groups']",
       "['Family-friendly', 'Tourists']", "['College students']",
       "['Family-friendly', 'Groups', 'Tourists']",
       "['Groups', 'Locals', 'Tourists']",
       "['College students', 'Tourists']",
       "['College students', 'Groups', 'Tourists']",
       "['College students', 'Family-friendly', 'Groups']",
       "['College students', 'Locals', 'Tourists']",
       "['College students', 'Groups', 'Locals']",
       "['Locals', 'Tourists']", "['College students', 'Locals']",
       "['Groups', 'Tourists', 'University students']",
       "['Family friendly', 'Tourists', 'University students']",
       "['Family friendly', 'Groups', 'Tourists', 'University student

In [57]:
columns_to_drop = ['state', 'address', 'Service options', 'Accessibility', 'Offerings', 'Highlights', 'From the business', 'Health and safety', 'Health & safety', 'Planning', 'Payments', 'Amenities']
df_processed = df_expanded_clean.drop(columns=columns_to_drop)

In [None]:
df_processed.isna().sum()

user_id                0
rating                 0
text                   0
gmap_id                0
name_y                 0
latitude               0
longitude              0
category               0
avg_rating             0
num_of_reviews         0
price             228737
Dining options    373764
Popular for       381433
Atmosphere        338579
Crowd             361848
dtype: int64

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

Since our dataset is large enough, for the remaining missing values, we decided to just simple drop the rows with missing `Popular for`, `Dining options`, and `price`.

In [58]:
df_processed2 = df_processed.dropna(subset=['Popular for', 'Dining options', 'price'])
df_processed2.isna().sum()

user_id               0
rating                0
text                  0
name_y                0
latitude              0
longitude             0
category              0
avg_rating            0
num_of_reviews        0
price                 0
hours             29864
Dining options        0
Popular for           0
Atmosphere         1279
Crowd             39433
dtype: int64

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

We can encode the missing values in `Atmosphere` as 'unknown', and the ones in `Crowd` as 'any'.

We will also need to process the representating of `price`. We decided to encode the `$` symbols as numbers, so `$` = 1, `$$` = 2, an so on.


In [59]:
# Replace NA values in 'Atmosphere' column with 'unknown'
df_processed2.loc[:, 'Atmosphere'] = df_processed2['Atmosphere'].fillna('unknown')

# Replace NA values in 'Crowd' column with 'any'
df_processed2.loc[:, 'Crowd'] = df_processed2['Crowd'].fillna('any')

In [None]:
df_processed2.isna().sum()

user_id           0
rating            0
text              0
gmap_id           0
name_y            0
latitude          0
longitude         0
category          0
avg_rating        0
num_of_reviews    0
price             0
Dining options    0
Popular for       0
Atmosphere        0
Crowd             0
dtype: int64

In [60]:
def encode_price(price):
    if pd.isna(price):
        return None  # Use np.nan 
    price = price.replace('₩', '$')  # Normalize '₩' to '$'
    return len(price)  # The number of '$' symbols corresponds to the price level


In [61]:
df_processed2.loc[:, 'price'] = df_processed2['price'].apply(encode_price)

In [62]:
df_processed2.head()

Unnamed: 0,user_id,rating,text,name_y,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,Dining options,Popular for,Atmosphere,Crowd
48,108514047127289837883,5.0,Amazing food and T is an amazing man.,Three Star Pizza,42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,2,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","[Breakfast, Lunch, Dinner, Dessert]","[Breakfast, Lunch, Dinner, Solo dining]","[Casual, Cozy]",any
50,100299238448263786400,3.0,I didn't actually try it but just felt like gi...,Three Star Pizza,42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,2,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","[Breakfast, Lunch, Dinner, Dessert]","[Breakfast, Lunch, Dinner, Solo dining]","[Casual, Cozy]",any
52,117632307472691068998,5.0,I am a huge fan of their sandwiches! Made to o...,Three Star Pizza,42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,2,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","[Breakfast, Lunch, Dinner, Dessert]","[Breakfast, Lunch, Dinner, Solo dining]","[Casual, Cozy]",any
54,100627839291727776043,2.0,There's a reason why it's not called 5 star pi...,Three Star Pizza,42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,2,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","[Breakfast, Lunch, Dinner, Dessert]","[Breakfast, Lunch, Dinner, Solo dining]","[Casual, Cozy]",any
56,113843275698867858889,4.0,Drunk and starving at 1am? They're open!,Three Star Pizza,42.559072,-70.881542,"[Pizza restaurant, Italian restaurant, Deliver...",3.9,48.0,2,"[[Thursday, 11AM–8PM], [Friday, 11AM–10PM], [S...","[Breakfast, Lunch, Dinner, Dessert]","[Breakfast, Lunch, Dinner, Solo dining]","[Casual, Cozy]",any


In [63]:
df_processed2.to_csv('data/cleaned_data.csv')

In [64]:
df_final = pd.read_csv('data/cleaned_data.csv')

# 2. EDA

## Exploring the distribution of ratings

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

* Ratings are on a 1 to 5 scale.
* The majority of reviews are positive, with 77870 reviews rated 5.
* However, we note that the second most common rating is 1 at 9435 reviews. This suggests that people leave reviews either when they have a highly good or highly bad experience at a restaurant. 

In [None]:
# Distribution of ratings: To understand the overall sentiment towards the businesses.

plt.hist(df['rating'], bins = 5, edgecolor = 'black')
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
# show exact counts
for i in range(1, 6):
    plt.text(i + 0.5 if i == 1 else i + 0.3 if i == 2 else i + 0.1 if i == 3 else i if i == 4 else i - 0.2, len(df[df['rating'] == i]), str(len(df[df['rating'] == i])), ha='right', va='bottom')
# change x ticks to read 1, 2, 3, 4, 5
plt.xticks(np.arange(1, 6, 1))

plt.show()


## Exploring review counts per local business

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

* Most businesses have 10-20 reviews, with the number of businesses decreasing at a decreasing rate as the number of reviews increases.
* The mean number of reviews is 20.5 with a standard deviation of 40.53.
* This suggests that a small number of businesses might be overrepresented in our dataset, especially the one outlying businesses with 1292 reviews.

In [None]:
# Count of reviews per business - To see which businesses have been reviewed the most.

# describe the count of reviews per business
print("Summary statistics for count of reviews per business:")
print(df['gmap_id'].value_counts().describe())

df['gmap_id'].value_counts().plot(kind='hist', bins=100, edgecolor='black')
plt.title('Count of Reviews per Business')
plt.xlabel('Number of Reviews')
plt.ylabel('Count of Businesses')
plt.show()




## Exploring review lengths (in terms of number of characters)

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

* We see that review lengths are distributed similarly to business ratings, with a large number of reviews having under 100 characters, and with the number of reviews decreasing at a decreasing rate as the review length increases.
* The mean review length is 167.47, but with a large standard deviation of 273.39.

In [None]:
# Review length analysis: To see the distribution of the length of the review texts.

df['review_length'] = df['text'].apply(lambda x: len(x) if x is not None else 0)

# describe the review length
print("Summary statistics for review length:")
print(df['review_length'].describe())

plt.hist(df['review_length'], bins=100, edgecolor='black')
plt.title('Distribution of Review Length')
plt.xlabel('Review Length')
plt.ylabel('Count')
plt.show()



## Exploring correlation between review length and rating

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

* We analyzed the correlation between the length of the review text and the rating. The correlation coefficient is around -0.156, which suggests that there is only a weak negative correlation between the length of the review and the rating.
* This result is consistent with our intuition, as the length of a review does not necessarily indicate its quality or sentiment.
* However, perhaps some angrier customers might leave longer reviews, which could explain the slight negative correlation. 
* We also see that reviews with a rating of 3 have a relatively lower review length, which supports the overlying notion that people write more/longer reviews when they feel strongly about a restaurant. 

In [None]:
# Correlation between review length and rating: To see if there is a correlation between the length of the review and the rating given.

# plot the correlation between review length and rating
sns.scatterplot(x='rating', y='review_length', data=df)
plt.title('Correlation between Review Length and Rating')
plt.xlabel('Rating')
plt.ylabel('Review Length')
plt.xticks(np.arange(1, 6, 1))
plt.show()

correlation_matrix = df[['rating', 'review_length']].corr()
print("Correlation matrix:")
print(correlation_matrix)

## Number of unique authors

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

* We see that we have 77593 unique authors for our 100,000 reviews, which suggests that most authors have only left one review.
* This is confirmed by the mean number of reviews per author, which is 1.3 with a standard deviation of 0.88.
* In fact, we see through the summary statistics that over 75% of authors have only left 1 review. The most number of reviews left by one author is 62. 
* The shape of the distribution is once again similar to the previous two, with the number of authors decreasing at a decreasing rate as the number of reviews per author increases.

In [None]:
# How many unique authors?

print("Number of unique authors: ", df['name'].nunique())

# describe the count of ratings per author
print("Summary statistics for number of ratings per author:")
print(df['name'].value_counts().describe())

df['name'].value_counts().plot(kind='hist', bins=100, edgecolor='black')
plt.title('Count of Ratings per Author')
plt.xlabel('Number of Ratings')
plt.ylabel('Count of Authors')
plt.show()


## Simple sentiment analysis - Baseline Model

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

* We used the `TextBlob` library to perform a simple sentiment analysis on the review text.
* First, we dropped reviews with no text, as they would not provide any information for sentiment analysis.
* We then calculated the polarity of each review, which ranges from -1 (most negative) to 1 (most positive).
* Plotting a histogram and summary statistics, we see that most reviews are moderately positive with a mean polarity of 0.36 and a standard deviation of 0.31. This is consistent with our earlier observation that most ratings are positive (5 stars).
* The distribution is roughly bell-shaped, but with a density spike at 0 and a few more spikes above 0.5. 


In [None]:
# Simple sentiment analysis on review text

# Add a column to the dataframe with the sentiment of the review
df_dropped = df.dropna(subset=['text'])
df_dropped['sentiment'] = df_dropped['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# print summary statistics
print("Summary statistics for sentiment:")
print(df_dropped['sentiment'].describe())

# plot the distribution of sentiment
plt.hist(df_dropped['sentiment'], bins=100, edgecolor='black')
plt.title('Distribution of Sentiment')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

There is a moderate positive correlation between sentiment score and rating, with a correlation coefficient of 0.547. This suggests that reviews with higher ratings tend to have more positive sentiment scores, which is expected. This is a good result for our training data.

We were worried that meaningful sentiment might not be extracted from short reviews, but the correlation suggests that the sentiment analysis is capturing the sentiment of the reviews decently well. Yay!

In [None]:
# Find correlation between sentiment and rating
correlation_matrix = df_dropped[['rating', 'sentiment']].corr()
print("Correlation matrix:")
print(correlation_matrix)

## Summary of findings

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

* We see that the majority of ratings given by Google users are 5 out of 5. This aligns with the Google text review sentiment analysis, which found that the majority of the reviews had a postive sentiment.
* We also found that there is a weak correlation between the length of a text review and the rating given by the same Google user. However, it could be helpful to note that based on the visualization displayed above, the review length was longest for either 1 or 5 star reviews.
* In the sentiment analysis, we found that majority of the text reviews result in a sentiment score near 0, which suggests that many reviews are neutral. The second most common sentiment scores are those near 1, which suggests that there are also many (but not as many) reviews that are highly positive. 
* This summarizes the distributional features of our reviews in their length, sentiment and the related score.

## Revised Project Question

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

Creating a restaurant recommendation system based on textual reviews and and user-inputted prompts.


## Implementation Plan -- Draft Pipeline

<div style="background-color:#3F7FBF; color:white; padding:10px"> 

The structure of our problem is similar to that of designing a recommender system. We describe an initial approach that comes to mind:

**Recommender Model Pipeline:**
- **Training step** (preprocessing): Embed the reviews for all restaurants as vectors using sentence embeddings (BERT), or a combined approach using BERT and other feature engineered features from the restaurant information that we have. One approach is then to cluster restaurants in an unsupervised way in the feature space using an approach like KNN or t-SNE. Perhaps we can do this in an informed way, however.
- **Interaction step:** Then, we may take a user prompt of sentence length describing the restaurant type they are looking for, and use the fine-tuned BERT pipeline to get a vector representation of the prompt.
- **Matching step:** Then we can take the top recommendations as the closest three or so restaurants in the feature space to the prompt. We can incorporate additional heuristics in this matching step.
  

**Potential Issues:**
- Reviews and prompts are semantically different. We might encounter unexpected relationships between the restaurants closest in the feature space based on reviews and the inputted prompts.
- Perhaps a content-based filtering approach is more directly applicable as a pipeline (ref: https://developers.google.com/machine-learning/recommendation/content-based/basics).
- With an unsupervised approach like this, we have no systematized way to measure whether our recommending algorithm is doing a good job at matching prompts to restaurants. But this is the nature of trying a recommendation system rather than a predictive model. We do not have the resources to conduct proper A/B testing, which would otherwise be a canonical solution.
-  A collaborative filtering approach would be great, but is infeasible since we cannot recruit enough users. 

# 3. Pipeline and Baseline Model:

#### preprocessing:

Note if you're trying to make this run. Use the cleaned_data 732MB csv file. Change the filepath below. Then it should run all in 15s.

In [None]:
# Load the data
with open('data/cleaned_data.csv', 'r') as f:
    data = f.readlines()

# Convert to dataframe
data_json_str = "[" + ','.join(data) + "]"
df = pd.read_csv(StringIO(data_json_str))

  df = pd.read_csv(StringIO(data_json_str))


In [67]:
df = df_final

In [68]:
df.dtypes

Unnamed: 0          int64
user_id            object
rating            float64
text               object
name_y             object
latitude          float64
longitude         float64
category           object
avg_rating        float64
num_of_reviews    float64
price               int64
hours              object
Dining options     object
Popular for        object
Atmosphere         object
Crowd              object
dtype: object

In [70]:
### get 10000 restaurants dumb sample. Take pics out.

# Import the necessary libraries
# Set the random seed for reproducibility
np.random.seed(42)

# Group by 'name_y'
grouped = df.groupby('name_y')


# If you need to work with all entries for each 'gmap_id' and then select 1000 groups
## Option 1: Select the first 1000 unique restaurants
top_1000_restaurants = grouped.apply(lambda x: x.head(1)).sample(1000)

## Option 2: If you need to select based on the size of each group (e.g., most entries)
# This will sort groups by size and take the top 1000 groups
top_1000_restaurants = grouped.size().nlargest(500).index
selected_entries = df[df['name_y'].isin(top_1000_restaurants)]


## Option 3:
# Randomly sample 1000 unique restaurants
# Ensure you have unique restaurants first
unique_restaurants = df.drop_duplicates(subset='name_y')
n_0 = 100
if len(unique_restaurants) >= n_0:
    sampled_restaurants = unique_restaurants.sample(n=n_0, random_state=1)  # Use random_state for reproducibility
else:
    print("There are less than 1000 unique restaurants available.")
    sampled_restaurants = unique_restaurants  # Use all available if less than 1000

# Now, get all entries for these 1000 restaurants
selected_entries = df[df['name_y'].isin(sampled_restaurants['name_y'])]

# Print or inspect the result
display(selected_entries)

# we have 76k rows. This is workable.
df = selected_entries
# Preprocess to remove invalid coordinate restaurants
df = df[(df['latitude'] != 0) | (df['longitude'] != 0)]

  top_1000_restaurants = grouped.apply(lambda x: x.head(1)).sample(1000)


Unnamed: 0.1,Unnamed: 0,user_id,rating,text,name_y,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,Dining options,Popular for,Atmosphere,Crowd
2233,21354,108160856436575183918,3.0,I will start by saying that the burger was tas...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly']
2234,21355,115972281888483164475,5.0,DELICIOUS! Worth the money. Good fries. Amazin...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly']
2235,21356,101236849850641353071,5.0,This place is top-notch. I went for the first ...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly']
2236,21357,116422988597523226216,5.0,I love this place. The burgers are great and ...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly']
2237,21358,117061396340108203379,4.0,"I had the Cuban Burger, to sum it up in one wo...",Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1549779,4115309,101983922195533626676,5.0,(Translated by Google) I loved\n\n(Original)\n...,Joe's American Bar & Grill,42.503186,-71.131093,"['American restaurant', 'Bar', 'Bar & grill']",4.2,735.0,2,,"['Dinner', 'Dessert']",['Dinner'],"['Cozy', 'Romantic', 'Upscale']","['Groups', 'Tourists']"
1549780,4115310,116244827594469219251,3.0,(Translated by Google) Nice atmosphere but the...,Joe's American Bar & Grill,42.503186,-71.131093,"['American restaurant', 'Bar', 'Bar & grill']",4.2,735.0,2,,"['Dinner', 'Dessert']",['Dinner'],"['Cozy', 'Romantic', 'Upscale']","['Groups', 'Tourists']"
1549781,4115311,112196962788894053467,5.0,(Translated by Google) Our suver was great\n\n...,Joe's American Bar & Grill,42.503186,-71.131093,"['American restaurant', 'Bar', 'Bar & grill']",4.2,735.0,2,,"['Dinner', 'Dessert']",['Dinner'],"['Cozy', 'Romantic', 'Upscale']","['Groups', 'Tourists']"
1549782,4115312,115973068842303275364,5.0,(Translated by Google) Good food\n\n(Original)...,Joe's American Bar & Grill,42.503186,-71.131093,"['American restaurant', 'Bar', 'Bar & grill']",4.2,735.0,2,,"['Dinner', 'Dessert']",['Dinner'],"['Cozy', 'Romantic', 'Upscale']","['Groups', 'Tourists']"


### Stage 1: filter by basic critiria

In [None]:
!pip install geocoder
!pip install geopy

Collecting geocoder
  Obtaining dependency information for geocoder from https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl.metadata
  Downloading geocoder-1.38.1-py2.py3-none-any.whl.metadata (14 kB)
Collecting future (from geocoder)
  Obtaining dependency information for future from https://files.pythonhosted.org/packages/da/71/ae30dadffc90b9006d77af76b393cb9dfbfc9629f339fc1574a1c52e6806/future-1.0.0-py3-none-any.whl.metadata
  Downloading future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting ratelim (from geocoder)
  Obtaining dependency information for ratelim from https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl.metadata
  Downloading ratelim-0.1.6-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting requests (from geocoder)
  Obtaining dependency information for requests from https://files.pytho

In [71]:
import geocoder
import pandas as pd
from geopy.distance import geodesic

# Get user coordinates
def get_user_coordinates():
    g = geocoder.ip('me')
    if g.latlng:
        return g.latlng
    else:
        return None

coordinates = get_user_coordinates()
if coordinates:
    user_latitude, user_longitude = coordinates
    print(f"User's coordinates: Latitude={user_latitude}, Longitude={user_longitude}")
else:
    print("Unable to retrieve user's coordinates.")
    # Define default values or handle lack of coordinates appropriately
    # user_latitude, user_longitude = default_latitude, default_longitude


# Function to calculate distance using geopy
def calculate_distance(row, user_lat, user_lon):
    user_location = (user_lat, user_lon)
    business_location = (row['latitude'], row['longitude'])
    return geodesic(user_location, business_location).miles

# Apply the distance function
df['distance'] = df.apply(lambda row: calculate_distance(row, user_latitude, user_longitude), axis=1)
df_filtered = df[df['distance'] <= 10]  # Filter data within 10 miles

# Print the filtered DataFrame
print(df_filtered)


User's coordinates: Latitude=42.3584, Longitude=-71.0598
         Unnamed: 0                user_id  rating  \
3391          29155  107365174961044408492     2.0   
3392          29156  108298418347807329808     4.0   
3393          29157  107727238845490088431     5.0   
3394          29158  107400587067312679158     1.0   
8409          60138  104778186229216209444     4.0   
...             ...                    ...     ...   
1512453     4029779  116170118950619028953     3.0   
1512454     4029780  106682844150448128352     5.0   
1512455     4029781  113802845225626516525     5.0   
1512456     4029782  111183656943575415377     5.0   
1512457     4029783  111299023453800564024     4.0   

                                                      text             name_y  \
3391     The curry is soup and the Samosas are too expe...            Kashish   
3392                                 It was the Vidalooooo            Kashish   
3393                                        Excelle

Only 5k reviews left!

Issue to be adressed in final model: for 1.6M reviews, this search-method is intractable.

In [None]:
df_filtered

Unnamed: 0,[,user_id,rating,text,gmap_id,name_y,latitude,longitude,category,avg_rating,num_of_reviews,price,Dining options,Popular for,Atmosphere,Crowd,distance
,93550.0,1.078742e+20,4.0,The food is good I love the vegetable fried ri...,0x89e3798f3b7ca959:0x4631aaee8e823742,Brigham Circle Chinese Food,42.333779,-71.10516,"['Chinese restaurant', 'Asian restaurant', 'Me...",3.6,187.0,1.0,"['Breakfast', 'Lunch', 'Dinner', 'Catering', '...","['Breakfast', 'Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cosy']","['Family friendly', 'Groups', 'Tourists']",2.852148
,93551.0,1.153763e+20,4.0,"The food was very delicious and fresh, I reall...",0x89e3798f3b7ca959:0x4631aaee8e823742,Brigham Circle Chinese Food,42.333779,-71.10516,"['Chinese restaurant', 'Asian restaurant', 'Me...",3.6,187.0,1.0,"['Breakfast', 'Lunch', 'Dinner', 'Catering', '...","['Breakfast', 'Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cosy']","['Family friendly', 'Groups', 'Tourists']",2.852148
,93552.0,1.001856e+20,4.0,"I am a huge fan of this place, but when I orde...",0x89e3798f3b7ca959:0x4631aaee8e823742,Brigham Circle Chinese Food,42.333779,-71.10516,"['Chinese restaurant', 'Asian restaurant', 'Me...",3.6,187.0,1.0,"['Breakfast', 'Lunch', 'Dinner', 'Catering', '...","['Breakfast', 'Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cosy']","['Family friendly', 'Groups', 'Tourists']",2.852148
,93553.0,1.104180e+20,5.0,This place is amazing! Stayed open late just t...,0x89e3798f3b7ca959:0x4631aaee8e823742,Brigham Circle Chinese Food,42.333779,-71.10516,"['Chinese restaurant', 'Asian restaurant', 'Me...",3.6,187.0,1.0,"['Breakfast', 'Lunch', 'Dinner', 'Catering', '...","['Breakfast', 'Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cosy']","['Family friendly', 'Groups', 'Tourists']",2.852148
,93554.0,1.168564e+20,5.0,I haven't had Chinese food in a year 🥰,0x89e3798f3b7ca959:0x4631aaee8e823742,Brigham Circle Chinese Food,42.333779,-71.10516,"['Chinese restaurant', 'Asian restaurant', 'Me...",3.6,187.0,1.0,"['Breakfast', 'Lunch', 'Dinner', 'Catering', '...","['Breakfast', 'Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cosy']","['Family friendly', 'Groups', 'Tourists']",2.852148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,2446643.0,1.155974e+20,4.0,Best BBQ available in Boston area.,0x89e382fa18a63ba5:0x711f6b660dd561ed,Blue Ribbon BBQ,42.348705,-71.22937,"['Barbecue restaurant', 'Restaurant']",4.4,643.0,2.0,"['Lunch', 'Dinner', 'Catering', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Groups', 'Tourists']",6.592319
,2446644.0,1.078629e+20,4.0,Good bbq.,0x89e382fa18a63ba5:0x711f6b660dd561ed,Blue Ribbon BBQ,42.348705,-71.22937,"['Barbecue restaurant', 'Restaurant']",4.4,643.0,2.0,"['Lunch', 'Dinner', 'Catering', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Groups', 'Tourists']",6.592319
,2446645.0,1.107969e+20,5.0,Barbq to die for,0x89e382fa18a63ba5:0x711f6b660dd561ed,Blue Ribbon BBQ,42.348705,-71.22937,"['Barbecue restaurant', 'Restaurant']",4.4,643.0,2.0,"['Lunch', 'Dinner', 'Catering', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Groups', 'Tourists']",6.592319
,2446646.0,1.142200e+20,5.0,Great Pulled Pork BBQ!,0x89e382fa18a63ba5:0x711f6b660dd561ed,Blue Ribbon BBQ,42.348705,-71.22937,"['Barbecue restaurant', 'Restaurant']",4.4,643.0,2.0,"['Lunch', 'Dinner', 'Catering', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Groups', 'Tourists']",6.592319


In [77]:
# inspect hours column
# df reset index
df.reset_index(drop=True, inplace=True)
df.head(10)
# drop column "Unnamed: 0"
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head(10)




Unnamed: 0,user_id,rating,text,name_y,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,Dining options,Popular for,Atmosphere,Crowd,distance
0,108160856436575183918,3.0,I will start by saying that the burger was tas...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684
1,115972281888483164475,5.0,DELICIOUS! Worth the money. Good fries. Amazin...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684
2,101236849850641353071,5.0,This place is top-notch. I went for the first ...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684
3,116422988597523226216,5.0,I love this place. The burgers are great and ...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684
4,117061396340108203379,4.0,"I had the Cuban Burger, to sum it up in one wo...",Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684
5,113617611739630621923,4.0,I have eaten here often and the burgers are th...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684
6,108966678917026202298,5.0,"I'm addicted to this place, amazing burgers! F...",Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684
7,101498284754899538734,1.0,Felt sick after eating here. The food was def...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684
8,114633309734763732646,1.0,It's not well managed. They did not have 3 of ...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684
9,106346653732339052795,1.0,I've eaten here before and considering the bur...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684


In [82]:
!pip install python-dateutil


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [105]:
### filter by opening hour -- DHATI

import pandas as pd
from datetime import datetime
import pytz 
from dateutil import parser



def is_open(current_time, hours_str):
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    current_day = days[current_time.weekday()]
    for day_segment in eval(hours_str):
        day, hours = day_segment[0], day_segment[1]
        print(day, hours)
        print(hours.split('–'))
        if day == current_day and hours != 'Closed':
            open_time, close_time = hours.split('–')
            open_time = parser.parse(open_time).time()
            close_time = parser.parse(close_time).time()
            if current_time.time() >= open_time and current_time.time() <= close_time:
                return True
    return False

user_timezone = pytz.timezone('America/New_York') 
current_time = datetime.now(user_timezone)

# Apply the function to filter DataFrame
df['is_open'] = df['hours'].apply(lambda x: is_open(current_time, x) if pd.notna(x) else False)


df_open_now = df[df['is_open']]


df_open_now

Thursday 10:30AM–9PM
['10:30AM', '9PM']
Friday 10:30AM–9PM
['10:30AM', '9PM']
Saturday 10:30AM–9PM
['10:30AM', '9PM']
Sunday 11AM–9PM
['11AM', '9PM']
Monday 10:30AM–9PM
['10:30AM', '9PM']
Tuesday 10:30AM–9PM
['10:30AM', '9PM']
Wednesday 10:30AM–9PM
['10:30AM', '9PM']
Thursday 10:30AM–9PM
['10:30AM', '9PM']
Friday 10:30AM–9PM
['10:30AM', '9PM']
Saturday 10:30AM–9PM
['10:30AM', '9PM']
Sunday 11AM–9PM
['11AM', '9PM']
Monday 10:30AM–9PM
['10:30AM', '9PM']
Tuesday 10:30AM–9PM
['10:30AM', '9PM']
Wednesday 10:30AM–9PM
['10:30AM', '9PM']
Thursday 10:30AM–9PM
['10:30AM', '9PM']
Friday 10:30AM–9PM
['10:30AM', '9PM']
Saturday 10:30AM–9PM
['10:30AM', '9PM']
Sunday 11AM–9PM
['11AM', '9PM']
Monday 10:30AM–9PM
['10:30AM', '9PM']
Tuesday 10:30AM–9PM
['10:30AM', '9PM']
Wednesday 10:30AM–9PM
['10:30AM', '9PM']
Thursday 10:30AM–9PM
['10:30AM', '9PM']
Friday 10:30AM–9PM
['10:30AM', '9PM']
Saturday 10:30AM–9PM
['10:30AM', '9PM']
Sunday 11AM–9PM
['11AM', '9PM']
Monday 10:30AM–9PM
['10:30AM', '9PM']
Tuesday 

Unnamed: 0,user_id,rating,text,name_y,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,Dining options,Popular for,Atmosphere,Crowd,distance,is_open,sentiment
0,108160856436575183918,3.0,I will start by saying that the burger was tas...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684,True,0.290476
1,115972281888483164475,5.0,DELICIOUS! Worth the money. Good fries. Amazin...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684,True,0.645000
2,101236849850641353071,5.0,This place is top-notch. I went for the first ...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684,True,0.307639
3,116422988597523226216,5.0,I love this place. The burgers are great and ...,Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684,True,0.432143
4,117061396340108203379,4.0,"I had the Cuban Burger, to sum it up in one wo...",Wayback Burgers,42.136791,-72.568911,"['Hamburger restaurant', 'American restaurant']",3.7,28.0,1,"[['Thursday', '10:30AM–9PM'], ['Friday', '10:3...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],['Family-friendly'],78.884684,True,0.162891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18671,113716905572389069483,5.0,Awesome grinders at a great price!,Paisano's Pizza Restaurant & Pub,42.238250,-72.720664,['Pizza restaurant'],4.3,478.0,2,"[['Sunday', '11AM–9PM'], ['Monday', '11AM–9:30...","['Dinner', 'Dessert', 'Seating']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Family-friendly', 'Group...",85.504237,True,1.000000
18672,111306903443904732724,1.0,Food is awesome,Paisano's Pizza Restaurant & Pub,42.238250,-72.720664,['Pizza restaurant'],4.3,478.0,2,"[['Sunday', '11AM–9PM'], ['Monday', '11AM–9:30...","['Dinner', 'Dessert', 'Seating']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Family-friendly', 'Group...",85.504237,True,1.000000
18673,109854958835353138820,5.0,Price and amount of food is super,Paisano's Pizza Restaurant & Pub,42.238250,-72.720664,['Pizza restaurant'],4.3,478.0,2,"[['Sunday', '11AM–9PM'], ['Monday', '11AM–9:30...","['Dinner', 'Dessert', 'Seating']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Family-friendly', 'Group...",85.504237,True,0.333333
18674,115238258752485956356,5.0,I got the alfredo and it was fantastic!,Paisano's Pizza Restaurant & Pub,42.238250,-72.720664,['Pizza restaurant'],4.3,478.0,2,"[['Sunday', '11AM–9PM'], ['Monday', '11AM–9:30...","['Dinner', 'Dessert', 'Seating']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Family-friendly', 'Group...",85.504237,True,0.500000


In [107]:
### lookup dish: DHATI, JANICE

# get dish from user as a prompt

import pandas as pd
import re

# Function to create a regex pattern for the dish to handle variations
def create_dish_pattern(dish_name):
    words = dish_name.split()
    if len(words) > 1:
        # Create patterns to match both possible word orders
        pattern1 = r'\b' + r'\b.*\b'.join(map(re.escape, words)) + r'\b'
        pattern2 = r'\b' + r'\b.*\b'.join(map(re.escape, reversed(words))) + r'\b'
        return re.compile(pattern1 + '|' + pattern2, re.IGNORECASE)
    else:
        return re.compile(r'\b' + re.escape(dish_name) + r'\b', re.IGNORECASE)

# Function to find reviews that mention the dish using the regex pattern
def find_dish_reviews(df, pattern):
    matches = df['text'].str.contains(pattern, na=False)
    return df[matches]

# Get dish name from user
dish_name = input("Enter the name of the dish you are looking for: ")
pattern = create_dish_pattern(dish_name)

# Find reviews mentioning the dish
dish_reviews = find_dish_reviews(df, pattern)


print(dish_reviews[['name_y', 'text']])

# go restaurant by restaurant, then dish by dish, and look all reviews that mention prompt (my best guess is use regex, but think intelligently. pasta alfredo vs alfredo pasta)

                                 name_y  \
103               Skipper Chowder House   
158               Skipper Chowder House   
321               Skipper Chowder House   
486                  Beech Tree Cantina   
806                D & A House of Pizza   
...                                 ...   
18545  Paisano's Pizza Restaurant & Pub   
18572  Paisano's Pizza Restaurant & Pub   
18690        Joe's American Bar & Grill   
18716        Joe's American Bar & Grill   
18734        Joe's American Bar & Grill   

                                                    text  
103    We had an absolutely lovely meal at the Skippe...  
158    Great food and if you are gluten free, they ar...  
321    We went at peak time.  It was very busy.  Didn...  
486    So, me and my girlfriend went in and ordered m...  
806    Their Grilled chicken is soooo good! Meatball ...  
...                                                  ...  
18545  Had the pasta padavana with shells.  It was ph...  
18572  Gene

### Stage 2: rank by sentiment for the dish - SUSANNAH, ETHAN.

In [118]:
from textblob import TextBlob

def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

def get_weighted_index(row):
    sentiment = get_sentiment(row['text'])
    avg_rating = row['rating']
    return 0.8 * sentiment + 0.2 * avg_rating

def get_top_restaurants(df, n):
    df['weighted_index'] = df.apply(get_weighted_index, axis=1)
    return df.sort_values('weighted_index', ascending=False).head(n)

get_top_restaurants(df, 10)


Unnamed: 0,user_id,rating,text,name_y,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,Dining options,Popular for,Atmosphere,Crowd,distance,is_open,sentiment,weighted_index
16120,110329840757359258292,5.0,"The best tuna in town! Also, spicy fries here ...",Ma Magoo's,42.388012,-71.142401,"['Pizza restaurant', 'Seafood restaurant']",4.5,747.0,1,"[['Monday', '11AM–9PM'], ['Tuesday', '11AM–9PM...","['Lunch', 'Dinner', 'Dessert', 'Seating']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],"['College students', 'Family-friendly', 'Group...",4.695637,True,1.0,1.8
7657,115564617266123280402,5.0,Best place ever!,Dalat Restaurant,42.25625,-71.823575,['Vietnamese restaurant'],4.7,294.0,1,"[['Monday', '11AM–8:30PM'], ['Tuesday', '11AM–...",['Dessert'],"['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy', 'Historic']","['Groups', 'Tourists']",39.760164,True,1.0,1.8
16907,117830470806722775132,5.0,Excellent food and service! Can't wait to re...,Red's Kitchen + Tavern,42.535435,-70.988811,"['Family restaurant', 'Bar', 'Breakfast restau...",4.4,1849.0,2,"[['Friday', '7AM–10PM'], ['Saturday', '7AM–10P...","['Breakfast', 'Lunch', 'Dinner', 'Catering', '...","['Breakfast', 'Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['Family-friendly', 'Groups', 'Tourists']",12.746936,True,1.0,1.8
5093,108747575562429978376,5.0,Excellent service & food.,Barrels and Boards,41.962909,-71.067442,['Restaurant'],4.4,388.0,2,"[['Saturday', '11:30AM–9:30PM'], ['Sunday', '1...","['Lunch', 'Dinner', 'Catering', 'Dessert', 'Se...","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Family-friendly', 'Group...",27.299476,True,1.0,1.8
1052,110202692407134189980,5.0,Best pizza in town,Londi's,42.530421,-71.206257,"['Pizza restaurant', 'Italian restaurant', 'De...",4.3,108.0,1,"[['Tuesday', '11AM–9PM'], ['Wednesday', '11AM–...","['Lunch', 'Dinner', 'Dessert', 'Seating']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Groups', 'Tourists']",14.036893,True,1.0,1.8
7629,107027563751179404037,5.0,Been coming here for years and it's always de...,Dalat Restaurant,42.25625,-71.823575,['Vietnamese restaurant'],4.7,294.0,1,"[['Monday', '11AM–8:30PM'], ['Tuesday', '11AM–...",['Dessert'],"['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy', 'Historic']","['Groups', 'Tourists']",39.760164,True,1.0,1.8
5089,103901666484836191758,5.0,Excellent place,Barrels and Boards,41.962909,-71.067442,['Restaurant'],4.4,388.0,2,"[['Saturday', '11:30AM–9:30PM'], ['Sunday', '1...","['Lunch', 'Dinner', 'Catering', 'Dessert', 'Se...","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Family-friendly', 'Group...",27.299476,True,1.0,1.8
7634,112307488018210260036,5.0,Excellent Pho place,Dalat Restaurant,42.25625,-71.823575,['Vietnamese restaurant'],4.7,294.0,1,"[['Monday', '11AM–8:30PM'], ['Tuesday', '11AM–...",['Dessert'],"['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy', 'Historic']","['Groups', 'Tourists']",39.760164,True,1.0,1.8
7635,114617476713029527958,5.0,Excellent vietnamese food. Rest people.,Dalat Restaurant,42.25625,-71.823575,['Vietnamese restaurant'],4.7,294.0,1,"[['Monday', '11AM–8:30PM'], ['Tuesday', '11AM–...",['Dessert'],"['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy', 'Historic']","['Groups', 'Tourists']",39.760164,True,1.0,1.8
5087,106276310004805760183,5.0,Wonderful,Barrels and Boards,41.962909,-71.067442,['Restaurant'],4.4,388.0,2,"[['Saturday', '11:30AM–9:30PM'], ['Sunday', '1...","['Lunch', 'Dinner', 'Catering', 'Dessert', 'Se...","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['College students', 'Family-friendly', 'Group...",27.299476,True,1.0,1.8


#### Combine all the above functions into a pipeline

In [120]:
def get_recommendations(df, dish_name, n):
    df_open_now = df[df['is_open']]
    pattern = create_dish_pattern(dish_name)
    dish_reviews = find_dish_reviews(df_open_now, pattern)
    return get_top_restaurants(dish_reviews, n)

dish_name = "pasta"
n = 10

get_recommendations(df, dish_name, n)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weighted_index'] = df.apply(get_weighted_index, axis=1)


Unnamed: 0,user_id,rating,text,name_y,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,Dining options,Popular for,Atmosphere,Crowd,distance,is_open,sentiment,weighted_index
2372,111523147644606987883,5.0,Excellent salad and pasta,Real Italian Gusto,42.418367,-71.11053,"['Italian restaurant', 'Pizza restaurant', 'Re...",4.5,298.0,2,"[['Tuesday', '4–9:30PM'], ['Wednesday', '4–9:3...",['Dessert'],"['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['Groups', 'Tourists']",4.885643,True,1.0,1.8
2343,111523645600054497504,5.0,Delicious pasta and pizza! Yummy cannoli!,Real Italian Gusto,42.418367,-71.11053,"['Italian restaurant', 'Pizza restaurant', 'Re...",4.5,298.0,2,"[['Tuesday', '4–9:30PM'], ['Wednesday', '4–9:3...",['Dessert'],"['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['Groups', 'Tourists']",4.885643,True,1.0,1.8
10983,102394399621420082113,5.0,The pasta was cooked perfect the veal parm was...,Via Italian Table,42.263258,-71.790047,"['Italian restaurant', 'Bar', 'Brunch restaura...",4.5,1258.0,2,"[['Sunday', '10:30AM–8PM'], ['Monday', '11:30A...","['Dinner', 'Dessert']","['Dinner', 'Solo dining']","['Casual', 'Cozy', 'Upscale']","['Groups', 'Tourists']",37.982255,True,0.95,1.76
2084,115459903312188149992,5.0,Great food and great service! The pasta marina...,Royal II Restaurant and Grill,41.70813,-70.211778,"['Mediterranean restaurant', 'American restaur...",4.7,468.0,2,"[['Tuesday', '11:30AM–9PM'], ['Wednesday', '11...",['Dessert'],"['Breakfast', 'Lunch', 'Solo dining']","['Casual', 'Cozy']","['Groups', 'Tourists']",62.594873,True,0.933333,1.746667
10966,102020125676324565176,5.0,"Great food, Great atmosphere, excellent servic...",Via Italian Table,42.263258,-71.790047,"['Italian restaurant', 'Bar', 'Brunch restaura...",4.5,1258.0,2,"[['Sunday', '10:30AM–8PM'], ['Monday', '11:30A...","['Dinner', 'Dessert']","['Dinner', 'Solo dining']","['Casual', 'Cozy', 'Upscale']","['Groups', 'Tourists']",37.982255,True,0.866667,1.693333
6923,107969651779868358614,5.0,"I think Walden Kitchen is the best. The food,...",Walden Italian Kitchen,42.457232,-71.395286,"['Italian restaurant', 'Restaurant']",4.2,118.0,1,"[['Tuesday', '12–8PM'], ['Wednesday', '12–8PM'...",['Dessert'],"['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']",['Groups'],18.466629,True,0.833333,1.666667
1167,103436946711155604013,5.0,Absolutely delicious food. We got a sandwich...,Ciao Bella,42.287301,-71.807065,"['Italian restaurant', 'Pizza restaurant']",4.4,434.0,2,"[['Monday', '11AM–8:30PM'], ['Tuesday', '11AM–...","['Lunch', 'Dinner', 'Dessert', 'Seating']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],"['Family-friendly', 'Groups', 'Tourists']",38.588117,True,0.8,1.64
8003,110521893061806722341,5.0,Great beer selection. All the food was delicio...,Nosh & Grog,42.187296,-71.305771,"['New American restaurant', 'Bar', 'Gastropub']",4.3,217.0,2,"[['Saturday', '4–9PM'], ['Sunday', '4–9PM'], [...","['Lunch', 'Dinner', 'Dessert']","['Lunch', 'Dinner', 'Solo dining']","['Casual', 'Cozy']","['Groups', 'Tourists']",17.275617,True,0.716667,1.573333
10737,106054547458195058145,5.0,Enjoyed a delicious lunch of California Pasta ...,Ruby Tuesday,41.921129,-71.35851,"['American restaurant', 'Bar', 'European resta...",3.7,364.0,2,"[['Sunday', '11AM–10PM'], ['Monday', '11AM–10P...","['Lunch', 'Dinner']","['Lunch', 'Dinner', 'Solo dining']",['Casual'],any,33.856805,True,0.666667,1.533333
2484,111930609364565752759,5.0,Fantastic meal for Valentine’s Day. The mushro...,Porto,42.34821,-71.080545,"['Italian restaurant', 'American restaurant', ...",4.3,208.0,3,"[['Monday', 'Closed'], ['Tuesday', '5–9PM'], [...","['Breakfast', 'Lunch', 'Dinner', 'Catering', '...","['Breakfast', 'Lunch', 'Dinner', 'Solo dining'...","['Casual', 'Cozy']","['College students', 'Family-friendly', 'Group...",1.273822,True,0.65,1.52
