In [1]:
import numpy as np
import pandas as pd
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the sampled review table (a CSV in the working directory) into `reviews`.
reviews = pd.read_csv("restaurant_reviews_sample.csv")

In [4]:
# Display the column labels of the loaded frame.
reviews.columns

Index(['review_id', 'user_id', 'business_id', 'stars_x', 'useful', 'funny',
       'cool', 'text', 'date', 'name', 'address', 'city', 'state',
       'postal_code', 'latitude', 'longitude', 'stars_y', 'review_count',
       'is_open', 'attributes', 'categories', 'hours', 'is_restaurant'],
      dtype='object')

In [5]:
# Count how many reviews fall under each value of the `state` column.
reviews["state"].value_counts()

state
PA    8017
LA    7562
TN    5765
FL    5751
MO    3794
NV    3008
AZ    2809
IN    2722
NJ    2339
CA    1933
AB     842
ID     799
DE     570
IL     534
NC      29
HI      19
CO      12
Name: count, dtype: int64

In [6]:
# Count how many reviews carry each `stars_x` value.
# NOTE(review): `stars_x` looks like the review's own rating (the `_x` suffix
# suggests a merge with business-level `stars_y`) -- confirm against the data prep.
reviews["stars_x"].value_counts()

stars_x
5    19994
4    11379
1     5591
3     5485
2     4056
Name: count, dtype: int64

In [7]:
# Peek at the text of the review stored under row label 0.
reviews.loc[0, "text"]

"Love going here for happy hour or dinner!  Great patio with fans to beat the StL heat!   Also...very accomodating at this location.  I like the Veal Milanese but with mixed greens instead of pasta!  they'll modify the menu to suit your taste!"

In [8]:
# Flag reviews containing at least one "!" (missing text counts as no match).
# Literal substring match, in line with the `has_question` feature: no regex
# interpretation of the pattern is wanted here.
reviews["has_exclamation"] = reviews["text"].fillna("").str.contains("!", regex=False)

In [9]:
# Columns removed from `reviews` before the feature-engineering cells.
dropped_columns = [
    "useful", "funny", "cool",
    "address", "postal_code", "latitude", "longitude",
    "is_open", "stars_y",
]
reviews = reviews.drop(columns=dropped_columns)

In [10]:
# Flag reviews containing at least one "?" (missing text counts as no match).
# regex=False is required: a bare "?" is a quantifier, not a valid pattern.
reviews = reviews.assign(
    has_question=reviews["text"].fillna("").str.contains("?", regex=False)
)

In [11]:
def _uppercase_ratio(text):
    """Return the fraction of characters in `text` that are uppercase.

    The denominator is the full character count (letters, digits, spaces,
    punctuation); an empty string yields 0.0.
    """
    upper_count = sum(1 for ch in text if ch.isupper())
    # max(1, ...) keeps the division defined for empty strings.
    return upper_count / max(1, len(text))


# A review counts as "shouting" when more than 30% of its characters are uppercase.
reviews["uppercase_ratio"] = reviews["text"].fillna("").apply(_uppercase_ratio)
reviews["is_shouting"] = reviews["uppercase_ratio"].gt(0.3)

In [12]:
# Show the sixth review (position 5) among those flagged as shouting.
reviews.loc[reviews["is_shouting"], "text"].iloc[5]

"Don't order hot food here: former employee says they're instructed to PICK UP FOOD THAT FALLS ON THE FLOOR, RE-FRY OR GRILL IT, AND SERVE IT!!!!!  Staggering."

In [13]:
# Preview the first rows, including the feature columns added above.
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars_x,text,date,name,city,state,review_count,attributes,categories,hours,is_restaurant,has_exclamation,has_question,uppercase_ratio,is_shouting
0,XW_LfMv0fV21l9c6xQd_lw,9OAtfnWag-ajVxRbUTGIyg,lj-E32x9_FA7GmUrBGBEWg,4,Love going here for happy hour or dinner! Gre...,2014-06-27 22:44:01,Brio Italian Grille,St Louis,MO,366,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Bars, Pizza, Nightlife, Cocktail Bars, Italian...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-22:0', ...",True,True,False,0.033058,False
1,RGV9GWhAAfAAlYyd4vho7g,Zs8Zk3sgh5JxRmoZW4PJcg,3ZynJ94VpIdDlaArmEp2Rg,3,A couple friends and I stopped by for some lat...,2013-10-03 16:24:30,More Than Just Ice Cream,Philadelphia,PA,239,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","Food, Sandwiches, Ice Cream & Frozen Yogurt, C...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",True,False,False,0.034527,False
2,zqmkEnp1kfU2vosDcG2kMg,KqKXOl0PMlZGBMlw8OUpyA,-If0ps0QhOLCYVWQWs9RYg,5,Yes! I love this place! Maple Street Patisseri...,2013-05-28 21:37:01,Maple Street Patisserie,New Orleans,LA,171,"{'Ambience': ""{'romantic': False, 'intimate': ...","Restaurants, Food, Sandwiches, Bakeries","{'Tuesday': '6:0-17:0', 'Wednesday': '6:0-17:0...",True,True,False,0.037915,False
3,bi6GaeWDGceGv62lXTIKQA,RgtbLaiU22zqaCk20HgbiQ,bjhCtlYHrkgA5Ku8l-rB3g,1,Very disappointed. We went to eat at 2:15 on ...,2015-01-04 02:26:46,Our Daily Bread,Santa Barbara,CA,106,"{'BikeParking': 'True', 'Caters': 'True', 'Alc...","Restaurants, Cafes, Breakfast & Brunch, Bakeri...","{'Monday': '0:0-0:0', 'Tuesday': '6:30-15:30',...",True,False,False,0.02507,False
4,TgDp1TErom3UNglKhDy2uw,zUB7xoTlhbg7_ofHg8Qp0w,W8Z4rXYkmZlEVSaxIVjyvg,5,A truly delightful find! This is the transform...,2017-08-16 16:44:52,Redwood Rotisserie + Grill,Reno,NV,464,"{'RestaurantsTakeOut': 'True', 'Caters': 'True...","Restaurants, American (New), Cocktail Bars, Ba...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-20:0', ...",True,True,False,0.017525,False


In [14]:
# How many reviews are flagged as shouting vs. not.
reviews["is_shouting"].value_counts()

is_shouting
False    46498
True         7
Name: count, dtype: int64

In [18]:
# Character count of every review (missing text counts as length 0), computed
# with the vectorised string accessor -- the same way `text_len` is derived
# further down -- followed by summary statistics of the lengths.
reviews["length"] = reviews["text"].fillna("").str.len()
reviews["length"].describe()

count    46505.000000
mean       537.289754
std        493.258350
min         10.000000
25%        218.000000
50%        383.000000
75%        685.000000
max       4999.000000
Name: length, dtype: float64

In [19]:
from transformers import pipeline
from tqdm import tqdm

# Sentiment classifier (SST-2 fine-tuned DistilBERT) running on CPU.
# truncation/max_length cap each input at 512 tokens.
sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=-1,  # CPU
    truncation=True,
    max_length=512
)

batch_size = 64
texts = reviews["text"].fillna("").tolist()

# Process the shortest reviews first, but remember which row every text came
# from. `order[k]` is the original row position of the k-th text after sorting;
# sorting positions with a length key is stable, so `texts` ends up in exactly
# the order `sorted(texts, key=len)` would give.
order = sorted(range(len(texts)), key=lambda pos: len(texts[pos]))
texts = [texts[pos] for pos in order]

# NOTE(review): the pipeline call below receives no `batch_size` argument, so
# the slicing here mainly sets the progress-bar granularity -- confirm whether
# true batched inference was intended.
results = []
for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i + batch_size]
    try:
        batch_results = sentiment(batch)
    except Exception as e:
        # Best-effort retry: report the failing slice (i is its start offset),
        # cut every text to its first 512 characters and classify again.
        # A second failure propagates.
        print(f"Batch {i} failed: {e}")
        batch = [t[:512] for t in batch]
        batch_results = sentiment(batch)
    results.extend(batch_results)

# `results` is in length-sorted order (same as `texts`). Scatter the predicted
# labels back to the original row positions so they line up with `reviews`.
assert len(results) == len(order), "expected one prediction per review"
sentiment_by_row = [None] * len(order)
for pos, prediction in zip(order, results):
    sentiment_by_row[pos] = prediction["label"]
reviews["sentiment"] = sentiment_by_row

Device set to use cpu
100%|██████████| 727/727 [2:19:13<00:00, 11.49s/it]   


In [63]:
# Reload both tables from disk.
# NOTE: "restaurant_reviews_processed.csv" is also written by a later cell of
# this notebook, so on a fresh kernel it must already exist from an earlier run.
reviews = pd.read_csv("restaurant_reviews_processed.csv")
# Untouched copy of the original sample (all original columns, original row order).
reviews_old = pd.read_csv("restaurant_reviews_sample.csv")

In [21]:
from tqdm import tqdm
from transformers import pipeline
import warnings

# Suppress warnings whose message starts with "Length of IterableDataset".
warnings.filterwarnings("ignore", message="Length of IterableDataset")

# Zero-shot classifier (MNLI-tuned DistilBERT) on CPU, used to tag each review
# with one of the candidate aspects below.
zero_shot = pipeline(
    "zero-shot-classification",
    model="typeform/distilbert-base-uncased-mnli",
    device=-1,
    truncation=True,
    max_length=512,
)

labels = ["food", "service", "atmosphere"]
batch_size = 16

# Shortest reviews first. NOTE: after this sort `texts` (and therefore
# `results`) no longer follows the row order of `reviews`.
texts = sorted(reviews["text"].fillna("").tolist(), key=len)

results = []
for start in tqdm(range(0, len(texts), batch_size)):
    predictions = zero_shot(
        texts[start:start + batch_size],
        candidate_labels=labels,
        multi_label=False,
    )
    # Keep only the first entry of each prediction's `labels` list as the aspect.
    results.extend(prediction["labels"][0] for prediction in predictions)

Device set to use cpu
100%|██████████| 2907/2907 [6:41:32<00:00,  8.29s/it]  


In [23]:
# Persist the predicted aspects as a one-column CSV (no index column).
# Rows are in the same order as `results`; no row identifier is stored.
aspect_frame = pd.DataFrame(results)
aspect_frame.to_csv("aspect.csv", index=False)

In [64]:
# Take the stored sentiment column from the previously processed frame.
sentiments = reviews["sentiment"]

In [65]:
# Re-create the length-sorted row order in which the model outputs were
# produced, then attach the labels.
reviews_old = reviews_old.assign(text_len=reviews_old["text"].fillna("").str.len())
# The inference cells ordered texts with Python's `sorted(texts, key=len)`,
# which is stable. `sort_values` defaults to a non-stable quicksort, so reviews
# of equal length could be permuted and receive each other's labels; a stable
# sort reproduces the same tie order.
reviews_old = reviews_old.sort_values("text_len", kind="stable").reset_index(drop=True)

# aspect.csv holds a single column of labels; select it explicitly as a Series.
aspects = pd.read_csv("aspect.csv").iloc[:, 0]

# Both assignments below align on the 0..n-1 index, so a length mismatch would
# otherwise go unnoticed (silent NaNs / dropped labels).
assert len(sentiments) == len(reviews_old), "sentiment column and reviews differ in length"
assert len(aspects) == len(reviews_old), "aspect.csv and reviews differ in length"

reviews_old["sentiment"] = sentiments
reviews_old["aspect"] = aspects

In [66]:
# Write the labelled, length-sorted frame to disk (no index column).
# NOTE: this overwrites the same file that an earlier cell reads into `reviews`.
reviews_old.to_csv("restaurant_reviews_processed.csv", index=False)

In [54]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Fix langdetect's seed so repeated runs classify the same text the same way.
DetectorFactory.seed = 0

def is_english(text):
    """Return True when langdetect classifies `text` as English ("en").

    Texts on which detection fails (langdetect raises LangDetectException,
    e.g. for text without usable features) are treated as not English.
    Only that exception is caught, so KeyboardInterrupt and genuine
    programming errors still surface.
    """
    try:
        return detect(text) == "en"
    except LangDetectException:
        return False

# Keep only rows whose text is detected as English (missing text is treated as "").
reviews_old = reviews_old[reviews_old["text"].fillna("").apply(is_english)]

In [59]:
# Inspect the first ten reviews whose predicted aspect is "atmosphere".
reviews_old.loc[reviews_old["aspect"].eq("atmosphere")].head(10)

Unnamed: 0,review_id,user_id,business_id,stars_x,useful,funny,cool,text,date,name,...,stars_y,review_count,is_open,attributes,categories,hours,is_restaurant,text_len,sentiment,aspect
4,dhdYxUpl3QdCRRl__UbrHw,Hn8O2RQijYIVLFNF5VPWTA,jDCk9DEwSS_TamliID4QuQ,4,1,1,1,Try the cuban,2010-09-02 14:30:59,Norma's Sweets Bakery,...,4.0,26,1,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Bakeries, Imported Food, Food, Ethnic Food, Sp...",,True,13,NEGATIVE,atmosphere
6,W72nmYSjE78uYwyEz4fvCg,9mM6R32PKGTPJ18GUZOofA,6kAXOzE7fqaBZINQV_-_mg,5,1,0,0,Best pho in the city,2010-09-06 21:04:01,Kien Giang,...,3.5,326,1,"{'BikeParking': 'False', 'RestaurantsTakeOut':...","Vietnamese, Restaurants","{'Tuesday': '11:0-20:30', 'Wednesday': '11:0-2...",True,20,POSITIVE,atmosphere
7,5Xo7PRMm3xHhKzr9deIddQ,9mM6R32PKGTPJ18GUZOofA,6kAXOzE7fqaBZINQV_-_mg,5,0,0,0,Best pho in Nashville.,2010-08-08 23:12:11,Kien Giang,...,3.5,326,1,"{'BikeParking': 'False', 'RestaurantsTakeOut':...","Vietnamese, Restaurants","{'Tuesday': '11:0-20:30', 'Wednesday': '11:0-2...",True,22,POSITIVE,atmosphere
9,oTCPxRS1OrRva__8srEsqA,H3ZfkYllVxBxtq_V-HfDQg,pP4q0Mym-qt20nRqTKbzhQ,4,0,0,0,just decent for this price.,2016-08-03 15:04:33,Morton's The Steakhouse,...,3.5,149,0,"{'WiFi': ""u'no'"", 'RestaurantsTakeOut': 'False...","Steakhouses, Restaurants, Nightlife, Seafood, ...","{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",True,27,POSITIVE,atmosphere
12,-zwftAZo5ne9f28Hz7luAw,zUiid-eUPLf6xHwRpH4KMg,jpmjCRNAlWBBf1_fuMhfaA,1,0,0,0,Su Ottavo is permanently closed.,2017-01-29 19:39:23,Restaurant Su Ottavo,...,3.5,23,0,"{'BusinessParking': ""{'garage': False, 'street...","Italian, Restaurants, Pizza","{'Tuesday': '16:0-22:0', 'Wednesday': '16:0-22...",True,32,NEGATIVE,atmosphere
16,dyz9L_6rxKH5In8n4UhVdg,uDCXh4P5c9vZcoUZU3jcLA,7YiLEuHuUONZrMPT0OxXUQ,3,0,0,0,Quick service. Smelly entrance-yuck!,2010-09-13 22:38:57,McDonald's,...,1.5,89,0,"{'RestaurantsReservations': 'False', 'Restaura...","Fast Food, Restaurants, Burgers, Coffee & Tea,...","{'Monday': '6:0-23:0', 'Tuesday': '6:0-23:0', ...",True,36,NEGATIVE,atmosphere
20,0IU4f57SNCHz8PgnYVluWg,_xa2i1FSYkM9C7b81SX9zA,wWLic8cT6apGZWK8wWWSQQ,5,0,0,0,Great food and creative casual dining,2019-02-10 14:36:43,Hearth Kitchen,...,3.5,97,1,"{'RestaurantsTableService': 'True', 'Restauran...","Restaurants, Italian, American (New), Pizza","{'Tuesday': '16:0-21:0', 'Wednesday': '16:0-21...",True,37,POSITIVE,atmosphere
21,l5ewnhs1SZHG2rnuDj1sLQ,DEpdvClKTuYXcMam4ACiew,a7FSs8soBoxfkPXvzSsvbg,5,1,0,0,Best Mexican food in middle Tennessee!,2010-03-18 16:50:10,Garcias Mexican Restaurant,...,4.0,158,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Mexican, Restaurants, Steakhouses","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ...",True,38,NEGATIVE,atmosphere
28,DBmo43QuruK1rVU7hNKlag,yhaxEW2nUboIPH4E9G8sbg,6kAXOzE7fqaBZINQV_-_mg,4,0,0,0,"Pho, spring and egg rolls. Pretty good.",2012-07-12 02:17:10,Kien Giang,...,3.5,326,1,"{'BikeParking': 'False', 'RestaurantsTakeOut':...","Vietnamese, Restaurants","{'Tuesday': '11:0-20:30', 'Wednesday': '11:0-2...",True,40,POSITIVE,atmosphere
30,laxLzwfuaKD9eRytg76U1g,diKaKMts7WxbQqxyRB8wmQ,lRPOodYgeRScvDDDO8_Qjg,2,0,0,0,Delivery took forever. Food was just okay.,2017-04-22 19:36:00,Tuttorosso Restaurant & Pizzeria,...,4.0,336,1,"{'RestaurantsTableService': 'True', 'Restauran...","Italian, Australian, Pizza, Bars, Wine Bars, S...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",True,42,POSITIVE,atmosphere
