In [1]:
import numpy as np
import pandas as pd
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [2]:
# Location of the sampled reviews file, named once so it is easy to swap.
REVIEWS_CSV = "restaurant_reviews_sample.csv"
reviews = pd.read_csv(REVIEWS_CSV)

In [3]:
# List the columns provided by the CSV before deciding which ones to keep.
reviews.columns

Index(['review_id', 'user_id', 'business_id', 'stars_x', 'useful', 'funny',
       'cool', 'text', 'date', 'name', 'address', 'city', 'state',
       'postal_code', 'latitude', 'longitude', 'stars_y', 'review_count',
       'is_open', 'attributes', 'categories', 'hours', 'is_restaurant'],
      dtype='object')

In [4]:
# Number of reviews per value of the `state` column.
reviews["state"].value_counts()

state
PA    8017
LA    7562
TN    5765
FL    5751
MO    3794
NV    3008
AZ    2809
IN    2722
NJ    2339
CA    1933
AB     842
ID     799
DE     570
IL     534
NC      29
HI      19
CO      12
Name: count, dtype: int64

In [5]:
# Distribution of `stars_x`.
# NOTE(review): presumably the rating given in the review itself (the `_x`
# suffix suggests a merge with a business table) — confirm against the source data.
reviews["stars_x"].value_counts()

stars_x
5    19994
4    11379
1     5591
3     5485
2     4056
Name: count, dtype: int64

In [6]:
# Show the text of the row labelled 0 as a sample review.
reviews.loc[0, "text"]

"Love going here for happy hour or dinner!  Great patio with fans to beat the StL heat!   Also...very accomodating at this location.  I like the Veal Milanese but with mixed greens instead of pasta!  they'll modify the menu to suit your taste!"

In [7]:
# Flag reviews that contain at least one "!". Missing text is treated as an
# empty string. regex=False makes this a literal substring match, the same
# way the question-mark flag is computed.
reviews["has_exclamation"] = reviews["text"].fillna("").str.contains("!", regex=False)

In [8]:
# Columns that the cells below never read.
unused_columns = [
    "useful", "funny", "cool",
    "address", "postal_code", "latitude", "longitude",
    "is_open", "stars_y",
]
reviews = reviews.drop(columns=unused_columns)

In [9]:
# Literal (non-regex) search: "?" is a regex metacharacter, so it must not be
# interpreted as a pattern. Missing text is treated as an empty string.
review_text = reviews["text"].fillna("")
reviews["has_question"] = review_text.str.contains("?", regex=False)

In [10]:
# A review counts as "shouting" when more than this share of ALL its
# characters (spaces and punctuation included) are uppercase.
SHOUTING_THRESHOLD = 0.3


def _uppercase_ratio(text):
    """Return the fraction of characters in `text` that are uppercase (0.0 for empty text)."""
    uppercase_count = sum(1 for char in text if char.isupper())
    # max(1, ...) avoids a division by zero on empty strings.
    return uppercase_count / max(1, len(text))


reviews["uppercase_ratio"] = reviews["text"].fillna("").apply(_uppercase_ratio)
reviews["is_shouting"] = reviews["uppercase_ratio"] > SHOUTING_THRESHOLD

In [11]:
# Spot-check one of the reviews flagged as shouting.
shouting_texts = reviews.loc[reviews["is_shouting"], "text"]
shouting_texts.iloc[5]

"Don't order hot food here: former employee says they're instructed to PICK UP FOOD THAT FALLS ON THE FLOOR, RE-FRY OR GRILL IT, AND SERVE IT!!!!!  Staggering."

In [12]:
# Preview the first rows, including the feature columns added above.
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars_x,text,date,name,city,state,review_count,attributes,categories,hours,is_restaurant,has_exclamation,has_question,uppercase_ratio,is_shouting
0,XW_LfMv0fV21l9c6xQd_lw,9OAtfnWag-ajVxRbUTGIyg,lj-E32x9_FA7GmUrBGBEWg,4,Love going here for happy hour or dinner! Gre...,2014-06-27 22:44:01,Brio Italian Grille,St Louis,MO,366,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Bars, Pizza, Nightlife, Cocktail Bars, Italian...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-22:0', ...",True,True,False,0.033058,False
1,RGV9GWhAAfAAlYyd4vho7g,Zs8Zk3sgh5JxRmoZW4PJcg,3ZynJ94VpIdDlaArmEp2Rg,3,A couple friends and I stopped by for some lat...,2013-10-03 16:24:30,More Than Just Ice Cream,Philadelphia,PA,239,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","Food, Sandwiches, Ice Cream & Frozen Yogurt, C...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",True,False,False,0.034527,False
2,zqmkEnp1kfU2vosDcG2kMg,KqKXOl0PMlZGBMlw8OUpyA,-If0ps0QhOLCYVWQWs9RYg,5,Yes! I love this place! Maple Street Patisseri...,2013-05-28 21:37:01,Maple Street Patisserie,New Orleans,LA,171,"{'Ambience': ""{'romantic': False, 'intimate': ...","Restaurants, Food, Sandwiches, Bakeries","{'Tuesday': '6:0-17:0', 'Wednesday': '6:0-17:0...",True,True,False,0.037915,False
3,bi6GaeWDGceGv62lXTIKQA,RgtbLaiU22zqaCk20HgbiQ,bjhCtlYHrkgA5Ku8l-rB3g,1,Very disappointed. We went to eat at 2:15 on ...,2015-01-04 02:26:46,Our Daily Bread,Santa Barbara,CA,106,"{'BikeParking': 'True', 'Caters': 'True', 'Alc...","Restaurants, Cafes, Breakfast & Brunch, Bakeri...","{'Monday': '0:0-0:0', 'Tuesday': '6:30-15:30',...",True,False,False,0.02507,False
4,TgDp1TErom3UNglKhDy2uw,zUB7xoTlhbg7_ofHg8Qp0w,W8Z4rXYkmZlEVSaxIVjyvg,5,A truly delightful find! This is the transform...,2017-08-16 16:44:52,Redwood Rotisserie + Grill,Reno,NV,464,"{'RestaurantsTakeOut': 'True', 'Caters': 'True...","Restaurants, American (New), Cocktail Bars, Ba...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-20:0', ...",True,True,False,0.017525,False


In [13]:
# How many reviews are flagged as shouting versus not.
reviews["is_shouting"].value_counts()

is_shouting
False    46498
True         7
Name: count, dtype: int64

In [14]:
# Character count of each review (missing text counts as length 0), computed
# with the vectorised string accessor instead of a Python-level apply.
reviews["length"] = reviews["text"].fillna("").str.len()
reviews["length"].describe()

count    46505.000000
mean       537.289754
std        493.258350
min         10.000000
25%        218.000000
50%        383.000000
75%        685.000000
max       4999.000000
Name: length, dtype: float64

In [15]:
from langdetect import detect, DetectorFactory
# Fix langdetect's seed before the first detection so repeated runs give the
# same results (NOTE(review): langdetect's README describes detection as
# non-deterministic without this — confirm for the installed version).
DetectorFactory.seed = 0

def is_english(text):
    """Return True when `detect` classifies `text` as English ("en").

    Detection is best-effort: if `detect` raises an ordinary error for this
    text, the text is treated as non-English and False is returned.

    Args:
        text: The string to classify.

    Returns:
        bool: True if the detected language code is "en", otherwise False.
    """
    try:
        return detect(text) == "en"
    except Exception:
        # Catch only ordinary errors. A bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit, making a long detection pass
        # impossible to interrupt and silently marking rows as non-English.
        return False
    
# Keep only the reviews detected as English. The explicit .copy() makes
# `reviews` an independent frame rather than a selection of the unfiltered
# one, so assigning new columns to it later does not raise pandas'
# SettingWithCopyWarning.
reviews = reviews[reviews["text"].fillna("").apply(is_english)].copy()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm
import pandas as pd

# -----------------------------
# Collect the review texts
# -----------------------------
# Plain Python list of the review texts, in the same row order as `reviews`;
# missing text is replaced by an empty string.
texts = reviews["text"].fillna("").tolist()

# -----------------------------
# Model setup
# -----------------------------
CPU_DEVICE = -1  # device index passed to both pipelines: run on the CPU

# Options shared by both pipelines: CPU inference, truncate over-long inputs.
common_kwargs = {"device": CPU_DEVICE, "truncation": True}

# Aspect-based sentiment (ABSA): tokenizer and model are loaded explicitly
# and then wrapped in a text-classification pipeline.
absa_model_name = "yangheng/distilbert-base-uncased-absa"
absa_tokenizer = AutoTokenizer.from_pretrained(absa_model_name, use_fast=False)
absa_model = AutoModelForSequenceClassification.from_pretrained(absa_model_name)

absa_pipeline = pipeline(
    "text-classification",
    model=absa_model,
    tokenizer=absa_tokenizer,
    batch_size=16,
    **common_kwargs,
)

# Overall (whole-review) sentiment pipeline.
overall_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    batch_size=32,
    **common_kwargs,
)

# -----------------------------
# Batch processing function
# -----------------------------
def batch_process(texts, pipe, batch_size=32):
    """Run `pipe` over `texts` chunk by chunk and return all results as one flat list.

    Args:
        texts: Sequence of inputs; must support len() and slicing.
        pipe: Callable that takes one chunk of `texts` and returns an
            iterable of results for that chunk.
        batch_size: Number of inputs handed to `pipe` per call.

    Returns:
        list: The results of every chunk, concatenated in input order.
    """
    chunk_starts = range(0, len(texts), batch_size)
    # The outer loop walks the chunks (with a progress bar); the inner loop
    # flattens each chunk's results into the single output list.
    return [
        result
        for start in tqdm(chunk_starts, desc="Processing batches")
        for result in pipe(texts[start:start + batch_size])
    ]

# -----------------------------
# ABSA for each aspect
# -----------------------------
aspects = ["food", "service", "atmosphere"]

# Optional: process the texts in order of increasing length for speed
# (similar-length texts end up in the same batch). `sorted_indices[k]` is the
# original position of `texts_sorted[k]`, used below to undo the sort.
sorted_indices = sorted(range(len(texts)), key=lambda i: len(texts[i]))
texts_sorted = [texts[i] for i in sorted_indices]

for aspect in aspects:
    print(f"Processing aspect: {aspect}")

    # Pass the aspect as the second segment of a sentence pair rather than
    # appending it to the end of the review. The pipeline truncates over-long
    # inputs, so an aspect appended at the end was cut off for long reviews,
    # leaving all three aspects with the same truncated input. With a pair,
    # truncation shortens the longer segment (the review) and the aspect survives.
    # NOTE(review): confirm against the model card that this checkpoint
    # expects the (text, aspect) pair format.
    absa_inputs = [{"text": t, "text_pair": aspect} for t in texts_sorted]
    absa_results = batch_process(absa_inputs, absa_pipeline, batch_size=16)

    # Keep only the predicted label of each result.
    labels = [r["label"] for r in absa_results]

    # Put the labels back in the original row order of `reviews`.
    col = [None] * len(texts)
    for idx, label in zip(sorted_indices, labels):
        col[idx] = label
    reviews[f"{aspect}_sentiment"] = col

# -----------------------------
# Overall sentiment
# -----------------------------
overall_results = batch_process(texts_sorted, overall_pipeline, batch_size=64)
labels = [result["label"] for result in overall_results]

# Map each label to the original row position of its text, then read the
# labels back out in row order (positions without a label stay None).
label_at_position = dict(zip(sorted_indices, labels))
overall_col = [label_at_position.get(position) for position in range(len(texts))]
reviews["overall_sentiment"] = overall_col

print("Done! ABSA and overall sentiment stored in DataFrame.")


Processing aspect: food


Processing batches:  90%|█████████ | 2621/2904 [5:39:52<36:41,  7.78s/it]  


KeyboardInterrupt: 