In [None]:
import numpy as np
import pandas as pd
from langdetect.lang_detect_exception import LangDetectException
from transformers import pipeline

In [None]:
# Read restaurant_reviews_sample.csv into the `reviews` DataFrame that the
# following cells explore and extend.
reviews = pd.read_csv("restaurant_reviews_sample.csv")

In [None]:
# Show the column labels of the loaded frame.
reviews.columns

In [None]:
# Number of reviews per distinct value of the "state" column.
reviews["state"].value_counts()

In [None]:
# Frequency of each value in "stars_x" (presumably the per-review star
# rating — confirm against the data source).
reviews["stars_x"].value_counts()

In [None]:
# Peek at the text of the row labelled 0.
reviews.loc[0, "text"]

In [None]:
# Flag reviews containing a literal "!". regex=False does a plain substring
# match, consistent with the "?" check, and avoids compiling a regex.
reviews["has_exclamation"] = reviews["text"].fillna("").str.contains("!", regex=False)

In [None]:
# Trim the frame: remove the vote-count, address/geo, is_open and stars_y columns.
reviews = reviews.drop(
    columns=[
        "useful",
        "funny",
        "cool",
        "address",
        "postal_code",
        "latitude",
        "longitude",
        "is_open",
        "stars_y",
    ]
)

In [None]:
# Flag reviews containing a question mark. regex=False is required here:
# "?" is a regex metacharacter and must be matched as a literal character.
reviews["has_question"] = reviews["text"].fillna("").str.contains("?", regex=False)

In [None]:
def _uppercase_ratio(text):
    """Return the fraction of characters in ``text`` that are uppercase.

    The denominator is the full character count (spaces and punctuation
    included), floored at 1 so an empty string yields 0.0.
    """
    n_upper = 0
    for ch in text:
        if ch.isupper():
            n_upper += 1
    return n_upper / max(1, len(text))


# A review counts as "shouting" when more than 30% of its characters are uppercase.
reviews["uppercase_ratio"] = reviews["text"].fillna("").apply(_uppercase_ratio)
reviews["is_shouting"] = reviews["uppercase_ratio"].gt(0.3)

In [None]:
# Inspect the sixth review (position 5) among those flagged as shouting.
reviews.loc[reviews["is_shouting"], "text"].iloc[5]

In [None]:
# Preview the first rows, including the feature columns added above.
reviews.head()

In [None]:
# How many reviews are / are not flagged as shouting.
reviews["is_shouting"].value_counts()

In [None]:
# Character count per review (missing text counts as 0), then summary statistics.
reviews["length"] = reviews["text"].fillna("").str.len()
reviews["length"].describe()

In [None]:
from langdetect import detect, DetectorFactory
# Pin the detector seed — presumably so detect() gives repeatable results
# across runs; see the langdetect README to confirm.
DetectorFactory.seed = 0

def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Parameters
    ----------
    text : str
        Review text to classify.

    Returns
    -------
    bool
        True if ``detect(text)`` returns ``"en"``. False for non-string or
        blank input, and when detection fails with ``LangDetectException``.
    """
    # Nothing to detect in missing/blank text; skip the detector call entirely.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == "en"
    except LangDetectException:
        # Narrow handler (previously a bare ``except``): only a failed detection
        # is treated as "not English"; KeyboardInterrupt and genuine bugs
        # now propagate instead of being silently swallowed.
        return False
    
# Keep only reviews detected as English. .copy() makes the result an
# independent frame, so the sentiment columns added to it later do not
# trigger pandas' SettingWithCopyWarning.
reviews = reviews[reviews["text"].fillna("").apply(is_english)].copy()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm
import pandas as pd

# Review texts as a plain Python list (missing values become "").
texts = reviews["text"].fillna("").to_list()

# --- Aspect-based sentiment (ABSA) classifier ---
absa_model_name = "yangheng/distilbert-base-uncased-absa"

absa_model = AutoModelForSequenceClassification.from_pretrained(absa_model_name)
absa_tokenizer = AutoTokenizer.from_pretrained(absa_model_name, use_fast=False)

absa_pipeline = pipeline(
    task="text-classification",
    tokenizer=absa_tokenizer,
    model=absa_model,
    batch_size=16,
    truncation=True,
    device=-1,  # CPU
)

# --- Whole-review sentiment classifier (smaller model, faster) ---
overall_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    batch_size=32,
    truncation=True,
    device=-1,  # CPU
)

# -----------------------------
# Batch processing function
# -----------------------------
def batch_process(texts, pipe, batch_size=32):
    """Run ``pipe`` over ``texts`` in consecutive chunks, showing a progress bar.

    Parameters
    ----------
    texts : list of str
        Inputs to process; sliced into chunks of ``batch_size``.
    pipe : callable
        Called once per chunk with a list of texts; must return an iterable
        of results for that chunk.
    batch_size : int, default 32
        Number of texts handed to ``pipe`` per call. Must be positive.

    Returns
    -------
    list
        The results of every chunk, concatenated in input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive. A negative value previously
        produced an empty result list without any error.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    results = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        results.extend(pipe(texts[start:start + batch_size]))
    return results

# -----------------------------
# ABSA for each aspect
# -----------------------------
aspects = ["food", "service", "atmosphere"]

# Sort by length for speed (presumably so each chunk holds similar-length
# texts); sorted_indices[k] is the original position of texts_sorted[k].
sorted_indices = sorted(range(len(texts)), key=lambda i: len(texts[i]))
texts_sorted = [texts[i] for i in sorted_indices]


def _restore_order(values, order):
    """Return ``values`` rearranged so that ``values[k]`` lands at ``order[k]``.

    Parameters
    ----------
    values : list
        Items computed in sorted order.
    order : list of int
        Original position of each item in ``values``.

    Returns
    -------
    list
        Items in original order.

    Raises
    ------
    ValueError
        If the two lists differ in length, which would otherwise leave
        silent ``None`` gaps in the result.
    """
    if len(values) != len(order):
        raise ValueError(
            f"expected {len(order)} results, got {len(values)}"
        )
    restored = [None] * len(order)
    for original_pos, value in zip(order, values):
        restored[original_pos] = value
    return restored


for aspect in aspects:
    print(f"Processing aspect: {aspect}")
    # NOTE(review): the aspect marker is appended after the review text and the
    # pipeline is built with truncation=True; for very long reviews the marker
    # may be truncated away — confirm against the model's expected input format.
    absa_inputs = [f"{t} [ASP] {aspect}" for t in texts_sorted]
    absa_results = batch_process(absa_inputs, absa_pipeline, batch_size=16)

    # Extract labels and put them back in original row order
    labels = [r["label"] for r in absa_results]
    col = _restore_order(labels, sorted_indices)
    reviews[f"{aspect}_sentiment"] = col

# -----------------------------
# Overall sentiment
# -----------------------------
overall_results = batch_process(texts_sorted, overall_pipeline, batch_size=64)
labels = [r["label"] for r in overall_results]

overall_col = _restore_order(labels, sorted_indices)
reviews["overall_sentiment"] = overall_col

print("Done! ABSA and overall sentiment stored in DataFrame.")
