In [9]:
import pandas as pd
import json
from tqdm import tqdm
import subprocess
from sklearn.metrics import classification_report, f1_score
import warnings

warnings.filterwarnings('ignore')

### Load data

In [2]:
# Load CSV
df = pd.read_csv('../data/cleaned_for_LLM.csv') 
print(f"Loaded {len(df):,} rows\n")
df.head(3)

# Filter out manual labeling set
absa_ids = set(pd.read_csv('../data/absa_training_set.csv', usecols=['review_id'])['review_id'].astype(str))
df = df[~df['review_id'].isin(absa_ids)].reset_index(drop=True)
print(f"After filtering absa set: {len(df):,} rows")

Loaded 22,624,379 rows



Unnamed: 0,review_id,user_id,rating,text,gmap_id
0,456e420929727f933dbaed63eff45cde53c7b92438cf0d...,1.067134e+20,5.0,"Easy process, extremely friendly, helpful staf...",0x80960c29f2e3bf29:0x4b291f0d275a5699
1,ea2ad448a8b443c1c42c5d4ca9dd84d02fe9f2f110b993...,1.024963e+20,5.0,My girlfriends and I took a weekend ski trip t...,0x80960c29f2e3bf29:0x4b291f0d275a5699
2,77efbe6a6f4d27512b59bb2f878b0ac8b533aa03a11fb7...,1.102407e+20,5.0,The team at Black Tie never disappoints our se...,0x80960c29f2e3bf29:0x4b291f0d275a5699


### Sample data

In [3]:
# LLM inference is slow
SAMPLE_SIZE = 1000  # Start small and increase slowly

print(f"Sampling {SAMPLE_SIZE:,} reviews...")
df_sample = df.sample(n=SAMPLE_SIZE, random_state=2)
print(f"Sample size: {len(df_sample):,}\n")

Sampling 1,000 reviews...
Sample size: 1,000



# Define Prompt 

In [4]:
def create_prompt(review_text):
    """Create ABSA prompt for LLM."""
    prompt = f"""Analyze this review and extract aspect-sentiment pairs.

Review: {review_text}

Identify mentions of these aspects:
- food_quality
- service
- wait_time
- price_value
- cleanliness
- atmosphere

For each aspect mentioned, determine sentiment: positive, negative, or neutral.

Respond ONLY with valid JSON in this format:
{{"food_quality": "positive", "service": "negative"}}

If an aspect is not mentioned, do not include it.

JSON:"""
    return prompt

# Ollama API Call

***Note:*** Must download Ollama first @ `ollama.ai`

In [5]:
def query_ollama(prompt, model="mistral"):
    """Query local Ollama model."""
    try:
        result = subprocess.run(
            ['ollama', 'run', model],
            input=prompt,
            text=True,
            capture_output=True,
            timeout=30
        )
        return result.stdout.strip()
    except Exception as e:
        return f"Error: {str(e)}"

# Test if Ollama is installed
try:
    test = subprocess.run(['ollama', '--version'], capture_output=True)
    print("Ollama detected ✓")
except:
    print("ERROR: Ollama not installed. Install from https://ollama.ai")

Ollama detected ✓


# Run LLM 

In [6]:
def extract_aspects_llm(review_text):
    """Extract aspect-sentiment pairs using LLM."""
    prompt = create_prompt(review_text)
    response = query_ollama(prompt, model="mistral")
    
    try:
        # Parse JSON response
        aspects = json.loads(response)
        return aspects
    except:
        # If JSON parsing fails, return empty
        return {}

# Run inference on sample
print(f"Running LLM inference on {len(df_sample):,} reviews...")
results = []

for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    aspects = extract_aspects_llm(row['text'])
    results.append({
        'review_id': row.get('review_id', idx),
        'rating': row['rating'],
        'text': row['text'],
        'llm_predictions': aspects
    })

results_df = pd.DataFrame(results)
print(f"\nCompleted {len(results_df):,} predictions\n")

Running LLM inference on 1,000 reviews...


100%|███████████████████████████████████████| 1000/1000 [48:42<00:00,  2.92s/it]


Completed 1,000 predictions






### Load Aspects

In [7]:
aspects = ['food_quality', 'service', 'wait_time', 'price_value', 'cleanliness', 'atmosphere']
sentiments = ['positive', 'negative', 'neutral']
label_cols = [f'{aspect}_{sentiment}' for aspect in aspects for sentiment in sentiments]

# Initialize all labels to 0
for col in label_cols:
    results_df[col] = 0

# Fill in predictions
for idx, row in results_df.iterrows():
    preds = row['llm_predictions']
    for aspect, sentiment in preds.items():
        if aspect in aspects and sentiment in sentiments:
            results_df.at[idx, f'{aspect}_{sentiment}'] = 1

print("Label distribution:")
for aspect in aspects:
    print(f"\n{aspect}:")
    for sentiment in sentiments:
        col = f'{aspect}_{sentiment}'
        print(f"  {sentiment:<10} {results_df[col].sum():>12,}")

Label distribution:

food_quality:
  positive            381
  negative             15
  neutral             140

service:
  positive            146
  negative            158
  neutral             288

wait_time:
  positive             15
  negative             37
  neutral             272

price_value:
  positive            105
  negative             34
  neutral             229

cleanliness:
  positive             90
  negative             24
  neutral             249

atmosphere:
  positive            312
  negative             22
  neutral             182


## Ground Truth 

In [8]:
# Apply same weak supervision as other models
def get_sentiment(rating):
    if rating <= 2:
        return 'negative'
    elif rating >= 4:
        return 'positive'
    else:
        return 'neutral'

aspect_keywords = {
    'food_quality': ['food', 'meal', 'dish', 'taste', 'flavor', 'delicious', 'fresh',
                     'menu', 'order', 'coffee', 'drink', 'burger', 'chicken', 'pizza'],
    'service': ['service', 'staff', 'employee', 'manager', 'worker', 'associate',
                'waiter', 'waitress', 'server', 'cashier', 'friendly', 'rude',
                'helpful', 'unprofessional', 'told', 'said', 'asked'],
    'wait_time': ['wait', 'waiting', 'slow', 'fast', 'quick', 'minutes', 'hour',
                  'line', 'queue', 'busy', 'long', 'forever', 'delay', 'prompt'],
    'price_value': ['price', 'prices', 'expensive', 'cheap', 'cost', 'value', 'worth',
                    'affordable', 'overpriced', 'reasonable', 'pricey', 'money', 'deal'],
    'cleanliness': ['clean', 'dirty', 'filthy', 'hygiene', 'gross', 'spotless',
                    'mess', 'messy', 'tidy', 'maintained'],
    'atmosphere': ['atmosphere', 'ambiance', 'decor', 'seating', 'crowded', 'quiet',
                   'noisy', 'comfortable', 'cozy', 'parking', 'location', 'space']
}

# Create ground truth labels
results_df['sentiment'] = results_df['rating'].map(get_sentiment)

for aspect, keywords in aspect_keywords.items():
    pattern = '|'.join(keywords)
    aspect_mentioned = results_df['text'].str.lower().str.contains(pattern, na=False)
    for sentiment in sentiments:
        results_df[f'{aspect}_{sentiment}_true'] = ((aspect_mentioned) & (results_df['sentiment'] == sentiment)).astype(int)

In [11]:
# Get predictions and ground truth
y_true = results_df[[f'{col}_true' for col in label_cols]].values
y_pred = results_df[label_cols].values

print("LLM (Mistral - 1K Sample):")
print()
print(f"F1 Score (macro):    {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
print(f"F1 Score (weighted): {f1_score(y_true, y_pred, average='weighted', zero_division=0):.4f}")
print()
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=label_cols, zero_division=0))

LLM (Mistral - 1K Sample):

F1 Score (macro):    0.2919
F1 Score (weighted): 0.4819

Classification Report:
                       precision    recall  f1-score   support

food_quality_positive       0.65      0.87      0.75       285
food_quality_negative       0.60      0.20      0.31        44
 food_quality_neutral       0.09      0.41      0.15        32
     service_positive       0.74      0.43      0.54       253
     service_negative       0.25      0.89      0.39        44
      service_neutral       0.01      0.12      0.02        26
   wait_time_positive       0.67      0.08      0.15       119
   wait_time_negative       0.32      0.43      0.37        28
    wait_time_neutral       0.01      0.21      0.03        19
 price_value_positive       0.73      0.66      0.69       117
 price_value_negative       0.18      0.46      0.26        13
  price_value_neutral       0.01      0.11      0.02        19
 cleanliness_positive       0.48      0.88      0.62        49
 cleanlin