In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score



### Load data

In [2]:
# Load CSV
df = pd.read_csv('../data/cleaned_for_modeling.csv')
print(f"Loaded {len(df):,} rows\n")

Loaded 24,188,451 rows



### Process data

In [3]:
# Define functions

def is_likely_english(text):
    """Fast heuristic: >80% ASCII characters = likely English."""
    if pd.isna(text) or len(str(text).strip()) == 0:
        return False
    text = str(text)
    ascii_count = sum(1 for c in text if ord(c) < 128)
    return (ascii_count / len(text)) > 0.8

def filter_english_only(df):
    """Filter to English-only reviews."""
    print(f"Before English filter: {len(df):,} rows")
    df_clean = df.copy()
    df_clean['is_english'] = df_clean['text'].apply(is_likely_english)
    df_clean = df_clean[df_clean['is_english']].drop(columns=['is_english'])
    print(f"  After: {len(df_clean):,} rows ({len(df) - len(df_clean):,} removed)\n")
    return df_clean

def filter_original_tag(df):
    """Remove reviews with Google's (Original) translation tag."""
    print(f"Before (Original) tag filter: {len(df):,} rows")
    df_clean = df[~df['text'].str.contains(r'\(Original\)', na=False)].copy()
    print(f"  After: {len(df_clean):,} rows ({len(df) - len(df_clean):,} removed)\n")
    return df_clean

In [4]:
# Apply english filtering pipeline

df = (df
    .pipe(filter_english_only)
    .pipe(filter_original_tag)
    .reset_index(drop=True)
)

print(f"Final dataset: {len(df):,} rows")
df.info()

Before English filter: 24,188,451 rows
  After: 24,075,251 rows (113,200 removed)

Before (Original) tag filter: 24,075,251 rows
  After: 22,640,876 rows (1,434,375 removed)

Final dataset: 22,640,876 rows
<class 'pandas.DataFrame'>
RangeIndex: 22640876 entries, 0 to 22640875
Data columns (total 4 columns):
 #   Column   Dtype  
---  ------   -----  
 0   user_id  float64
 1   rating   float64
 2   text     str    
 3   gmap_id  str    
dtypes: float64(2), str(2)
memory usage: 4.0 GB


### Sample data

In [5]:
# Function to sample

def create_stratified_sample(df, sample_size=250000, random_state=22):  # 250K sample (adjust size as needed)
    """Create stratified sample maintaining rating distribution."""
    print(f"Creating stratified sample of {sample_size:,} reviews...")
    
    df_sample, _ = train_test_split(
        df, 
        train_size=sample_size, 
        stratify=df['rating'], 
        random_state=random_state
    )
    
    print(f"\nRating distribution:")
    print(df_sample['rating'].value_counts().sort_index())
    print("\nPercentages:")
    print((df_sample['rating'].value_counts(normalize=True).sort_index() * 100).round(2))
    print()
    
    return df_sample

# Create sample
df_250K = create_stratified_sample(df)

Creating stratified sample of 250,000 reviews...

Rating distribution:
rating
1.0     15322
2.0      9417
3.0     21516
4.0     50368
5.0    153377
Name: count, dtype: int64

Percentages:
rating
1.0     6.13
2.0     3.77
3.0     8.61
4.0    20.15
5.0    61.35
Name: proportion, dtype: float64



### Aspect keywords

In [6]:
# 5 basic aspects to get model going
# We need to nail down our aspect list

aspect_keywords = {
    'service': [
        'staff', 'waiter', 'waitress', 'server', 'employee', 'service', 
        'manager', 'cashier', 'worker', 'attendant', 'helpful', 'rude', 
        'friendly', 'slow service', 'customer service'
    ],
    'quality': [
        'food', 'quality', 'taste', 'delicious', 'fresh', 'stale', 'bland', 
        'flavor', 'product', 'meal', 'dish', 'overcooked', 'undercooked', 
        'cold', 'burnt'
    ],
    'cleanliness': [
        'clean', 'dirty', 'filthy', 'hygiene', 'sanitary', 'bathroom', 
        'restroom', 'table', 'floor', 'messy', 'spotless', 'gross'
    ],
    'value': [
        'price', 'expensive', 'cheap', 'overpriced', 'affordable', 'value', 
        'worth', 'cost', 'money', 'budget'
    ],
    'wait_time': [
        'wait', 'waiting', 'slow', 'fast', 'quick', 'long wait', 'forever', 
        'time', 'delayed', 'prompt'
    ]
}

aspects = list(aspect_keywords.keys())
sentiments = ['positive', 'negative', 'neutral']

print("Aspect keywords defined:")
for aspect, keywords in aspect_keywords.items():
    print(f"  {aspect}: {len(keywords)} keywords")

Aspect keywords defined:
  service: 15 keywords
  quality: 15 keywords
  cleanliness: 12 keywords
  value: 10 keywords
  wait_time: 10 keywords


In [7]:
# Functions to extract aspects from text

def extract_aspect_labels(text, rating):
    """
    Extract aspects mentioned in text and assign sentiment based on rating.
    
    Args:
        text: Review text
        rating: Overall rating (1-5)
        
    Returns:
        dict: {aspect: sentiment} for aspects found in text
    """
    text_lower = str(text).lower()
    aspect_dict = {}
    
    for aspect, keywords in aspect_keywords.items():
        if any(keyword in text_lower for keyword in keywords):
            if rating <= 2:
                aspect_dict[aspect] = 'negative'
            elif rating >= 4:
                aspect_dict[aspect] = 'positive'
            else:
                aspect_dict[aspect] = 'neutral'
    
    return aspect_dict

def clean_text(df):
    """Clean text column."""
    df_clean = df.copy()
    df_clean['text'] = df_clean['text'].fillna('').astype(str)
    return df_clean

def apply_aspect_extraction(df):
    """Extract aspect labels from reviews."""
    print("Extracting aspect labels...")
    df_labeled = df.copy()
    
    df_labeled['aspect_labels'] = df_labeled.apply(
        lambda row: extract_aspect_labels(row['text'], row['rating']), 
        axis=1
    )
    
    print(f"  Processed {len(df_labeled):,} reviews\n")
    return df_labeled

In [8]:
# Apply aspect extraction pipeline

llm_test_250K = (df_250K
    .pipe(clean_text)
    .pipe(apply_aspect_extraction)
)

Extracting aspect labels...
  Processed 250,000 reviews



### Feature Engineering

In [9]:
# Function to convert labels to columns
def create_binary_label_columns(df):
    """Convert aspect_labels dict to binary columns."""
    print("Creating binary aspect-sentiment columns...")
    df_binary = df.copy()
    
    # Initialize columns
    for aspect in aspects:
        for sentiment in sentiments:
            df_binary[f'{aspect}_{sentiment}'] = 0
    
    # Fill based on aspect_labels
    for idx, row in df_binary.iterrows():
        for aspect, sentiment in row['aspect_labels'].items():
            df_binary.at[idx, f'{aspect}_{sentiment}'] = 1
    
    # Get label column names
    label_cols = [f'{aspect}_{sentiment}' for aspect in aspects for sentiment in sentiments]
    
    # Convert to float
    for col in label_cols:
        df_binary[col] = df_binary[col].astype(float)
    
    print(f"  Created {len(label_cols)} binary label columns\n")
    return df_binary, label_cols

# Create binary columns
llm_test_250K, label_cols = create_binary_label_columns(llm_test_250K)

# Summary
print("Label distribution:")
print(llm_test_250K[label_cols].sum().sort_values(ascending=False))

Creating binary aspect-sentiment columns...
  Created 15 binary label columns

Label distribution:
quality_positive        60713.0
service_positive        60687.0
wait_time_positive      33851.0
value_positive          25628.0
cleanliness_positive    17066.0
service_negative         9228.0
wait_time_negative       7047.0
quality_negative         6984.0
quality_neutral          6461.0
service_neutral          5300.0
wait_time_neutral        4506.0
value_negative           4317.0
value_neutral            3771.0
cleanliness_negative     3158.0
cleanliness_neutral      2461.0
dtype: float64


In [10]:
# Train/test split

def create_train_val_split(df, test_size=0.2, random_state=42):
    """Split data into train and validation sets."""
    print(f"Creating train/val split ({int((1-test_size)*100)}/{int(test_size*100)})...")
    
    train_df, val_df = train_test_split(df, test_size=test_size, random_state=random_state)
    
    print(f"  Train size: {len(train_df):,}")
    print(f"  Val size:   {len(val_df):,}\n")
    
    return train_df, val_df

# Split
train_df, val_df = create_train_val_split(llm_test_250K, test_size=0.2, random_state=22)

Creating train/val split (80/20)...
  Train size: 200,000
  Val size:   50,000



In [11]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    """Tokenize text and prepare labels."""
    tokenized = tokenizer(
        examples['text'], 
        padding='max_length', 
        truncation=True, 
        max_length=512
    )
    
    # Prepare multi-label targets
    labels = []
    for i in range(len(examples['text'])):
        label_row = [float(examples[col][i]) for col in label_cols]
        labels.append(label_row)
    
    tokenized['labels'] = labels
    return tokenized

# Create datasets
print("Creating HuggingFace datasets...")
train_dataset = Dataset.from_pandas(train_df[['text'] + label_cols])
val_dataset = Dataset.from_pandas(val_df[['text'] + label_cols])

# Tokenize
print("Tokenizing datasets...")
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'] + label_cols)
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=['text'] + label_cols)

print(f"Train dataset: {len(train_dataset):,} samples")
print(f"Val dataset:   {len(val_dataset):,} samples\n")

Creating HuggingFace datasets...
Tokenizing datasets...


Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train dataset: 200,000 samples
Val dataset:   50,000 samples



*Note: 0% is a display bug in the notebook/library version. 

### Train Model - DistilBERT

In [12]:
# Define evaluation metrics
def compute_metrics(pred):
    """Compute evaluation metrics for multi-label classification."""
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [13]:
# Load model
print("Loading DistilBERT model...")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_cols),
    problem_type="multi_label_classification"
)

print(f"Model loaded with {len(label_cols)} output labels\n")

Loading DistilBERT model...


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Model loaded with 15 output labels



In [14]:
# Training configuration
training_args = TrainingArguments(
    output_dir='./results_absa',           # Default: None | (required parameter)
    eval_strategy="epoch",                 # Default: "no" | evaluate each epoch
    save_strategy="epoch",                 # Default: "steps" | save each epoch
    load_best_model_at_end=True,           # Default: False | load best model
    metric_for_best_model='f1',            # Default: "loss" | F1 score
    logging_steps=100,                     # Default: 500 | log more frequently
    per_device_train_batch_size=32,        # Default: 8 | 4x larger batches *(for local training, likely need to reduce for datahub)
    dataloader_num_workers=4,              # Default: 0 | parallel data loading *(for local training, likely need to reduce for datahub)
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train
print("Starting training...\n")
trainer.train()

Starting training...



  super().__init__(loader)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.041512,0.035645,0.9016,0.850103,0.899225,0.839742
2,0.029207,0.034038,0.91218,0.875732,0.914469,0.857032
3,0.021521,0.037442,0.92148,0.886648,0.905401,0.874661


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  super().__init__(loader)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  super().__init__(loader)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].


TrainOutput(global_step=18750, training_loss=0.033914218928019206, metrics={'train_runtime': 24849.9525, 'train_samples_per_second': 24.145, 'train_steps_per_second': 0.755, 'total_flos': 7.9498865664e+16, 'train_loss': 0.033914218928019206, 'epoch': 3.0})

### Evaluate Model

In [15]:
# Evaluate on validation set
results = trainer.evaluate()

print("\nFinal Validation Results:")
print(f"  Accuracy:  {results['eval_accuracy']:.4f}")
print(f"  F1 Score:  {results['eval_f1']:.4f}")
print(f"  Precision: {results['eval_precision']:.4f}")
print(f"  Recall:    {results['eval_recall']:.4f}")

  super().__init__(loader)



Final Validation Results:
  Accuracy:  0.9215
  F1 Score:  0.8866
  Precision: 0.9054
  Recall:    0.8747
