### Installing Required Dependencies

This cell installs all external libraries required for the project to run smoothly in the Google Colab environment. These include packages for transformer-based models, dataset processing, metrics evaluation, and visualization. This step ensures that all necessary tools are available before executing the main pipeline.

In [None]:
# Install required dependencies
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    ``subprocess.check_call`` raises ``CalledProcessError`` when pip exits
    with a non-zero status, so a failed install is not silent.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# pip distribution names that the rest of the notebook imports from.
packages = [
    "openpyxl",
    "langdetect",
    "pandas",
    "numpy",
    "transformers",
    "datasets",
    "evaluate",
    "torch",
    "matplotlib",
    "seaborn",
    "accelerate",
    "plotly",
    # sklearn is imported below (train_test_split, confusion_matrix, ...)
    "scikit-learn",
]

# Distributions whose importable module name differs from the pip name.
import_name_overrides = {
    "scikit-learn": "sklearn",
}

# Try to import each package and pip-install only the ones that are missing.
for package in packages:
    module_name = import_name_overrides.get(package, package.replace("-", "_"))
    try:
        __import__(module_name)
    except ImportError:
        print(f"Installing {package}...")
        install_package(package)

# Import libraries
import pandas as pd
import numpy as np
import os
import re
import glob
import time
import gc
from itertools import product
from langdetect import detect, DetectorFactory
from google.colab import drive
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Seed langdetect, NumPy and PyTorch so repeated runs start from the same state
DetectorFactory.seed = 0
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when one is visible, otherwise fall back to the CPU
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print(f"Using device: {device}")
if has_cuda:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is in bytes; 1024**3 converts it to GiB for display
    print(f"GPU Memory: {gpu_props.total_memory / 1024**3:.1f} GB")

# Memory management for Colab
def cleanup_memory():
    """Run the garbage collector, then empty the CUDA cache when a GPU is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

### Mounting Google Drive

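This cell mounts the user's Google Drive to the current Colab session and then runs the data-preparation pipeline: it loads the review spreadsheets, cleans the text, keeps only English reviews, assigns aspect and sentiment labels, and splits the data into training, validation, and test sets. All datasets, model checkpoints, and result files are stored and accessed through Google Drive to maintain persistence and avoid data loss between sessions. This step is essential for managing larger files and organizing experimental outputs.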

In [None]:
# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Read and Combine Data
print("Loading dataset files...")
data_path = '/content/drive/MyDrive/Thesis/dataset/Indonesia/'
# glob returns matches in arbitrary order. Sorting keeps the row order of the
# combined frame (and therefore the seeded train/val/test split) stable.
xlsx_files = sorted(glob.glob(os.path.join(data_path, '*.xlsx')))

if not xlsx_files:
    # Every later step needs data, so fail here with a clear message instead
    # of printing a hint and crashing further down.
    raise FileNotFoundError(
        f"No .xlsx files found in {data_path}. "
        "Please check the path and ensure files exist."
    )
print(f"Found {len(xlsx_files)} .xlsx files")

# Filled by the per-file loading loop: one cleaned DataFrame per workbook
all_reviews = []

def clean_text(text):
    """Normalize a raw review value into lower-case ASCII text.

    Values flagged by ``pd.isna`` become ``""``. Anything else is converted
    to ``str``, HTML-like tags are removed, each run of non-ASCII characters
    (emoji included) is replaced by one space, whitespace runs are collapsed,
    and the result is stripped and lower-cased.
    """
    if pd.isna(text):
        return ""

    # Applied in order: the whitespace collapse must run after the
    # non-ASCII replacement, which can introduce adjacent spaces.
    substitutions = (
        (r'<[^>]+>', ''),
        (r'[^\x00-\x7F]+', ' '),
        (r'\s+', ' '),
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()

def is_english(text):
    """Check if text is predominantly in English.

    Returns ``False`` for non-string input, for text shorter than 10
    characters after stripping, and when langdetect cannot classify the
    text. Otherwise returns whether ``detect`` reports ``'en'``.
    """
    # Non-strings and very short snippets are rejected up front.
    if not isinstance(text, str) or len(text.strip()) < 10:
        return False

    # Imported here so the narrowed except clause below has the exception
    # type in scope without touching the file-level imports.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Catch only langdetect's own failure. A bare ``except`` would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        return False

# Columns every workbook must provide to be usable.
required_columns = ('Review Title', 'Review Text', 'Rating')

for file_path in xlsx_files:
    file_name = os.path.basename(file_path)
    try:
        print(f"Processing {file_name}...")
        df = pd.read_excel(file_path)

        if any(column not in df.columns for column in required_columns):
            print(f"Skipping {file_path}: Missing required columns")
            continue

        # Rows without a rating cannot be given a sentiment label.
        df = df.dropna(subset=['Rating']).copy()

        # Fill missing title/body BEFORE casting to str: astype(str) turns NaN
        # into the literal text 'nan', which would end up inside the review.
        title = df['Review Title'].fillna('').astype(str)
        body = df['Review Text'].fillna('').astype(str)
        df['text'] = (title + ' ' + body).str.strip()

        # Length filter on the raw text, then again after cleaning
        df = df[df['text'].str.len() > 10]
        df['text'] = df['text'].apply(clean_text)
        df = df[df['text'].str.len() > 5]

        print("Filtering for English text...")
        # astype(bool) keeps the mask boolean even when no rows are left
        english_mask = df['text'].apply(is_english).astype(bool)
        df = df.loc[english_mask, ['text', 'Rating']].copy()

        all_reviews.append(df)
        print(f"Added {len(df)} English reviews from {file_name}")

    except Exception as e:
        # Best effort per file: report the problem and move on to the next one
        print(f"Error processing {file_path}: {e}")

if not all_reviews:
    # Without combined_df every later step fails, so stop with a clear error
    # rather than printing a hint and hitting a NameError later.
    raise ValueError("No valid data found. Please check your files.")

combined_df = pd.concat(all_reviews, ignore_index=True)
print(f"\nTotal combined reviews: {len(combined_df)}")

# Aspect Labeling
print("\nApplying aspect labeling...")

# Substrings that mark a review as being about digital accessibility
digital_keywords = [
    'digital',
    'wifi',
    'internet',
    'charging',
    'application',
    'self check-in',
    'self service',
    'qr code',
    'technology',
    'website',
    'touchscreen',
    'cashless',
    'online booking',
    'digital map',
    'barcode',
    'scan',
    'mobile app',
    'online system',
    'connectivity',
    'device',
]

# Substrings that mark a review as being about customer service
service_keywords = [
    'service',
    'staff',
    'employee',
    'security',
    'cashier',
    'greeting',
    'customer service',
    'hospitality',
    'slow',
    'helpful',
    'unhelpful',
    'rude',
    'polite',
    'friendly',
    'unfriendly',
    'queue',
    'waiting',
    'assistance',
    'responsiveness',
    'behavior',
]

def assign_aspect(text):
    """Label a review by case-insensitive substring match against the keyword lists.

    Digital keywords are checked first, so a review that matches both lists
    is labelled 'digital_accessibility'. Reviews matching neither list are
    labelled 'other'.
    """
    lowered = text.lower()
    if any(keyword in lowered for keyword in digital_keywords):
        return 'digital_accessibility'
    if any(keyword in lowered for keyword in service_keywords):
        return 'customer_service'
    return 'other'

# Tag every review with its aspect label
combined_df['aspect'] = combined_df['text'].map(assign_aspect)

# Sentiment Labeling based on Rating
print("Applying sentiment labeling based on ratings...")

def assign_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Ratings of 4 or more are 'positive', ratings of 2 or less are
    'negative', and every other value is 'neutral'.
    """
    if rating >= 4:
        return 'positive'
    if rating <= 2:
        return 'negative'
    return 'neutral'

combined_df['sentiment'] = combined_df['Rating'].map(assign_sentiment)

# Data Filtering
print("\nFiltering data...")
print(f"Before filtering: {len(combined_df)} samples")

# Keep only reviews that matched one of the two target aspects
has_target_aspect = combined_df['aspect'] != 'other'
combined_df = combined_df.loc[has_target_aspect]
print(f"After removing 'other' aspect: {len(combined_df)} samples")

# Keep only clearly positive or negative reviews
is_polar = combined_df['sentiment'] != 'neutral'
combined_df = combined_df.loc[is_polar]
print(f"After removing 'neutral' sentiment: {len(combined_df)} samples")

print("\nFinal class distribution:")
distribution_sections = (
    ("Aspect distribution:", 'aspect'),
    ("\nSentiment distribution:", 'sentiment'),
)
for heading, column in distribution_sections:
    print(heading)
    print(combined_df[column].value_counts())

# Dataset Split and Label Encoding
print("\nSplitting dataset and encoding labels...")

# Sorted label names give a stable label -> id assignment
aspect_labels = sorted(combined_df['aspect'].unique())
sentiment_labels = sorted(combined_df['sentiment'].unique())

aspect_label2id = dict(zip(aspect_labels, range(len(aspect_labels))))
aspect_id2label = dict(enumerate(aspect_labels))

sentiment_label2id = dict(zip(sentiment_labels, range(len(sentiment_labels))))
sentiment_id2label = dict(enumerate(sentiment_labels))

print(f"Aspect labels: {aspect_labels}")
print(f"Sentiment labels: {sentiment_labels}")

combined_df['aspect_labels'] = combined_df['aspect'].map(aspect_label2id)
combined_df['sentiment_labels'] = combined_df['sentiment'].map(sentiment_label2id)

# Split data
X = combined_df['text']
y_aspect = combined_df['aspect_labels']
y_sentiment = combined_df['sentiment_labels']

# Stratify on the aspect/sentiment pair so every split keeps the joint distribution
stratify_key = combined_df['aspect'].astype(str).str.cat(
    combined_df['sentiment'].astype(str), sep='_'
)

# First cut: hold out 15% of the data as the test set
(
    X_temp, X_test,
    y_aspect_temp, y_aspect_test,
    y_sentiment_temp, y_sentiment_test,
) = train_test_split(
    X, y_aspect, y_sentiment,
    test_size=0.15,
    random_state=42,
    stratify=stratify_key,
)

# Create temp stratify key
temp_df = pd.DataFrame({
    'text': X_temp,
    'aspect_labels': y_aspect_temp,
    'sentiment_labels': y_sentiment_temp,
})
temp_df = temp_df.assign(
    aspect=temp_df['aspect_labels'].map(aspect_id2label),
    sentiment=temp_df['sentiment_labels'].map(sentiment_id2label),
)
stratify_temp = temp_df['aspect'].astype(str).str.cat(
    temp_df['sentiment'].astype(str), sep='_'
)

# Second cut: 0.176 of the remaining 85% is roughly 15% of the full data
(
    X_train, X_val,
    y_aspect_train, y_aspect_val,
    y_sentiment_train, y_sentiment_val,
) = train_test_split(
    X_temp, y_aspect_temp, y_sentiment_temp,
    test_size=0.176,
    random_state=42,
    stratify=stratify_temp,
)

for split_name, split_texts in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set: {len(split_texts)} samples")

### Model and Hyperparameter Configuration

This section defines the core configurations for the fine-tuning process. It includes model names, hyperparameters such as learning rate and number of epochs, and other training arguments. Setting these values upfront makes the training process reproducible and easier to manage, especially when experimenting with multiple models.

In [None]:
# Model and Hyperparameter Configuration
print("\nSetting up models and hyperparameters...")

# Checkpoints to compare: Hugging Face model id -> display name
model_display_names = {
    'distilbert-base-uncased': 'DistilBERT',
    'bert-base-uncased': 'BERT',
    'roberta-base': 'RoBERTa',
}
models_config = {
    checkpoint: {'name': display_name, 'model_name': checkpoint}
    for checkpoint, display_name in model_display_names.items()
}

# Search grid (kept small for the free Colab T4 GPU): two values per knob
hyperparams = {
    'learning_rate': [2e-5, 5e-5],
    'per_device_train_batch_size': [8, 16],
    'num_train_epochs': [2, 3],
}

# Cartesian product in key order: (learning_rate, batch_size, epochs)
hyperparam_combinations = list(product(*hyperparams.values()))

total_experiments = len(models_config) * len(hyperparam_combinations)
print(f"Total hyperparameter combinations per model: {len(hyperparam_combinations)}")
print(f"Total experiments (ASPECT ONLY): {total_experiments}")  # Only aspect classification

# Training and Evaluation Functions
def create_datasets(X_train, X_val, X_test, y_train, y_val, y_test, tokenizer, max_length=256):
    """Create tokenized train/validation/test datasets.

    Args:
        X_train, X_val, X_test: texts for each split; must support ``.tolist()``.
        y_train, y_val, y_test: labels for each split; must support ``.tolist()``.
        tokenizer: callable tokenizer applied to batches of texts.
        max_length: sequences longer than this many tokens are truncated.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)`` of ``Dataset``
        objects holding the ``text`` and ``labels`` columns plus the
        tokenizer outputs.
    """
    def tokenize_function(examples):
        # No padding here: train_model hands a DataCollatorWithPadding to the
        # Trainer, which pads each batch to its own longest sequence. Padding
        # at map time would stretch rows to the longest text in every
        # tokenization batch and waste compute and memory.
        return tokenizer(examples['text'], truncation=True, max_length=max_length)

    def build_split(texts, labels):
        # Same construction for every split: wrap the columns, then tokenize.
        split = Dataset.from_dict({'text': texts.tolist(), 'labels': labels.tolist()})
        return split.map(tokenize_function, batched=True)

    train_dataset = build_split(X_train, y_train)
    val_dataset = build_split(X_val, y_val)
    test_dataset = build_split(X_test, y_test)

    return train_dataset, val_dataset, test_dataset

### Loading Evaluation Metrics

This cell loads the evaluation metrics used to assess the performance of each model — accuracy and weighted F1-score — and defines the `compute_metrics` and `train_model` functions that use them. Defining these metrics explicitly allows for consistent evaluation across all models and hyperparameter settings.


In [None]:
# Load evaluation metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted
    class is the arg-max of ``predictions`` along axis 1.
    """
    raw_scores, labels = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=1)

    accuracy_result = accuracy_metric.compute(predictions=predicted_ids, references=labels)
    f1_result = f1_metric.compute(
        predictions=predicted_ids,
        references=labels,
        average='weighted',
    )

    return {'accuracy': accuracy_result['accuracy'], 'f1': f1_result['f1']}

def train_model(model_name, train_dataset, val_dataset, test_dataset, num_labels,
                id2label, label2id, task_name, lr, batch_size, epochs):
    """Train a single model with given hyperparameters and score it on the test set.

    Args:
        model_name: checkpoint id passed to ``from_pretrained``.
        train_dataset, val_dataset, test_dataset: tokenized splits.
        num_labels: number of target classes.
        id2label, label2id: label mappings stored in the model config.
        task_name: short task tag used in logs, the output directory and the result.
        lr: learning rate.
        batch_size: per-device batch size for training and evaluation.
        epochs: maximum number of training epochs (early stopping may end sooner).

    Returns:
        dict with keys ``model_name``, ``task``, ``learning_rate``,
        ``batch_size``, ``epochs``, ``test_accuracy``, ``test_f1``,
        ``training_time`` (seconds), ``predictions`` and ``true_labels``.
    """

    print(f"Training {model_name} - {task_name} | LR: {lr}, BS: {batch_size}, Epochs: {epochs}")

    # Bound up front so the finally block can always release them.
    model = None
    trainer = None
    try:
        # Initialize model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id
        )

        # Training arguments
        training_args = TrainingArguments(
            output_dir=f'./results_{task_name}_{model_name.replace("/", "_")}',
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=lr,
            warmup_steps=100,
            weight_decay=0.01,
            logging_steps=50,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=1,
            report_to=[],
            dataloader_pin_memory=False,
            # Help with small batch sizes: batch size 8 accumulates 2 steps,
            # i.e. 16 samples per optimizer step per device.
            gradient_accumulation_steps=2 if batch_size == 8 else 1
        )

        # Data collator: pads each batch to its longest sequence
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        # Initialize trainer with early stopping on the validation F1
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
        )

        # Train model
        start_time = time.time()
        trainer.train()
        training_time = time.time() - start_time

        # One pass over the test set: predict() returns the raw predictions
        # and the compute_metrics scores (prefixed with "test_"), so there is
        # no need to run a separate evaluate() over the same data.
        predictions = trainer.predict(test_dataset, metric_key_prefix="test")
        y_pred = np.argmax(predictions.predictions, axis=1)
        y_true = predictions.label_ids
        test_metrics = predictions.metrics

        return {
            'model_name': model_name,
            'task': task_name,
            'learning_rate': lr,
            'batch_size': batch_size,
            'epochs': epochs,
            'test_accuracy': test_metrics['test_accuracy'],
            'test_f1': test_metrics['test_f1'],
            'training_time': training_time,
            'predictions': y_pred,
            'true_labels': y_true
        }
    finally:
        # Cleanup runs on success and on failure (e.g. out-of-memory), so a
        # failed run does not leave this function's model/trainer alive.
        del model, trainer
        cleanup_memory()

### Running Hyperparameter Tuning for All Models

This section executes the training and hyperparameter tuning for all candidate models. Each model is fine-tuned on the training set using the configurations defined earlier, and performance is evaluated to select the best-performing setup. This step is computationally intensive and may take a significant amount of time depending on the number of models and hyperparameters tested.


In [None]:
# Run Hyperparameter Tuning for All Models
banner = "=" * 80
print("\n" + banner)
print("STARTING HYPERPARAMETER TUNING")
print(banner)

# One result dict per successfully completed experiment
results = []

for model_key, model_info in models_config.items():
    section_rule = '=' * 60
    print(f"\n{section_rule}")
    print(f"TRAINING {model_info['name'].upper()} MODELS")
    print(f"{section_rule}")

    model_name = model_info['model_name']

    # Tokenize once per checkpoint; every hyperparameter run reuses the splits
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    aspect_train_ds, aspect_val_ds, aspect_test_ds = create_datasets(
        X_train, X_val, X_test,
        y_aspect_train, y_aspect_val, y_aspect_test,
        tokenizer,
    )

    # Train with each hyperparameter combination (ASPECT ONLY)
    for lr, batch_size, epochs in hyperparam_combinations:
        try:
            aspect_result = train_model(
                model_name,
                aspect_train_ds, aspect_val_ds, aspect_test_ds,
                len(aspect_labels), aspect_id2label, aspect_label2id,
                'aspect', lr, batch_size, epochs,
            )
        except Exception as e:
            # A failed run is reported and skipped so the rest of the grid still runs
            print(f"Error training {model_name} with LR={lr}, BS={batch_size}, Epochs={epochs}: {e}")
            cleanup_memory()
        else:
            results.append(aspect_result)

### Result Analysis and Visualization

After training is complete, this cell analyzes the evaluation results for each model. It reports the best configuration per model, draws box plots comparing test accuracy and F1-score across models, plots a heatmap of the mean F1-score for each hyperparameter combination, and prints a summary table. The goal is to interpret which model performs best under the defined criteria and justify its selection.


In [None]:
# Results Analysis and Visualization
print("\n" + "="*80)
print("ANALYZING RESULTS")
print("="*80)

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(f"Total completed experiments: {len(results_df)}")

# With no completed runs the frame has no columns, and the groupby below
# would fail with an opaque KeyError. Stop with an explicit message instead.
if results_df.empty:
    raise RuntimeError(
        "No experiments completed successfully, so there is nothing to analyze. "
        "Check the training errors reported during hyperparameter tuning."
    )

# Best results for each model-task combination
# NOTE(review): the best configuration is chosen by test-set F1, so the
# reported test scores are optimistic. Consider selecting on a validation
# metric instead.
print("\nBest Results by Model and Task:")
best_row_index = results_df.groupby(['model_name', 'task'])['test_f1'].idxmax()
best_results = results_df.loc[best_row_index]

for _, row in best_results.iterrows():
    # Fall back to the raw checkpoint id when no display name is configured
    model_display = models_config.get(row['model_name'], {}).get('name', row['model_name'])
    print(f"{model_display} - {row['task'].title()}: F1={row['test_f1']:.4f}, "
          f"Acc={row['test_accuracy']:.4f} | LR={row['learning_rate']}, BS={row['batch_size']}, E={row['epochs']}")

# Comprehensive Visualizations
print("\nCreating visualizations...")

# Performance comparison plots - ASPECT ONLY
subplot_titles = [
    'Aspect Classification - Accuracy',
    'Aspect Classification - F1 Score',
]
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=subplot_titles,
    horizontal_spacing=0.15,
)

# Only aspect results
aspect_results = results_df[results_df['task'] == 'aspect']

# One subplot column per metric, one box per model (in order of first appearance)
for col, metric in enumerate(['test_accuracy', 'test_f1'], start=1):
    for model_name, model_data in aspect_results.groupby('model_name', sort=False):
        model_display = models_config.get(model_name, {}).get('name', model_name)
        box_trace = go.Box(
            y=model_data[metric],
            name=model_display,
            boxpoints='all',
            jitter=0.3,
            pointpos=-1.8,
        )
        fig.add_trace(box_trace, row=1, col=col)

fig.update_layout(height=400, title_text="Aspect Classification: Model Performance Comparison")
fig.show()

# Best hyperparameters heatmap
print("\nCreating hyperparameter analysis...")

# Mean score and training time per (task, hyperparameter setting), averaged over models
grouping_keys = ['task', 'learning_rate', 'batch_size', 'epochs']
averaged_columns = ['test_f1', 'test_accuracy', 'training_time']
hyperparam_analysis = (
    results_df
    .groupby(grouping_keys)[averaged_columns]
    .mean()
    .reset_index()
)

# Create heatmap for aspect classification only
is_aspect_task = hyperparam_analysis['task'] == 'aspect'
task_data = hyperparam_analysis[is_aspect_task]

# Rows: (learning_rate, epochs) pairs; columns: batch sizes; cells: mean F1
heatmap_data = task_data.pivot_table(
    index=['learning_rate', 'epochs'],
    columns='batch_size',
    values='test_f1',
    aggfunc='mean',
)

plt.figure(figsize=(8, 6))
sns.heatmap(heatmap_data, annot=True, fmt='.3f', cmap='RdYlGn')
plt.title('Aspect Classification - F1 Score by Hyperparameters')
plt.xlabel('Batch Size')
plt.ylabel('Learning Rate, Epochs')
plt.show()

# Detailed Results Table
print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)

# Create comprehensive results table: one formatted row per best
# (model, task) configuration. Metrics are rendered as fixed-precision
# strings; batch size and epochs are cast back to plain ints for display.
summary_table = [
    {
        # Fall back to the raw checkpoint id when no display name is configured
        'Model': models_config.get(row['model_name'], {}).get('name', row['model_name']),
        'Task': row['task'].title(),
        'Accuracy': f"{row['test_accuracy']:.4f}",
        'F1-Score': f"{row['test_f1']:.4f}",
        'Learning Rate': row['learning_rate'],
        'Batch Size': int(row['batch_size']),
        'Epochs': int(row['epochs']),
        'Training Time (s)': f"{row['training_time']:.1f}",
    }
    for _, row in best_results.iterrows()
]

summary_df = pd.DataFrame(summary_table)
print(summary_df.to_string(index=False))

### Confusion Matrices for Best Models (Aspect Prediction Only)

This final analysis cell focuses on generating and displaying confusion matrices for the best-performing models, specifically on the aspect classification task. Confusion matrices provide a granular view of model predictions, highlighting which aspect classes are frequently confused and where the model excels or struggles.


In [None]:
# Confusion Matrices for Best Models (Aspect Only)
print("\nConfusion Matrices for Best Aspect Models:")

for _, best_row in best_results.iterrows():
    checkpoint = best_row['model_name']
    model_display = models_config.get(checkpoint, {}).get('name', checkpoint)
    # Class names ordered by label id, used for the axes and the report
    target_names = [aspect_id2label[label_id] for label_id in range(len(aspect_labels))]

    y_true = best_row['true_labels']
    y_pred = best_row['predictions']
    cm = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=target_names,
        yticklabels=target_names,
    )
    plt.title(f'{model_display} - Aspect Classification (Best Model)')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Print classification report
    print(f"\n{model_display} - Aspect Classification Report:")
    report_text = classification_report(y_true, y_pred, target_names=target_names)
    print(report_text)

closing_banner = "=" * 80
print("\n" + closing_banner)
print("ASPECT CLASSIFICATION HYPERPARAMETER TUNING COMPLETED!")
print(closing_banner)

# Final cleanup
cleanup_memory()