# 04. Model XGBoost
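This notebook trains one XGBoost regressor per stop on the prepared trip data, then learns entity embeddings for the categorical features.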

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import RobustScaler
import utils

# Load the split data through the shared helper in utils
split_bundle = utils.load_split_data_with_combined()
df_train, df_valid, df_test, df_process, split_info = split_bundle

# Stop right away when the processed frame is missing
if df_process is None:
    raise RuntimeError("Data not found. Please run 02_process_data.ipynb first.")

In [None]:
# Build the model inputs with the shared helper in utils
n_past_trips = 5
data = utils.prepare_model_data(df_train, df_test, df_process, n_past_trips=n_past_trips)

# Pull the train arrays out of the returned mapping
X_delays_train, X_features_train, X_agg_train, y_train = (
    data[key] for key in ('X_delays_train', 'X_features_train', 'X_agg_train', 'y_train')
)

# Pull the matching test arrays
X_delays_test, X_features_test, X_agg_test, y_test = (
    data[key] for key in ('X_delays_test', 'X_features_test', 'X_agg_test', 'y_test')
)

n_stops = data['n_stops']

In [3]:
# Prepare Data for XGBoost
def _flatten_model_inputs(X_delays, X_features, X_agg):
    """Concatenate the three input blocks into one 2-D feature matrix.

    Args:
        X_delays: Array whose first axis is the sample axis; every other
            axis is flattened into columns.
        X_features: Array concatenated column-wise after the delays.
        X_agg: Array concatenated column-wise after the features.

    Returns:
        Array with one row per sample and all columns side by side.
    """
    return np.concatenate([
        X_delays.reshape(len(X_delays), -1),
        X_features,
        X_agg
    ], axis=1)


# No `df_process is not None` guard here: the loading cell already raises when
# the data is missing, and a silently skipped cell would only surface later as
# a NameError on X_train_scaled.
X_train_flat = _flatten_model_inputs(X_delays_train, X_features_train, X_agg_train)
X_test_flat = _flatten_model_inputs(X_delays_test, X_features_test, X_agg_test)

# Fit the scaler on the training rows only, then reuse it for the test rows
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_flat)
X_test_scaled = scaler.transform(X_test_flat)

In [None]:
# Train XGBoost (Per Stop)
evaluation_results = []

print("Training XGBoost...")

# Hyper-parameters shared by every per-stop regressor
xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42,
)

# One independent regressor per target column of y_train
stop_predictions = []
for stop_idx in range(n_stops):
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train_scaled, y_train[:, stop_idx])
    stop_predictions.append(model.predict(X_test_scaled))

# Rows become samples, columns become stops
y_pred_xgb_all = np.transpose(np.array(stop_predictions))

run_config = {"params": xgb_params, "n_past_trips": n_past_trips, "scaler": "RobustScaler"}
result_xgb = utils.evaluate_model(
    y_test,
    y_pred_xgb_all,
    model_name="XGBoost (Per Stop)",
    config=run_config,
)
evaluation_results.append(result_xgb)
print(result_xgb.summary())

# Entity Embeddings for Categorical Variables

カテゴリ変数（route_id, stop_id など）に対して埋め込み表現（Entity Embeddings）を学習します。
これにより、LabelEncodingよりも意味のある表現が得られ、精度向上が期待できます。

Phase 2への準備として、埋め込みの学習と保存を行います。

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
import pickle

class EntityEmbedder:
    """
    Entity Embeddings for categorical variables.

    Learns dense vector representations for categorical features using a neural network,
    then exports the embeddings for use in tree-based models like XGBoost.

    Attributes:
        embedding_dims: Dict of {column_name: embedding_dimension} overrides.
        encoders: Dict of {column_name: fitted LabelEncoder}, filled by fit().
        embeddings: Dict of {column_name: embedding weight matrix}, filled by fit().
        model: The Keras model built by fit(), or None before fitting / after load().
    """

    def __init__(self, embedding_dims=None):
        """
        Args:
            embedding_dims: Dict of {column_name: embedding_dimension}
                           If None, uses rule of thumb: min(50, n_categories // 2)
        """
        self.embedding_dims = embedding_dims or {}
        self.encoders = {}
        self.embeddings = {}
        self.model = None

    def _get_embedding_dim(self, n_categories, col_name):
        """Get embedding dimension for a categorical column.

        An explicit entry in ``self.embedding_dims`` wins; otherwise the
        dimension is ``n_categories // 2`` clamped to the range [2, 50].
        """
        if col_name in self.embedding_dims:
            return self.embedding_dims[col_name]
        # Rule of thumb: min(50, n_categories // 2), at least 2
        return max(2, min(50, n_categories // 2))

    def fit(self, df_train, categorical_cols, target_col,
            numerical_cols=None, epochs=10, batch_size=256, verbose=1, seed=None):
        """
        Train entity embeddings using a simple neural network.

        The input DataFrame is not modified; the label-encoded columns are
        kept in local arrays only.

        Args:
            df_train: Training DataFrame
            categorical_cols: List of categorical column names
            target_col: Target column name
            numerical_cols: List of numerical column names (optional)
            epochs: Number of training epochs
            batch_size: Batch size
            verbose: Verbosity level
            seed: Optional integer; when given, the random seeds are set via
                  tf.keras.utils.set_random_seed before the model is built so
                  repeated runs start from the same initial weights.

        Returns:
            self, with ``encoders``, ``embeddings`` and ``model`` populated.

        Raises:
            ValueError: If neither categorical nor numerical columns are given.
        """
        categorical_cols = list(categorical_cols)
        numerical_cols = list(numerical_cols or [])

        if not categorical_cols and not numerical_cols:
            raise ValueError("fit() needs at least one categorical or numerical column")

        if seed is not None:
            tf.keras.utils.set_random_seed(seed)

        # Encode categorical variables and build one embedding branch per column
        cat_inputs = []
        cat_embeddings = []
        X_cat = []

        for col in categorical_cols:
            le = LabelEncoder()
            # Values are compared as strings so transform() can do the same
            codes = le.fit_transform(df_train[col].astype(str))
            self.encoders[col] = le
            X_cat.append(codes)

            n_categories = len(le.classes_)
            emb_dim = self._get_embedding_dim(n_categories, col)

            # Create embedding layer
            inp = Input(shape=(1,), name=f'{col}_input')
            emb = Embedding(n_categories, emb_dim, name=f'{col}_embedding')(inp)
            emb = Flatten()(emb)

            cat_inputs.append(inp)
            cat_embeddings.append(emb)

            print(f"  {col}: {n_categories} categories -> {emb_dim}D embedding")

        # Numerical inputs are fed straight into the dense layers
        if numerical_cols:
            num_input = Input(shape=(len(numerical_cols),), name='numerical_input')
            all_inputs = cat_inputs + [num_input]
            all_features = cat_embeddings + [num_input]
        else:
            all_inputs = cat_inputs
            all_features = cat_embeddings

        # Combine all features (Concatenate needs at least two tensors)
        if len(all_features) > 1:
            x = Concatenate()(all_features)
        else:
            x = all_features[0]

        # Simple network for learning embeddings
        x = Dense(64, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(32, activation='relu')(x)
        output = Dense(1, activation='linear')(x)

        self.model = Model(inputs=all_inputs, outputs=output)
        self.model.compile(optimizer=Adam(0.001), loss='mse', metrics=['mae'])

        # Prepare training data in the same order as the model inputs
        if numerical_cols:
            X_all = X_cat + [df_train[numerical_cols].values]
        else:
            X_all = X_cat

        y = df_train[target_col].values

        # Train
        print("\nTraining entity embeddings...")
        self.model.fit(
            X_all, y,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            verbose=verbose
        )

        # Extract learned embeddings (first weight array of each Embedding layer)
        for col in categorical_cols:
            emb_layer = self.model.get_layer(f'{col}_embedding')
            self.embeddings[col] = emb_layer.get_weights()[0]

        return self

    def transform(self, df, categorical_cols):
        """
        Transform categorical columns to their embedding representations.

        Values are compared as strings. A value that is not among the fitted
        classes falls back to index 0, i.e. the embedding of the first fitted
        class.

        Args:
            df: DataFrame to transform
            categorical_cols: List of categorical columns to transform

        Returns:
            Copy of ``df`` with one ``{col}_emb_{i}`` column added per embedding
            dimension; the original columns are kept and ``df`` is not modified.

        Raises:
            ValueError: If a requested column has no fitted encoder.
        """
        result = df.copy()

        for col in categorical_cols:
            if col not in self.encoders:
                raise ValueError(f"Column {col} was not fitted")

            le = self.encoders[col]
            emb = self.embeddings[col]

            # Position in classes_ is the label code, so a dict lookup replaces
            # the per-row scan of classes_ and the per-row le.transform call.
            index_of = {label: idx for idx, label in enumerate(le.classes_)}

            # Handle unseen categories: default to first category (index 0).
            # An explicit integer dtype keeps indexing valid for an empty frame.
            labels = df[col].astype(str)
            encoded = np.fromiter(
                (index_of.get(val, 0) for val in labels),
                dtype=np.int64,
                count=len(labels),
            )

            # Get embeddings
            emb_values = emb[encoded]

            # Add embedding columns
            for i in range(emb_values.shape[1]):
                result[f'{col}_emb_{i}'] = emb_values[:, i]

        return result

    def save(self, path):
        """Save encoders, embeddings and embedding_dims to a pickle file.

        The parent directory of ``path`` is created when it does not exist.
        The Keras model itself is not saved.
        """
        import os  # local import: only needed here

        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(path, 'wb') as f:
            pickle.dump({
                'encoders': self.encoders,
                'embeddings': self.embeddings,
                'embedding_dims': self.embedding_dims
            }, f)
        print(f"Saved embeddings to {path}")

    @classmethod
    def load(cls, path):
        """Load embeddings from a file written by save().

        The returned embedder can transform() but has no Keras model attached.

        NOTE(security): pickle.load can execute arbitrary code; only load
        files that were produced by a trusted run of save().
        """
        with open(path, 'rb') as f:
            data = pickle.load(f)

        embedder = cls(embedding_dims=data['embedding_dims'])
        embedder.encoders = data['encoders']
        embedder.embeddings = data['embeddings']
        return embedder


# Train Entity Embeddings (optional - can be slow)
TRAIN_EMBEDDINGS = True  # Set to False to skip this step

if not TRAIN_EMBEDDINGS or df_process is None:
    print("Entity Embeddings training skipped. Set TRAIN_EMBEDDINGS=True to train.")
else:
    print("Training Entity Embeddings...")

    embedding_cat_cols = ['route_id', 'direction_id', 'region_id']
    embedding_num_cols = ['hour', 'day_of_week', 'is_rush_hour']
    embedding_target = 'arrival_delay_agg'

    # One row per trip_key: first value of every feature, mean of the target
    trip_agg_spec = {
        **{col: 'first' for col in embedding_cat_cols},
        embedding_target: 'mean',
        **{col: 'first' for col in embedding_num_cols},
    }
    trip_data = df_train.groupby('trip_key').agg(trip_agg_spec).reset_index()

    # Explicit embedding widths per categorical column
    embedder = EntityEmbedder(embedding_dims={
        'route_id': 8,
        'direction_id': 2,
        'region_id': 4,
    })

    embedder.fit(
        trip_data,
        categorical_cols=embedding_cat_cols,
        target_col=embedding_target,
        numerical_cols=embedding_num_cols,
        epochs=10,
        verbose=1,
    )

    # Keep the learned embeddings on disk for later notebooks
    embedder.save('data/processed_data/entity_embeddings.pkl')

    print("\nEmbedding shapes:")
    for name, weights in embedder.embeddings.items():
        print(f"  {name}: {weights.shape}")

Training Entity Embeddings...
  route_id: 222 categories -> 8D embedding
  direction_id: 2 categories -> 2D embedding
  region_id: 21 categories -> 4D embedding

Training entity embeddings...
Epoch 1/10


I0000 00:00:1767000960.659281  430231 service.cc:145] XLA service 0xffc3780 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1767000960.659335  430231 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Ti, Compute Capability 8.6


[1m 23/970[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 7ms/step - loss: 65688.9531 - mae: 162.5432

I0000 00:00:1767000962.756538  430231 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step - loss: 50874.6719 - mae: 148.6498 - val_loss: 49150.5859 - val_mae: 147.8472
Epoch 2/10
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 47473.1445 - mae: 144.9365 - val_loss: 48924.0664 - val_mae: 147.4582
Epoch 3/10
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 47182.0430 - mae: 144.3790 - val_loss: 48483.4688 - val_mae: 146.5894
Epoch 4/10
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 46905.0000 - mae: 143.9731 - val_loss: 48237.5625 - val_mae: 145.3760
Epoch 5/10
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 46712.8633 - mae: 143.6628 - val_loss: 48295.0273 - val_mae: 145.9314
Epoch 6/10
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 46639.8359 - mae: 143.6346 - val_loss: 48130.2852 - val_mae: 144.0833
Epoch 7/10
[1m970/970[0m

In [None]:
# Show the model comparison and save the collected results via the shared utils helper
results_path = 'data/evaluation_results_xgboost.json'
utils.display_and_save_results(evaluation_results, results_path)