# 05. Model LSTM
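LSTM-based delay prediction models (basic, temporal-attention, and multi-head-attention variants), with training, evaluation, attention-weight visualization, and an optional sequence-length experiment.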

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, BatchNormalization, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler
import utils

# Fetch the five objects returned by the shared loader in utils.
(
    df_train,
    df_valid,
    df_test,
    df_process,
    split_info,
) = utils.load_split_data_with_combined()

# Stop right away when the processed frame is missing: the cells below pass
# it on to the sequence-building helper.
if df_process is None:
    raise RuntimeError(
        "Data not found. Please run 02_process_data.ipynb first."
    )

In [None]:
# Hyperparameters
N_PAST_TRIPS = 5  # Candidate values worth comparing: 3, 5, 7, 10
print(f"Sequence Length (n_past_trips): {N_PAST_TRIPS}")

# Build the model inputs with the shared helper. The lowercase alias is the
# name that the later cells read.
n_past_trips = N_PAST_TRIPS
data = utils.prepare_model_data(
    df_train, df_test, df_process, n_past_trips=n_past_trips
)

# Pull the individual arrays out of the returned mapping, train split first.
X_delays_train, X_features_train, X_agg_train, y_train = (
    data[key]
    for key in ('X_delays_train', 'X_features_train', 'X_agg_train', 'y_train')
)
X_delays_test, X_features_test, X_agg_test, y_test = (
    data[key]
    for key in ('X_delays_test', 'X_features_test', 'X_agg_test', 'y_test')
)
n_stops, stops_dict = data['n_stops'], data['stops_dict']

In [None]:
# Scaling
if df_process is not None:
    # A single StandardScaler serves every delay array. The delay windows are
    # flattened to rows of n_stops columns for the scaler and then restored to
    # their original shape; the statistics come from the training windows only.
    delay_scaler = StandardScaler()
    X_delays_train_scaled = np.reshape(
        delay_scaler.fit_transform(np.reshape(X_delays_train, (-1, n_stops))),
        X_delays_train.shape,
    )
    X_delays_test_scaled = np.reshape(
        delay_scaler.transform(np.reshape(X_delays_test, (-1, n_stops))),
        X_delays_test.shape,
    )

    # Targets go through the same fitted scaler, so predictions can later be
    # mapped back with delay_scaler.inverse_transform.
    y_train_scaled = delay_scaler.transform(y_train)
    y_test_scaled = delay_scaler.transform(y_test)

    # The per-sample features and the aggregate features are joined
    # column-wise and standardised with a second scaler fitted on train only.
    feature_scaler = StandardScaler()
    X_combined_train = np.concatenate((X_features_train, X_agg_train), axis=1)
    X_combined_test = np.concatenate((X_features_test, X_agg_test), axis=1)

    X_combined_train_scaled = feature_scaler.fit_transform(X_combined_train)
    X_combined_test_scaled = feature_scaler.transform(X_combined_test)

In [None]:
from tensorflow.keras.layers import Layer, Attention, MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D
import tensorflow.keras.backend as K

# Custom Attention Layer
class TemporalAttention(Layer):
    """Additive attention pooling over the time axis of a sequence.

    Every time step is scored with a learned projection (``W``, ``b``)
    followed by ``tanh`` and a dot product with a learned context vector
    (``u``). The scores are softmax-normalised across time steps and used to
    form a weighted sum of the inputs.

    Calling the layer returns two tensors: the pooled vector and the
    attention weights, so the weights can be inspected afterwards.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection matrix, bias and context vector."""
        feature_dim = input_shape[-1]

        self.W = self.add_weight(
            name='attention_weight',
            shape=(feature_dim, feature_dim),
            initializer='glorot_uniform',
            trainable=True,
        )
        self.b = self.add_weight(
            name='attention_bias',
            shape=(feature_dim,),
            initializer='zeros',
            trainable=True,
        )
        self.u = self.add_weight(
            name='attention_context',
            shape=(feature_dim,),
            initializer='glorot_uniform',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Pool ``x`` of shape (batch, time_steps, features) over time.

        Returns:
            A pair ``(pooled, weights)`` where ``pooled`` has shape
            (batch, features) and ``weights`` has shape (batch, time_steps).
        """
        # Hidden representation of each step: (batch, time_steps, features)
        projected = K.tanh(K.dot(x, self.W) + self.b)

        # One scalar score per step: (batch, time_steps, 1) -> (batch, time_steps)
        scores = K.squeeze(K.dot(projected, K.expand_dims(self.u)), -1)

        # Normalise the scores into weights across the time steps
        weights = K.softmax(scores)

        # Weighted sum over the time axis: (batch, features)
        pooled = K.sum(x * K.expand_dims(weights), axis=1)

        return pooled, weights

    def compute_output_shape(self, input_shape):
        """Return the shapes of the pooled vector and of the weights."""
        batch_size, time_steps, feature_dim = (
            input_shape[0], input_shape[1], input_shape[-1]
        )
        return [(batch_size, feature_dim), (batch_size, time_steps)]


def build_lstm_model_basic(n_past_trips, n_stops, n_features):
    """Build and compile the baseline two-branch LSTM regressor.

    One branch runs the delay sequence through two stacked LSTM layers, the
    other passes the feature vector through two Dense layers. The branch
    outputs are concatenated and fed to a Dense head that ends in one linear
    unit per stop.

    Args:
        n_past_trips: Number of time steps in the delay sequence input.
        n_stops: Width of each time step and of the output layer.
        n_features: Length of the feature vector input.

    Returns:
        A Keras ``Model`` taking ``[delay_input, feature_input]``, compiled
        with Adam (learning rate 0.001), MSE loss and an MAE metric.
    """
    # Sequence branch
    sequence_in = Input(shape=(n_past_trips, n_stops), name='delay_input')
    seq = sequence_in
    for layer in (
        LSTM(64, return_sequences=True, kernel_regularizer=l2(0.001)),
        Dropout(0.3),
        LSTM(32, return_sequences=False, kernel_regularizer=l2(0.001)),
        Dropout(0.2),
    ):
        seq = layer(seq)

    # Feature branch
    features_in = Input(shape=(n_features,), name='feature_input')
    feat = features_in
    for layer in (
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
    ):
        feat = layer(feat)

    # Joint head on top of both branches
    head = Concatenate()([seq, feat])
    for layer in (
        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
    ):
        head = layer(head)

    prediction = Dense(n_stops, activation='linear')(head)

    network = Model(inputs=[sequence_in, features_in], outputs=prediction)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return network


def build_lstm_model_with_attention(n_past_trips, n_stops, n_features):
    """Build the LSTM model whose sequence branch ends in TemporalAttention.

    Both LSTM layers return full sequences; the ``TemporalAttention`` layer
    then pools them over time and also yields the per-step attention weights.
    The pooled vector is concatenated with the Dense-encoded feature vector
    and passed through the same kind of Dense head as the basic model.

    Args:
        n_past_trips: Number of time steps in the delay sequence input.
        n_stops: Width of each time step and of the output layer.
        n_features: Length of the feature vector input.

    Returns:
        A tuple ``(model, attention_model)``. ``model`` is compiled with Adam
        (learning rate 0.001), MSE loss and an MAE metric. ``attention_model``
        shares the same inputs and layers, is not compiled, and outputs
        ``[prediction, attention_weights]`` for interpretation.
    """
    # Sequence branch: two LSTMs that keep the time axis
    sequence_in = Input(shape=(n_past_trips, n_stops), name='delay_input')
    seq = sequence_in
    for layer in (
        LSTM(64, return_sequences=True, kernel_regularizer=l2(0.001)),
        Dropout(0.3),
        LSTM(32, return_sequences=True, kernel_regularizer=l2(0.001)),
        Dropout(0.2),
    ):
        seq = layer(seq)

    # Pool over time; keep the weights so they can be exposed below
    pooled, step_weights = TemporalAttention(name='temporal_attention')(seq)

    # Feature branch
    features_in = Input(shape=(n_features,), name='feature_input')
    feat = features_in
    for layer in (
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
    ):
        feat = layer(feat)

    # Joint head on the attended sequence plus the features
    head = Concatenate()([pooled, feat])
    for layer in (
        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
    ):
        head = layer(head)

    prediction = Dense(n_stops, activation='linear')(head)
    model_inputs = [sequence_in, features_in]

    # Trainable model
    predictor = Model(inputs=model_inputs, outputs=prediction)
    predictor.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )

    # Companion model that additionally returns the attention weights
    inspector = Model(inputs=model_inputs, outputs=[prediction, step_weights])

    return predictor, inspector


def build_lstm_model_with_multihead_attention(n_past_trips, n_stops, n_features, n_heads=4):
    """Build the model that applies multi-head self-attention before an LSTM.

    The delay sequence is projected to 64 dimensions, passed through
    multi-head self-attention (``key_dim`` fixed at 16) with a residual
    connection and layer normalisation, and then summarised by a single LSTM.
    The result is concatenated with the Dense-encoded feature vector and fed
    to the Dense head.

    Args:
        n_past_trips: Number of time steps in the delay sequence input.
        n_stops: Width of each time step and of the output layer.
        n_features: Length of the feature vector input.
        n_heads: Number of attention heads.

    Returns:
        A Keras ``Model`` taking ``[delay_input, feature_input]``, compiled
        with Adam (learning rate 0.001), MSE loss and an MAE metric.
    """
    sequence_in = Input(shape=(n_past_trips, n_stops), name='delay_input')

    # Linear projection to the attention width
    projected = Dense(64)(sequence_in)

    # Self-attention: the projected sequence is both query and value
    self_attention = MultiHeadAttention(
        num_heads=n_heads,
        key_dim=16,
        name='multihead_attention',
    )
    attended = self_attention(projected, projected)

    # Residual connection followed by layer normalisation
    normed = LayerNormalization()(projected + attended)

    # Collapse the time axis with an LSTM
    seq = LSTM(32, return_sequences=False, kernel_regularizer=l2(0.001))(normed)
    seq = Dropout(0.2)(seq)

    # Feature branch
    features_in = Input(shape=(n_features,), name='feature_input')
    feat = features_in
    for layer in (
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
    ):
        feat = layer(feat)

    # Joint head
    head = Concatenate()([seq, feat])
    for layer in (
        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
    ):
        head = layer(head)

    prediction = Dense(n_stops, activation='linear')(head)

    network = Model(inputs=[sequence_in, features_in], outputs=prediction)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )

    return network


# Model Selection
MODEL_TYPE = 'attention'  # Options: 'basic', 'attention', 'multihead'

if df_process is not None:
    # Width of the feature input = number of columns in the combined matrix
    n_features = X_combined_train.shape[1]

    print(f"Building {MODEL_TYPE.upper()} LSTM model...")
    print(f"  - Sequence length: {n_past_trips}")
    print(f"  - Stops per route: {n_stops}")
    print(f"  - Number of features: {n_features}")

    # Only the 'attention' variant comes with a companion model exposing the
    # attention weights; the other variants set attention_model to None.
    if MODEL_TYPE == 'basic':
        model = build_lstm_model_basic(n_past_trips, n_stops, n_features)
        attention_model = None
    elif MODEL_TYPE == 'attention':
        model, attention_model = build_lstm_model_with_attention(n_past_trips, n_stops, n_features)
    elif MODEL_TYPE == 'multihead':
        model = build_lstm_model_with_multihead_attention(n_past_trips, n_stops, n_features, n_heads=4)
        attention_model = None
    else:
        # Without this branch a typo in MODEL_TYPE leaves `model` undefined
        # and only surfaces as a NameError further down.
        raise ValueError(
            f"Unknown MODEL_TYPE {MODEL_TYPE!r}; "
            "expected 'basic', 'attention' or 'multihead'."
        )

    model.summary()

In [None]:
# Train
# Training settings live in one place so that the values passed to fit() and
# the values recorded in the evaluation config cannot drift apart.
EPOCHS = 20
BATCH_SIZE = 64

evaluation_results = []

# validation_split makes Keras hold out a fraction of the provided training
# arrays for validation instead of using a separately supplied set.
history = model.fit(
    [X_delays_train_scaled, X_combined_train_scaled],
    y_train_scaled,
    validation_split=0.2,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1
)

# Evaluate
# The model predicts in the scaled target space; map the predictions back
# with the delay scaler before comparing against the unscaled y_test.
y_pred_scaled = model.predict([X_delays_test_scaled, X_combined_test_scaled])
y_pred = delay_scaler.inverse_transform(y_pred_scaled)

result_lstm = utils.evaluate_model(
    y_test, y_pred,
    model_name=f"LSTM ({MODEL_TYPE.capitalize()})",
    config={
        "model_type": MODEL_TYPE,
        "n_past_trips": n_past_trips,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "scaler": "StandardScaler"
    }
)
evaluation_results.append(result_lstm)
print(result_lstm.summary())

# Attention Weights Visualization

Attentionモデルを使用している場合、どの過去の時点が現在の遅延予測に寄与しているかを可視化できます。

In [None]:
import matplotlib.pyplot as plt

if MODEL_TYPE == 'attention' and attention_model is not None:
    # Second output of the companion model = attention weights; computed for
    # at most the first 1000 test samples.
    attention_weights = attention_model.predict(
        [X_delays_test_scaled[:1000], X_combined_test_scaled[:1000]]
    )[1]

    # Mean weight per time step over those samples
    avg_attention = attention_weights.mean(axis=0)
    peak_step = np.argmax(avg_attention)
    weakest_step = np.argmin(avg_attention)

    # Tick labels count down from t-n_past_trips to t-1
    time_labels = [f't-{lag}' for lag in range(n_past_trips, 0, -1)]
    colors = plt.cm.Blues(np.linspace(0.3, 0.9, n_past_trips))

    fig, (bar_ax, heat_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: average weight per time step, annotated with percentages
    bar_ax.bar(time_labels, avg_attention, color=colors, edgecolor='white')
    bar_ax.set_xlabel('Past Trip (Time Step)')
    bar_ax.set_ylabel('Average Attention Weight')
    bar_ax.set_title('Temporal Attention Weights\n(Which past trips influence the prediction?)')
    for position, share in enumerate(avg_attention):
        bar_ax.text(position, share + 0.01, f'{share:.2%}', ha='center', fontsize=10)

    # Right panel: per-sample weights for the first 20 samples
    im = heat_ax.imshow(attention_weights[:20], aspect='auto', cmap='Blues')
    heat_ax.set_xlabel('Time Step')
    heat_ax.set_ylabel('Sample Index')
    heat_ax.set_title('Attention Weights Heatmap (First 20 Samples)')
    heat_ax.set_xticks(range(n_past_trips))
    heat_ax.set_xticklabels(time_labels)
    plt.colorbar(im, ax=heat_ax, label='Attention Weight')

    plt.tight_layout()
    plt.savefig('data/processed_data/attention_weights.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("\nAttention Weight Statistics:")
    print(f"  Most attended time step: {time_labels[peak_step]} ({avg_attention.max():.2%})")
    print(f"  Least attended time step: {time_labels[weakest_step]} ({avg_attention.min():.2%})")

    # Short textual reading of where the peak weight sits
    if peak_step == n_past_trips - 1:
        notes = (
            "  -> The model focuses most on the most recent trip (t-1).",
            "     This suggests short-term patterns are most predictive.",
        )
    elif peak_step < n_past_trips // 2:
        notes = (
            "  -> The model focuses more on older trips.",
            "     This may indicate longer-term patterns or regular schedules.",
        )
    else:
        notes = (
            "  -> The model uses a balanced mix of recent and older information.",
        )

    print("\nInterpretation:")
    for note in notes:
        print(note)
else:
    print("Attention visualization is only available when MODEL_TYPE='attention'")

# Sequence Length Experiment

シーケンス長（過去何便を見るか）を変えて実験し、最適な値を見つけます。

In [None]:
# Sequence Length Experiment (Optional - can be time-consuming)
import gc

import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score

RUN_SEQUENCE_EXPERIMENT = False  # Set to True to run experiment


def _evaluate_sequence_length(seq_len):
    """Train a basic LSTM on sequences of ``seq_len`` trips and score it.

    Everything is kept in local variables so that the notebook-level arrays
    (for example ``X_agg_train`` / ``X_agg_test``) are not overwritten, and so
    that the per-trial arrays and model become collectable on return.

    Args:
        seq_len: Number of past trips per input sequence.

    Returns:
        A dict with keys ``sequence_length``, ``mae``, ``r2``,
        ``train_samples`` and ``test_samples``.
    """
    # Create sequences with the requested length
    X_del_tr, X_feat_tr, X_agg_tr, y_tr, _, _, n_st = utils.create_trip_based_sequences_multi_route(
        df_train, seq_len, stops_dict=stops_dict
    )
    X_del_te, X_feat_te, X_agg_te, y_te, _, _, _ = utils.create_trip_based_sequences_multi_route(
        df_test, seq_len, stops_dict=stops_dict
    )

    # Scale delays (and targets) with a scaler fitted on the training windows
    d_scaler = StandardScaler()
    X_del_tr_sc = d_scaler.fit_transform(X_del_tr.reshape(-1, n_st)).reshape(X_del_tr.shape)
    X_del_te_sc = d_scaler.transform(X_del_te.reshape(-1, n_st)).reshape(X_del_te.shape)
    y_tr_sc = d_scaler.transform(y_tr)

    # Scale the combined feature matrix with its own scaler
    f_scaler = StandardScaler()
    X_comb_tr = np.concatenate([X_feat_tr, X_agg_tr], axis=1)
    X_comb_te = np.concatenate([X_feat_te, X_agg_te], axis=1)
    X_comb_tr_sc = f_scaler.fit_transform(X_comb_tr)
    X_comb_te_sc = f_scaler.transform(X_comb_te)

    # Build and train model (basic LSTM for speed)
    exp_model = build_lstm_model_basic(seq_len, n_st, X_comb_tr.shape[1])
    exp_model.fit(
        [X_del_tr_sc, X_comb_tr_sc],
        y_tr_sc,
        validation_split=0.2,
        epochs=10,  # Reduced for speed
        batch_size=64,
        verbose=0
    )

    # Evaluate in the original (unscaled) target space
    y_pred_exp = d_scaler.inverse_transform(
        exp_model.predict([X_del_te_sc, X_comb_te_sc])
    )

    return {
        'sequence_length': seq_len,
        'mae': mean_absolute_error(y_te.flatten(), y_pred_exp.flatten()),
        'r2': r2_score(y_te.flatten(), y_pred_exp.flatten()),
        'train_samples': len(y_tr),
        'test_samples': len(y_te)
    }


if RUN_SEQUENCE_EXPERIMENT and df_process is not None:
    sequence_lengths = [3, 5, 7, 10]
    results = []

    print("Running Sequence Length Experiment...")
    print("=" * 50)

    for seq_len in sequence_lengths:
        print(f"\n--- Sequence Length: {seq_len} ---")

        trial = _evaluate_sequence_length(seq_len)
        results.append(trial)

        print(f"  MAE: {trial['mae']:.2f}s, R2: {trial['r2']:.4f}")

        # The trial's arrays and model went out of scope with the helper;
        # collect them before the next (larger) trial allocates its own.
        gc.collect()

    # Summary
    results_df = pd.DataFrame(results)
    print("\n" + "=" * 50)
    print("Sequence Length Experiment Results:")
    print(results_df.to_string(index=False))

    best_idx = results_df['mae'].idxmin()
    print(f"\nBest sequence length: {results_df.loc[best_idx, 'sequence_length']} (MAE: {results_df.loc[best_idx, 'mae']:.2f}s)")

    # Plot MAE against sequence length, labelling every point with its MAE
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(results_df['sequence_length'], results_df['mae'], 'bo-', markersize=10, linewidth=2)
    ax.set_xlabel('Sequence Length (n_past_trips)')
    ax.set_ylabel('MAE (seconds)')
    ax.set_title('Sequence Length vs. Prediction Error')
    ax.set_xticks(sequence_lengths)
    ax.grid(True, alpha=0.3)

    for _, row in results_df.iterrows():
        ax.annotate(f"{row['mae']:.1f}s",
                    (row['sequence_length'], row['mae']),
                    textcoords="offset points", xytext=(0, 10), ha='center')

    plt.tight_layout()
    plt.savefig('data/processed_data/sequence_length_experiment.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("Sequence length experiment skipped. Set RUN_SEQUENCE_EXPERIMENT=True to run.")

In [None]:
# Model Comparison Table and Save Results
# Pass the results collected in the training cell to the shared helper along
# with a JSON path (presumably the file the results are written to; confirm
# against utils.display_and_save_results).
utils.display_and_save_results(evaluation_results, 'data/evaluation_results_lstm.json')