In [118]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [119]:
def evaluate_model(y_true, y_pred):
    """Print MAE, RMSE and R-squared of ``y_pred`` measured against ``y_true``.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        None. The three scores are only written to standard output.
    """
    # All scores are computed before anything is printed, so a failing metric
    # leaves no partial report behind.
    mae, rmse, r2 = (
        score(y_true, y_pred)
        for score in (mean_absolute_error, root_mean_squared_error, r2_score)
    )

    report_lines = [
        "                Model Evaluation Metrics:",
        f"Mean Absolute Error (MAE): {mae}",
        f"Root Mean Squared Error (RMSE): {rmse}",
        f"R-squared (R2 ): {r2}",
    ]
    print("\n".join(report_lines))

In [120]:
def generate_csv(input, y_pred, key='Validation', filename='../output/ann_model_predictions.csv'):
    """Write predictions next to their row identifiers as a CSV file.

    Args:
        input: DataFrame containing at least the ``DATETIME`` and
            ``ENTITY_DESCRIPTION_SHORT`` columns; one row per prediction.
        y_pred: Predictions, one per row of ``input``. A pandas Series is
            aligned on its index by the column assignment; lists and arrays
            are assigned positionally.
        key: Label written to the ``KEY`` column of every row.
        filename: Destination path (or writable buffer) for the CSV.

    Raises:
        ValueError: If ``y_pred`` does not have exactly one value per row.
    """
    import os  # local import so the function does not depend on the notebook's import cell

    output = input[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].copy()

    # Fail early with a readable message instead of letting a mis-sized Series
    # be silently index-aligned into NaNs.
    if len(y_pred) != len(output):
        raise ValueError(
            f"y_pred has {len(y_pred)} values but input has {len(output)} rows"
        )

    output['y_pred'] = y_pred
    output['KEY'] = [key] * len(output)

    # Create the destination folder on demand; a fresh checkout may not have it.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    output.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [121]:
# Locations of the input CSV files, relative to the notebook's working directory.
train_route, validation_route, test_route, weather_route = (
    '../data/waiting_times_train.csv',
    '../data/waiting_times_X_test_val.csv',
    '../data/waiting_times_X_test_final.csv',
    '../data/weather_data.csv',
)

# Load every file as-is; parsing and cleaning happen in the cells that follow.
train_file, validation_file, test_file, weather_file = (
    pd.read_csv(csv_route)
    for csv_route in (train_route, validation_route, test_route, weather_route)
)

In [122]:
# Build the training feature table from the raw training CSV.
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

# Weather readings are joined on the timestamp; missing weather values become 0.
# weather_file is rebound here and reused by later cells in this cleaned form.
weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)

# Calendar features derived from the timestamp.
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

# Fixed fill values for missing parade times.
# NOTE(review): 250 and 0 look like hand-picked sentinels -- confirm their meaning.
X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Rows must be in time order because the LSTM windows are built from consecutive rows.
X_train = X_train.sort_values(by='DATETIME')

# Fill the remaining NaNs, column by column, with draws from a normal
# distribution matching the column's observed mean and standard deviation.
# A dedicated, seeded generator keeps the imputed values identical across
# kernel restarts; the global NumPy RNG would give different features each run.
# NOTE(review): the loop covers every numeric column, so WAIT_TIME_IN_2H would
# also be imputed if it contained NaNs -- confirm that is intended.
imputation_rng = np.random.default_rng(42)
for col in X_train.select_dtypes(include=['number']).columns:
    missing = X_train[col].isna()
    n_nan = int(missing.sum())
    if n_nan == 0:
        continue  # nothing to impute in this column

    # Mean and std are computed by pandas over the non-NaN values only.
    mean = X_train[col].mean()
    std = X_train[col].std()

    random_values = imputation_rng.normal(loc=mean, scale=std, size=n_nan)
    X_train.loc[missing, col] = random_values


# Target series (still in DATETIME order) and the numeric feature table.
# The target column stays inside X_train here; the scaling cell drops it.
y_train = X_train['WAIT_TIME_IN_2H']
X_train = X_train.select_dtypes(include=['number']).drop(columns=['clouds_all', 'humidity'])

In [123]:
# Column names before the target is removed (the target is part of this list).
caracteristicas_seleccionadas = list(X_train.columns)

# Feature-only table; drop() returns a new frame, so X_train itself is untouched.
X_train_aux = X_train.drop(columns=['WAIT_TIME_IN_2H'])

# Standardise every feature; the fitted scaler is reused for the validation data.
scaler = StandardScaler()
df_escalado = scaler.fit(X_train_aux).transform(X_train_aux)

# Only X_train goes back to index order; df_escalado and y_train stay in DATETIME order.
X_train = X_train.sort_index()

In [124]:
def crear_dataset_multivariante(dataset, lookback=1):
    """Slice a 2-D array into overlapping windows of ``lookback`` consecutive rows.

    Window ``k`` holds rows ``k .. k + lookback - 1`` with all columns, so it
    is the history that precedes row ``k + lookback``. The last row never
    appears as the final row of a window.

    Args:
        dataset: 2-D array-like of shape ``(n_rows, n_features)``.
        lookback: Number of consecutive rows per window.

    Returns:
        Array of shape ``(max(n_rows - lookback, 0), lookback, n_features)``.
        The three-dimensional shape is kept even when there are too few rows
        to form a window, so downstream code can rely on it.

    Raises:
        ValueError: If ``lookback`` is negative.
    """
    if lookback < 0:
        raise ValueError("lookback must be non-negative")

    dataset = np.asarray(dataset)
    n_rows = len(dataset)

    # Not enough rows for a single window: return an empty but correctly
    # shaped array instead of a 1-D empty array.
    if n_rows <= lookback:
        return np.empty((0, lookback) + dataset.shape[1:], dtype=dataset.dtype)

    # Every window takes ALL columns.
    return np.array([dataset[end - lookback:end, :] for end in range(lookback, n_rows)])


In [125]:
# Number of consecutive rows in each input window; also the sequence length
# of the model's input layer.
lookback = 30

In [126]:
# Position of the target inside the pre-scaling column list (which includes the target).
target_col_index = caracteristicas_seleccionadas.index('WAIT_TIME_IN_2H')

# One window of `lookback` scaled rows per sample.
X = crear_dataset_multivariante(dataset=df_escalado, lookback=lookback)
print(X.shape)

# Window k covers rows k .. k+lookback-1, so its label is the target of row k+lookback.
y = y_train.iloc[lookback:].to_numpy()

(36988, 30, 20)


In [127]:
# Hold-out split without shuffling: the first 80% of the windows train the
# model, the remaining 20% are used for validation during fitting.
# Note: X_train / y_train are rebound here from pandas objects to NumPy arrays.
train_size = int(0.8 * len(X))
X_train, X_test = np.split(X, [train_size])
y_train, y_test = np.split(y, [train_size])

In [128]:
# Number of features (columns) in every time step of a window.
n_features = X.shape[-1]

# Two stacked LSTM layers, each followed by dropout, feeding a single linear
# output unit (one regression value per window).
input_layer = Input(shape=(lookback, n_features))
lstm1 = LSTM(units=50, return_sequences=True)(input_layer)  # full sequence for the next LSTM
drop1 = Dropout(rate=0.2)(lstm1)
lstm2 = LSTM(units=50, return_sequences=False)(drop1)  # only the last time step
drop2 = Dropout(rate=0.2)(lstm2)
output_layer = Dense(units=1)(drop2)

modelo_lstm_multi = Model(inputs=input_layer, outputs=output_layer)
modelo_lstm_multi.compile(loss='mse', optimizer='adam')

In [129]:
# Stop once val_loss has not improved for 10 consecutive epochs and roll the
# model back to the weights of its best epoch. Without restore_best_weights
# the model would keep the weights from the last (worse) epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# The held-out 20% split is used as validation data to drive early stopping.
history = modelo_lstm_multi.fit(X_train, y_train,
                                epochs=100,
                                batch_size=32,
                                validation_data=(X_test, y_test),
                                callbacks=[early_stop],
                                verbose=1)

Epoch 1/100
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 16ms/step - loss: 223.3268 - val_loss: 576.9011
Epoch 2/100
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - loss: 130.9437 - val_loss: 635.0412
Epoch 3/100
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 16ms/step - loss: 118.1522 - val_loss: 341.7314
Epoch 4/100
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 17ms/step - loss: 111.6893 - val_loss: 366.4337
Epoch 5/100
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 16ms/step - loss: 107.5115 - val_loss: 314.1693
Epoch 6/100
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 17ms/step - loss: 105.1733 - val_loss: 298.8990
Epoch 7/100
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 17ms/step - loss: 101.8243 - val_loss: 267.6361
Epoch 8/100
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 18ms/step - loss: 98.5963 - val_

In [135]:
# Build the validation feature table with the same steps used for training.
X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

# Untouched copy in file order; it later supplies the identifier columns for the CSV.
X_val_aux = X_val.copy()

# weather_file was already parsed and NaN-filled by the training preprocessing cell.
X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)

# Calendar features derived from the timestamp.
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

# Same fixed fill values as in the training preprocessing.
X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Time order is required for the sliding windows. The index still carries each
# row's position in the original file.
X_val = X_val.sort_values(by='DATETIME')

# Fill the remaining NaNs, column by column, with draws from a normal
# distribution matching the column's observed mean and standard deviation.
# A dedicated, seeded generator makes the imputed features (and therefore the
# exported predictions) reproducible across kernel restarts.
# NOTE(review): the statistics come from the validation data itself, not from
# the training data -- confirm that is intended.
imputation_rng = np.random.default_rng(42)
for col in X_val.select_dtypes(include=['number']).columns:
    missing = X_val[col].isna()
    n_nan = int(missing.sum())
    if n_nan == 0:
        continue  # nothing to impute in this column

    # Mean and std are computed by pandas over the non-NaN values only.
    mean = X_val[col].mean()
    std = X_val[col].std()

    random_values = imputation_rng.normal(loc=mean, scale=std, size=n_nan)
    X_val.loc[missing, col] = random_values


# Keep numeric features only, dropping the same weather columns as in training.
X_val = X_val.select_dtypes(include=['number']).drop(columns=['clouds_all', 'humidity'])

In [137]:
# Preview of the raw validation rows (file order, before merge and sorting).
print(X_val_aux.head(5))

# Reuse the scaler fitted on the training features, then build the windows.
# Note: df_escalado and X now hold validation data and replace the training arrays.
df_escalado = scaler.transform(X_val)
X = crear_dataset_multivariante(dataset=df_escalado, lookback=lookback)

             DATETIME ENTITY_DESCRIPTION_SHORT  ADJUST_CAPACITY  DOWNTIME  \
0 2019-11-23 10:45:00               Water Ride            247.0         0   
1 2022-01-03 16:45:00              Pirate Ship            153.0         0   
2 2021-12-04 15:30:00              Pirate Ship            255.0         0   
3 2020-02-05 13:15:00               Water Ride            247.0         0   
4 2022-05-13 15:15:00           Flying Coaster            756.0         0   

   CURRENT_WAIT_TIME  TIME_TO_PARADE_1  TIME_TO_PARADE_2  TIME_TO_NIGHT_SHOW  
0                 20             375.0              75.0               675.0  
1                 45               NaN               NaN                 NaN  
2                 40               NaN               NaN                 NaN  
3                 15             225.0               NaN               345.0  
4                 35             135.0               NaN               465.0  


In [138]:
# Rebuild the validation windows so this cell can be re-run on its own, then predict.
X = crear_dataset_multivariante(dataset=df_escalado, lookback=lookback)
y_pred = modelo_lstm_multi.predict(x=X)

[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [140]:
# Map every prediction back to the validation row it belongs to.
#
# X_val is sorted by DATETIME and window k ends just before sorted row
# `lookback + k`, so y_pred[k] is the forecast for that sorted row. The first
# `lookback` sorted rows have no full history and get the mean training target
# as a fallback. X_val_aux is still in file order, so the values must be
# matched through the index, not by position.
if len(X_val) != len(X_val_aux):
    # The index-based alignment below needs exactly one feature row per
    # validation row.
    raise ValueError(
        f"X_val has {len(X_val)} rows but X_val_aux has {len(X_val_aux)}; "
        "cannot align predictions with validation rows"
    )

predicted_wait = np.asarray(y_pred).reshape(-1)
fallback_wait = float(np.mean(y_train))

# Series indexed like the sorted feature table: fallback first, then forecasts.
aligned_predictions = pd.Series(fallback_wait, index=X_val.index, dtype=float)
aligned_predictions.iloc[lookback:] = predicted_wait

# Column assignment aligns on the index, restoring file order.
X_val_aux['y'] = aligned_predictions
X_val_aux = X_val_aux.sort_index()
print(X_val_aux.columns)
generate_csv(X_val_aux, X_val_aux['y'])

Index(['DATETIME', 'ENTITY_DESCRIPTION_SHORT', 'ADJUST_CAPACITY', 'DOWNTIME',
       'CURRENT_WAIT_TIME', 'TIME_TO_PARADE_1', 'TIME_TO_PARADE_2',
       'TIME_TO_NIGHT_SHOW', 'y'],
      dtype='object')
Predictions saved to ../output/ann_model_predictions.csv
