In [172]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pytorch_tabnet.tab_model import TabNetRegressor

In [173]:
def generate_csv(input, y_pred, key='Validation', filename='../output/ann_model_predictions.csv'):
    output = input[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].copy()
    output['y_pred'] = y_pred
    output['KEY'] = [key] * len(y_pred)
    output.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [174]:
train_route = '../data/waiting_times_train.csv'
validation_route = '../data/waiting_times_X_test_val.csv'
test_route = '../data/waiting_times_X_test_final.csv'
weather_route = '../data/weather_data.csv'

train_file = pd.read_csv(train_route)
validation_file = pd.read_csv(validation_route)
test_file = pd.read_csv(test_route)
weather_file = pd.read_csv(weather_route)

In [175]:
np.random.seed(0)

In [176]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(24*60)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(24*60)

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values


y_train = X_train['WAIT_TIME_IN_2H']
X_train = X_train.drop(columns=['WAIT_TIME_IN_2H', 'DATETIME'])

In [177]:
cat_cols = ["ENTITY_DESCRIPTION_SHORT"]
date_cols = ['year','month','day','hour','minute']
num_cols = [c for c in X_train.columns if c not in cat_cols]

# --- label-encode categorías (TabNet espera ints para categorías) ---
label_encoders = {}
X_encoded = X_train.copy()
for c in cat_cols:
    le = LabelEncoder()
    X_encoded[c] = le.fit_transform(X_encoded[c].astype(str))
    label_encoders[c] = le

for c in date_cols:
    X_encoded[c] = X_encoded[c].astype(str).astype(int)

In [178]:
scaler = StandardScaler()
X_encoded[num_cols] = scaler.fit_transform(X_encoded[num_cols])

X_np = X_encoded.values.astype(np.float32)
y_np = y_train.values.reshape(-1, 1).astype(np.float32) 

In [181]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_np, y_np, test_size=0.2, random_state=42
)

print(X_train.shape, y_train.shape)

model = TabNetRegressor(
    n_d=8, n_a=8,           # tamaño de las representaciones (prueba 8-64)
    n_steps=3,              # número de pasos de atención
    gamma=1.5,
    lambda_sparse=1e-3,
    device_name="cuda" if __import__("torch").cuda.is_available() else "cpu"
)

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_name=['valid'],
    eval_metric=['rmse'],
    max_epochs=200,
    patience=20,
    batch_size=256,
    virtual_batch_size=64,
    num_workers=0,
    drop_last=False
)

(29614, 21) (29614, 1)




epoch 0  | loss: 287.12644| valid_rmse: 11.55064|  0:00:02s
epoch 1  | loss: 123.33375| valid_rmse: 10.83406|  0:00:06s
epoch 2  | loss: 113.23714| valid_rmse: 10.39795|  0:00:08s
epoch 3  | loss: 107.84208| valid_rmse: 10.32023|  0:00:12s
epoch 4  | loss: 104.19016| valid_rmse: 10.20113|  0:00:14s
epoch 5  | loss: 102.34363| valid_rmse: 10.0644 |  0:00:17s
epoch 6  | loss: 100.14988| valid_rmse: 9.93748 |  0:00:20s
epoch 7  | loss: 98.6363 | valid_rmse: 9.9767  |  0:00:23s
epoch 8  | loss: 98.27374| valid_rmse: 9.93152 |  0:00:27s
epoch 9  | loss: 96.65511| valid_rmse: 9.80622 |  0:00:30s
epoch 10 | loss: 96.27194| valid_rmse: 9.7925  |  0:00:33s
epoch 11 | loss: 95.84076| valid_rmse: 9.99812 |  0:00:36s
epoch 12 | loss: 94.37804| valid_rmse: 9.69271 |  0:00:39s
epoch 13 | loss: 93.11752| valid_rmse: 9.74372 |  0:00:42s
epoch 14 | loss: 92.20902| valid_rmse: 9.72388 |  0:00:45s
epoch 15 | loss: 92.62398| valid_rmse: 9.63018 |  0:00:48s
epoch 16 | loss: 91.74344| valid_rmse: 9.60534 | 



In [135]:
X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')
X_val_aux = X_val.copy()

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

X_val = X_val.sort_values(by='DATETIME')

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values


X_val = X_val.select_dtypes(include=['number']).drop(columns=['clouds_all', 'humidity'])

In [140]:
X_val_aux['y'] = [number[0] for number in y_pred.tolist()] + [np.mean(y_train)]*lookback
X_val_aux = X_val_aux.sort_index()
print(X_val_aux.columns)
generate_csv(X_val_aux, X_val_aux['y'])

Index(['DATETIME', 'ENTITY_DESCRIPTION_SHORT', 'ADJUST_CAPACITY', 'DOWNTIME',
       'CURRENT_WAIT_TIME', 'TIME_TO_PARADE_1', 'TIME_TO_PARADE_2',
       'TIME_TO_NIGHT_SHOW', 'y'],
      dtype='object')
Predictions saved to ../output/ann_model_predictions.csv
