In [133]:
import pandas as pd
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [134]:
def imputar_outliers_iqr(df, columnas):
    df_imputado = df.copy()
    for col in columnas:
        Q1 = df_imputado[col].quantile(0.25)
        Q3 = df_imputado[col].quantile(0.75)
        IQR = Q3 - Q1
        limite_inferior = Q1 - 1.5 * IQR
        limite_superior = Q3 + 1.5 * IQR

        # Identificar outliers
        outliers_inferiores = df_imputado[col] < limite_inferior
        outliers_superiores = df_imputado[col] > limite_superior

        # Imputar con la mediana (o con los límites)
        mediana = df_imputado[col].median()
        df_imputado.loc[outliers_inferiores, col] = mediana
        df_imputado.loc[outliers_superiores, col] = mediana

        # Alternativa: imputar con los límites
        # df_imputado.loc[outliers_inferiores, col] = limite_inferior
        # df_imputado.loc[outliers_superiores, col] = limite_superior

    return df_imputado

In [135]:
def generate_csv(input, y_pred, key='Validation', filename='../output/random_forest_improved_predictions.csv'):
    output = input[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].copy()
    output['y_pred'] = y_pred
    output['KEY'] = [key] * len(y_pred)
    output.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [136]:
train_route = '../data/waiting_times_train.csv'
validation_route = '../data/waiting_times_X_test_val.csv'
test_route = '../data/waiting_times_X_test_final.csv'
weather_route = '../data/weather_data.csv'

train_file = pd.read_csv(train_route)
validation_file = pd.read_csv(validation_route)
test_file = pd.read_csv(test_route)
weather_file = pd.read_csv(weather_route)

train_file = imputar_outliers_iqr(train_file, ['WAIT_TIME_IN_2H','CURRENT_WAIT_TIME'])
weather_file = imputar_outliers_iqr(weather_file, ['pressure', 'rain_1h'])

In [137]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values
    
y_train = (X_train['WAIT_TIME_IN_2H'] + 1)
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H'])

In [138]:
X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.select_dtypes(include=['number']).drop(columns=[])

In [139]:
y_pred = []

for optimized_estimators, rate in zip(range(100, 3000, 300), np.arange(0.15, 0.06, -0.01)):
    model = XGBRegressor(n_estimators=optimized_estimators, learning_rate=rate)
    model.fit(X_train, y_train)
    y_pred.append((model.predict(X_val)))


In [140]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values
    
y_train = (X_train['WAIT_TIME_IN_2H'] + 1)
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','DOWNTIME'])

In [141]:
X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.select_dtypes(include=['number']).drop(columns=[ 'DOWNTIME'])

In [142]:
for optimized_estimators, rate in zip(range(100, 3000, 300), np.arange(0.15, 0.06, -0.01)):
    model = XGBRegressor(n_estimators=optimized_estimators, learning_rate=rate)
    model.fit(X_train, y_train)
    y_pred.append((model.predict(X_val)))

In [143]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values
    
y_train = (X_train['WAIT_TIME_IN_2H'] + 1)
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','minute'])

In [144]:
X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.select_dtypes(include=['number']).drop(columns=['minute'])

In [145]:
for optimized_estimators, rate in zip(range(100, 3000, 300), np.arange(0.15, 0.06, -0.01)):
    model = CatBoostRegressor(n_estimators=optimized_estimators, learning_rate=rate)
    model.fit(X_train, y_train)
    y_pred.append((model.predict(X_val)))

0:	learn: 12.8801619	total: 10.8ms	remaining: 1.06s
1:	learn: 12.2452733	total: 19.3ms	remaining: 948ms
2:	learn: 11.7345623	total: 31.4ms	remaining: 1.01s
3:	learn: 11.3389944	total: 42.3ms	remaining: 1.01s
4:	learn: 11.0317819	total: 49.7ms	remaining: 944ms
5:	learn: 10.7824674	total: 58.2ms	remaining: 911ms
6:	learn: 10.5689859	total: 65ms	remaining: 864ms
7:	learn: 10.3880381	total: 73.7ms	remaining: 848ms
8:	learn: 10.2601689	total: 83.5ms	remaining: 844ms
9:	learn: 10.1433848	total: 91.1ms	remaining: 820ms
10:	learn: 10.0498117	total: 99.7ms	remaining: 807ms
11:	learn: 9.9690740	total: 107ms	remaining: 785ms
12:	learn: 9.8938879	total: 113ms	remaining: 754ms
13:	learn: 9.8296203	total: 118ms	remaining: 724ms
14:	learn: 9.7802836	total: 125ms	remaining: 706ms
15:	learn: 9.7399827	total: 130ms	remaining: 681ms
16:	learn: 9.6948608	total: 135ms	remaining: 659ms
17:	learn: 9.6504467	total: 142ms	remaining: 646ms
18:	learn: 9.6025494	total: 147ms	remaining: 628ms
19:	learn: 9.5679116	

In [None]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values
    
y_train = (X_train['WAIT_TIME_IN_2H'])
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','TIME_TO_PARADE_1'])

In [None]:
X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.select_dtypes(include=['number']).drop(columns=['rain_1h','TIME_TO_PARADE_1'])

In [148]:
for optimized_estimators, rate in zip(range(100, 3000, 300), np.arange(0.15, 0.06, -0.01)):
    model = CatBoostRegressor(n_estimators=optimized_estimators, learning_rate=rate)
    model.fit(X_train, y_train)
    y_pred.append((model.predict(X_val)))

0:	learn: 12.8681279	total: 9.1ms	remaining: 901ms
1:	learn: 12.2225526	total: 17ms	remaining: 832ms
2:	learn: 11.7147773	total: 25.4ms	remaining: 820ms
3:	learn: 11.3169536	total: 34.3ms	remaining: 823ms
4:	learn: 10.9980700	total: 42ms	remaining: 798ms
5:	learn: 10.7504721	total: 51.3ms	remaining: 804ms
6:	learn: 10.5626414	total: 57.1ms	remaining: 759ms
7:	learn: 10.4072450	total: 64.3ms	remaining: 740ms
8:	learn: 10.2739159	total: 70.2ms	remaining: 710ms
9:	learn: 10.1606659	total: 76.9ms	remaining: 692ms
10:	learn: 10.0710012	total: 88ms	remaining: 712ms
11:	learn: 10.0018425	total: 101ms	remaining: 739ms
12:	learn: 9.9319194	total: 113ms	remaining: 755ms
13:	learn: 9.8740038	total: 125ms	remaining: 766ms
14:	learn: 9.8171399	total: 136ms	remaining: 773ms
15:	learn: 9.7729424	total: 150ms	remaining: 789ms
16:	learn: 9.7206244	total: 161ms	remaining: 788ms
17:	learn: 9.6885630	total: 171ms	remaining: 777ms
18:	learn: 9.6499413	total: 185ms	remaining: 787ms
19:	learn: 9.6169080	tota

In [149]:
y_pred_final = np.mean(y_pred, axis=0)
generate_csv(input=validation_file, y_pred=y_pred_final, key='Validation', filename='../output/xgboost_ensemble_predictions.csv')

Predictions saved to ../output/xgboost_ensemble_predictions.csv
