In [17]:
import pandas as pd
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [18]:
def imputar_outliers_iqr(df, columnas):
    df_imputado = df.copy()
    for col in columnas:
        Q1 = df_imputado[col].quantile(0.25)
        Q3 = df_imputado[col].quantile(0.75)
        IQR = Q3 - Q1
        limite_inferior = Q1 - 1.5 * IQR
        limite_superior = Q3 + 1.5 * IQR

        # Identificar outliers
        outliers_inferiores = df_imputado[col] < limite_inferior
        outliers_superiores = df_imputado[col] > limite_superior

        # Imputar con la mediana (o con los límites)
        mediana = df_imputado[col].median()
        df_imputado.loc[outliers_inferiores, col] = mediana
        df_imputado.loc[outliers_superiores, col] = mediana

        # Alternativa: imputar con los límites
        # df_imputado.loc[outliers_inferiores, col] = limite_inferior
        # df_imputado.loc[outliers_superiores, col] = limite_superior

    return df_imputado

In [19]:
def generate_csv(input, y_pred, key='Validation', filename='../output/random_forest_improved_predictions.csv'):
    output = input[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].copy()
    output['y_pred'] = y_pred
    output['KEY'] = [key] * len(y_pred)
    output.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [20]:
train_route = '../data/waiting_times_train.csv'
validation_route = '../data/waiting_times_X_test_val.csv'
test_route = '../data/waiting_times_X_test_final.csv'
weather_route = '../data/weather_data.csv'

train_file = pd.read_csv(train_route)
validation_file = pd.read_csv(validation_route)
test_file = pd.read_csv(test_route)
weather_file = pd.read_csv(weather_route)

train_file = imputar_outliers_iqr(train_file, ['WAIT_TIME_IN_2H','CURRENT_WAIT_TIME'])
weather_file = imputar_outliers_iqr(weather_file, ['pressure', 'rain_1h'])

In [21]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values
    
y_train = (X_train['WAIT_TIME_IN_2H'] + 1)
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H'])

In [22]:
X_val = test_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.select_dtypes(include=['number']).drop(columns=[])

In [23]:
y_pred = []

for optimized_estimators, rate in zip(range(100, 3000, 300), np.arange(0.15, 0.06, -0.01)):
    model = XGBRegressor(n_estimators=optimized_estimators, learning_rate=rate)
    model.fit(X_train, y_train)
    y_pred.append((model.predict(X_val)))


In [24]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values
    
y_train = (X_train['WAIT_TIME_IN_2H'] + 1)
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','DOWNTIME'])

In [25]:
X_val = test_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.select_dtypes(include=['number']).drop(columns=[ 'DOWNTIME'])

In [26]:
for optimized_estimators, rate in zip(range(100, 3000, 300), np.arange(0.15, 0.06, -0.01)):
    model = XGBRegressor(n_estimators=optimized_estimators, learning_rate=rate)
    model.fit(X_train, y_train)
    y_pred.append((model.predict(X_val)))

In [27]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values
    
y_train = (X_train['WAIT_TIME_IN_2H'] + 1)
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','minute'])

In [28]:
X_val = test_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.select_dtypes(include=['number']).drop(columns=['minute'])

In [29]:
for optimized_estimators, rate in zip(range(100, 3000, 300), np.arange(0.15, 0.06, -0.01)):
    model = CatBoostRegressor(n_estimators=optimized_estimators, learning_rate=rate)
    model.fit(X_train, y_train)
    y_pred.append((model.predict(X_val)))

0:	learn: 12.8719286	total: 8.14ms	remaining: 806ms
1:	learn: 12.2434070	total: 15.1ms	remaining: 740ms
2:	learn: 11.7329220	total: 26.4ms	remaining: 854ms
3:	learn: 11.3370694	total: 38.3ms	remaining: 919ms
4:	learn: 11.0308083	total: 49.8ms	remaining: 946ms
5:	learn: 10.7798045	total: 61ms	remaining: 955ms
6:	learn: 10.5666410	total: 76.5ms	remaining: 1.02s
7:	learn: 10.3860271	total: 88.3ms	remaining: 1.01s
8:	learn: 10.2580533	total: 99.4ms	remaining: 1s
9:	learn: 10.1412913	total: 108ms	remaining: 975ms
10:	learn: 10.0471865	total: 117ms	remaining: 949ms
11:	learn: 9.9715672	total: 124ms	remaining: 911ms
12:	learn: 9.8994746	total: 133ms	remaining: 890ms
13:	learn: 9.8406477	total: 139ms	remaining: 854ms
14:	learn: 9.7899864	total: 147ms	remaining: 831ms
15:	learn: 9.7519783	total: 153ms	remaining: 805ms
16:	learn: 9.7151512	total: 161ms	remaining: 787ms
17:	learn: 9.6699405	total: 168ms	remaining: 765ms
18:	learn: 9.6262842	total: 174ms	remaining: 743ms
19:	learn: 9.5866242	total

In [30]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values
    
y_train = (X_train['WAIT_TIME_IN_2H'])
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','TIME_TO_PARADE_1','rain_1h'])

In [31]:
X_val = test_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.select_dtypes(include=['number']).drop(columns=['rain_1h','TIME_TO_PARADE_1'])

In [32]:
for optimized_estimators, rate in zip(range(100, 3000, 300), np.arange(0.15, 0.06, -0.01)):
    model = CatBoostRegressor(n_estimators=optimized_estimators, learning_rate=rate)
    model.fit(X_train, y_train)
    y_pred.append((model.predict(X_val)))

0:	learn: 12.8893473	total: 14.2ms	remaining: 1.4s
1:	learn: 12.2433923	total: 30.8ms	remaining: 1.51s
2:	learn: 11.7455550	total: 47.2ms	remaining: 1.53s
3:	learn: 11.3682603	total: 72.5ms	remaining: 1.74s
4:	learn: 11.0657072	total: 83.5ms	remaining: 1.59s
5:	learn: 10.8051403	total: 93.5ms	remaining: 1.47s
6:	learn: 10.6068785	total: 105ms	remaining: 1.39s
7:	learn: 10.4360937	total: 115ms	remaining: 1.32s
8:	learn: 10.3007033	total: 125ms	remaining: 1.26s
9:	learn: 10.1835998	total: 135ms	remaining: 1.21s
10:	learn: 10.0840234	total: 143ms	remaining: 1.16s
11:	learn: 10.0074966	total: 154ms	remaining: 1.13s
12:	learn: 9.9353214	total: 165ms	remaining: 1.1s
13:	learn: 9.8718418	total: 175ms	remaining: 1.07s
14:	learn: 9.8197054	total: 189ms	remaining: 1.07s
15:	learn: 9.7771274	total: 207ms	remaining: 1.09s
16:	learn: 9.7254664	total: 222ms	remaining: 1.08s
17:	learn: 9.6857394	total: 238ms	remaining: 1.09s
18:	learn: 9.6561994	total: 259ms	remaining: 1.1s
19:	learn: 9.6257867	total

In [33]:
y_pred_final = np.mean(y_pred, axis=0)
generate_csv(input=test_file, y_pred=y_pred_final, key='Final', filename='../outpu_final/xgboost_chatboost_ensemble_predictions.csv')

Predictions saved to ../outpu_final/xgboost_chatboost_ensemble_predictions.csv
