In [200]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [201]:
train_route = '../data/waiting_times_train.csv'
validation_route = '../data/waiting_times_X_test_val.csv'
test_route = '../data/waiting_times_X_test_final.csv'
weather_route = '../data/weather_data.csv'

train_file = pd.read_csv(train_route)
validation_file = pd.read_csv(validation_route)
test_file = pd.read_csv(test_route)
weather_file = pd.read_csv(weather_route)

In [202]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
print(X_train.columns)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

X_train = X_train.sort_values(by='DATETIME')
print(X_train.head())

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values

X_train = X_train.sort_index()
y_train = X_train['WAIT_TIME_IN_2H']
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','DOWNTIME','minute','clouds_all'])

Index(['DATETIME', 'ENTITY_DESCRIPTION_SHORT', 'ADJUST_CAPACITY', 'DOWNTIME',
       'CURRENT_WAIT_TIME', 'TIME_TO_PARADE_1', 'TIME_TO_PARADE_2',
       'TIME_TO_NIGHT_SHOW', 'WAIT_TIME_IN_2H', 'temp', 'dew_point',
       'feels_like', 'pressure', 'humidity', 'wind_speed', 'rain_1h',
       'snow_1h', 'clouds_all'],
      dtype='object')
                 DATETIME ENTITY_DESCRIPTION_SHORT  ADJUST_CAPACITY  DOWNTIME  \
5927  2018-10-01 12:15:00               Water Ride            224.5         0   
8149  2018-10-01 12:15:00           Flying Coaster            756.0         0   
12003 2018-10-01 12:15:00              Pirate Ship            153.0         0   
13978 2018-10-01 12:30:00              Pirate Ship            153.0         0   
23029 2018-10-01 12:30:00               Water Ride            224.5         0   

       CURRENT_WAIT_TIME  TIME_TO_PARADE_1  TIME_TO_PARADE_2  \
5927                  20             315.0              -5.0   
8149                  10             315.0   

In [203]:
optimized_estimators = 450
'''max_score = float('-inf')
for n in range(100, 5000, 100):
    model = XGBRegressor(n_estimators=n, learning_rate=0.1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
    score_mean = scores.mean()
    print(f"n_estimators: {n}, CV Mean RMSE: {-score_mean}")
    if score_mean > max_score:
        optimized_estimators = n
        max_score = score_mean'''



model = CatBoostRegressor(n_estimators=optimized_estimators, learning_rate=0.1, random_state=10)
scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
evaluate_model(y_train, y_pred)

importances = model.get_feature_importance()
feature_names = X_train.columns

for name, imp in zip(feature_names, importances):
    print(f"{name}: {imp:.4f}")


0:	learn: 13.6939284	total: 7.22ms	remaining: 3.24s
1:	learn: 13.1832137	total: 13.7ms	remaining: 3.07s
2:	learn: 12.7519195	total: 18.4ms	remaining: 2.73s
3:	learn: 12.3671556	total: 22.8ms	remaining: 2.54s
4:	learn: 12.0347290	total: 29.2ms	remaining: 2.6s
5:	learn: 11.7486297	total: 35.1ms	remaining: 2.6s
6:	learn: 11.5052395	total: 40.5ms	remaining: 2.56s
7:	learn: 11.2905067	total: 46.8ms	remaining: 2.59s
8:	learn: 11.1044486	total: 51.3ms	remaining: 2.51s
9:	learn: 10.9360221	total: 55.7ms	remaining: 2.45s
10:	learn: 10.7969761	total: 62.8ms	remaining: 2.51s
11:	learn: 10.6736685	total: 70.8ms	remaining: 2.58s
12:	learn: 10.5604389	total: 78.7ms	remaining: 2.64s
13:	learn: 10.4625155	total: 87.5ms	remaining: 2.72s
14:	learn: 10.3762116	total: 93.4ms	remaining: 2.71s
15:	learn: 10.2974851	total: 99.8ms	remaining: 2.71s
16:	learn: 10.2214072	total: 104ms	remaining: 2.65s
17:	learn: 10.1589933	total: 110ms	remaining: 2.63s
18:	learn: 10.1005729	total: 115ms	remaining: 2.6s
19:	learn

In [204]:
X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

X_val = X_val.sort_values(by='DATETIME')
X_val['rain_1h__6'] = X_val['rain_1h'].shift(-6)
X_val['rain_1h__3'] = X_val['rain_1h'].shift(-3)

X_val['snow_1h__6'] = X_val['snow_1h'].shift(-6)
X_val['snow_1h__3'] = X_val['snow_1h'].shift(-3)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.sort_index()
X_val = X_val.select_dtypes(include=['number']).drop(columns=['DOWNTIME','minute','clouds_all'])

In [205]:
y_pred_final = model.predict(X_val)

In [206]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [207]:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print("                Model Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2 ): {r2}")

In [208]:
def generate_csv(input, y_pred, key='Validation', filename='../output/random_forest_improved_predictions.csv'):
    output = input[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].copy()
    output['y_pred'] = y_pred
    output['KEY'] = [key] * len(y_pred)
    output.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [209]:
train_route = '../data/waiting_times_train.csv'
validation_route = '../data/waiting_times_X_test_val.csv'
test_route = '../data/waiting_times_X_test_final.csv'
weather_route = '../data/weather_data.csv'

train_file = pd.read_csv(train_route)
validation_file = pd.read_csv(validation_route)
test_file = pd.read_csv(test_route)
weather_file = pd.read_csv(weather_route)

In [None]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

X_train = X_train.sort_values(by='DATETIME')
X_train['rain_1h__6'] = X_train['rain_1h'].shift(-6)
X_train['rain_1h__3'] = X_train['rain_1h'].shift(-3)

X_train['snow_1h__6'] = X_train['snow_1h'].shift(-6)
X_train['snow_1h__3'] = X_train['snow_1h'].shift(-3)

print(X_train.head())

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values

X_train = X_train.sort_index()
y_train = X_train['WAIT_TIME_IN_2H']
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','clouds_all', 'humidity'])

                 DATETIME  ADJUST_CAPACITY  DOWNTIME  CURRENT_WAIT_TIME  \
5927  2018-10-01 12:15:00            224.5         0                 20   
8149  2018-10-01 12:15:00            756.0         0                 10   
12003 2018-10-01 12:15:00            153.0         0                 25   
13978 2018-10-01 12:30:00            153.0         0                 25   
23029 2018-10-01 12:30:00            224.5         0                 20   

       TIME_TO_PARADE_1  TIME_TO_PARADE_2  TIME_TO_NIGHT_SHOW  \
5927              315.0              -5.0               465.0   
8149              315.0              -5.0               465.0   
12003             315.0              -5.0               465.0   
13978             300.0             -20.0               450.0   
23029             300.0             -20.0               450.0   

       WAIT_TIME_IN_2H   temp  dew_point  ...  \
5927              20.0  13.01       5.43  ...   
8149              10.0  13.01       5.43  ...   
12003      

In [211]:
optimized_estimators = 1900
'''max_score = float('-inf')
for n in range(100, 5000, 100):
    model = XGBRegressor(n_estimators=n, learning_rate=0.1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
    score_mean = scores.mean()
    print(f"n_estimators: {n}, CV Mean RMSE: {-score_mean}")
    if score_mean > max_score:
        optimized_estimators = n
        max_score = score_mean'''



model = XGBRegressor(n_estimators=optimized_estimators, learning_rate=0.009, random_state=10)
scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
evaluate_model(y_train, y_pred)

-7.6567120632439565 0.12498849982052476
                Model Evaluation Metrics:
Mean Absolute Error (MAE): 4.98972291063928
Root Mean Squared Error (RMSE): 6.624667358649011
R-squared (R2 ): 0.7848468402124719


In [212]:
X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

X_val = X_val.sort_values(by='DATETIME')
X_val['rain_1h__6'] = X_val['rain_1h'].shift(-6)
X_val['rain_1h__3'] = X_val['rain_1h'].shift(-3)

X_val['snow_1h__6'] = X_val['snow_1h'].shift(-6)
X_val['snow_1h__3'] = X_val['snow_1h'].shift(-3)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.sort_index()
X_val = X_val.select_dtypes(include=['number']).drop(columns=['clouds_all','humidity'])

In [213]:
y_pred_final_xgb = model.predict(X_val)

In [214]:
import pandas as pd
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [215]:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print("                Model Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2 ): {r2}")

In [216]:
def generate_csv(input, y_pred, key='Validation', filename='../output/random_forest_improved_predictions.csv'):
    output = input[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].copy()
    output['y_pred'] = y_pred
    output['KEY'] = [key] * len(y_pred)
    output.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [217]:
train_route = '../data/waiting_times_train.csv'
validation_route = '../data/waiting_times_X_test_val.csv'
test_route = '../data/waiting_times_X_test_final.csv'
weather_route = '../data/weather_data.csv'

train_file = pd.read_csv(train_route)
validation_file = pd.read_csv(validation_route)
test_file = pd.read_csv(test_route)
weather_file = pd.read_csv(weather_route)

In [218]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

X_train = X_train.sort_values(by='DATETIME')
X_train['rain_1h__6'] = X_train['rain_1h'].shift(-6)
X_train['rain_1h__3'] = X_train['rain_1h'].shift(-3)

X_train['snow_1h__6'] = X_train['snow_1h'].shift(-6)
X_train['snow_1h__3'] = X_train['snow_1h'].shift(-3)

print(X_train.head())

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values

X_train = X_train.sort_index()
y_train = X_train['WAIT_TIME_IN_2H']
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','pressure', 'minute','dew_point'])

                 DATETIME  ADJUST_CAPACITY  DOWNTIME  CURRENT_WAIT_TIME  \
5927  2018-10-01 12:15:00            224.5         0                 20   
8149  2018-10-01 12:15:00            756.0         0                 10   
12003 2018-10-01 12:15:00            153.0         0                 25   
13978 2018-10-01 12:30:00            153.0         0                 25   
23029 2018-10-01 12:30:00            224.5         0                 20   

       TIME_TO_PARADE_1  TIME_TO_PARADE_2  TIME_TO_NIGHT_SHOW  \
5927              315.0              -5.0               465.0   
8149              315.0              -5.0               465.0   
12003             315.0              -5.0               465.0   
13978             300.0             -20.0               450.0   
23029             300.0             -20.0               450.0   

       WAIT_TIME_IN_2H   temp  dew_point  ...  \
5927              20.0  13.01       5.43  ...   
8149              10.0  13.01       5.43  ...   
12003      

In [219]:
optimized_estimators = 4500
'''max_score = float('-inf')
for n in range(100, 5000, 100):
    model = XGBRegressor(n_estimators=n, learning_rate=0.1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
    score_mean = scores.mean()
    print(f"n_estimators: {n}, CV Mean RMSE: {-score_mean}")
    if score_mean > max_score:
        optimized_estimators = n
        max_score = score_mean'''



model = LGBMRegressor(n_estimators=optimized_estimators, learning_rate=0.004, random_state=10)
scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
evaluate_model(y_train, y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003775 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3511
[LightGBM] [Info] Number of data points in the train set: 29614, number of used features: 23
[LightGBM] [Info] Start training from score 23.677484
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3502
[LightGBM] [Info] Number of data points in the train set: 29614, number of used features: 23
[LightGBM] [Info] Start training from score 23.534646
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006497 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3508
[LightGBM] [Info] Number of data points in the train set: 29614, number of used features: 23
[LightGBM] [Info] Start t

In [222]:
X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

X_val = X_val.sort_values(by='DATETIME')
X_val['rain_1h__6'] = X_val['rain_1h'].shift(-6)
X_val['rain_1h__3'] = X_val['rain_1h'].shift(-3)

X_val['snow_1h__6'] = X_val['snow_1h'].shift(-6)
X_val['snow_1h__3'] = X_val['snow_1h'].shift(-3)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.sort_index()
X_val = X_val.select_dtypes(include=['number']).drop(columns=['pressure', 'minute','dew_point'])

In [223]:
y_pred_final_lgbm = model.predict(X_val)

In [224]:
y_pred = (y_pred_final + y_pred_final_xgb+ y_pred_final_lgbm) / 3
generate_csv(validation_file, y_pred, key='Validation', filename='../output/ensemble_catboost_xgboost_predictions.csv')

Predictions saved to ../output/ensemble_catboost_xgboost_predictions.csv
