In [237]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [238]:
train_route = '../data/waiting_times_train.csv'
validation_route = '../data/waiting_times_X_test_val.csv'
test_route = '../data/waiting_times_X_test_final.csv'
weather_route = '../data/weather_data.csv'

train_file = pd.read_csv(train_route)
validation_file = pd.read_csv(validation_route)
test_file = pd.read_csv(test_route)
weather_file = pd.read_csv(weather_route)

In [239]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
print(X_train.columns)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

X_train = X_train.sort_values(by='DATETIME')
print(X_train.head())

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values

X_train = X_train.sort_index()
y_train = X_train['WAIT_TIME_IN_2H']
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','DOWNTIME','minute','clouds_all'])

Index(['DATETIME', 'ENTITY_DESCRIPTION_SHORT', 'ADJUST_CAPACITY', 'DOWNTIME',
       'CURRENT_WAIT_TIME', 'TIME_TO_PARADE_1', 'TIME_TO_PARADE_2',
       'TIME_TO_NIGHT_SHOW', 'WAIT_TIME_IN_2H', 'temp', 'dew_point',
       'feels_like', 'pressure', 'humidity', 'wind_speed', 'rain_1h',
       'snow_1h', 'clouds_all'],
      dtype='object')
                 DATETIME ENTITY_DESCRIPTION_SHORT  ADJUST_CAPACITY  DOWNTIME  \
5927  2018-10-01 12:15:00               Water Ride            224.5         0   
8149  2018-10-01 12:15:00           Flying Coaster            756.0         0   
12003 2018-10-01 12:15:00              Pirate Ship            153.0         0   
13978 2018-10-01 12:30:00              Pirate Ship            153.0         0   
23029 2018-10-01 12:30:00               Water Ride            224.5         0   

       CURRENT_WAIT_TIME  TIME_TO_PARADE_1  TIME_TO_PARADE_2  \
5927                  20             315.0              -5.0   
8149                  10             315.0   

In [240]:
optimized_estimators = 450
'''max_score = float('-inf')
for n in range(100, 5000, 100):
    model = XGBRegressor(n_estimators=n, learning_rate=0.1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
    score_mean = scores.mean()
    print(f"n_estimators: {n}, CV Mean RMSE: {-score_mean}")
    if score_mean > max_score:
        optimized_estimators = n
        max_score = score_mean'''



model = CatBoostRegressor(n_estimators=optimized_estimators, learning_rate=0.1, random_state=10)
scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
evaluate_model(y_train, y_pred)

importances = model.get_feature_importance()
feature_names = X_train.columns

for name, imp in zip(feature_names, importances):
    print(f"{name}: {imp:.4f}")


0:	learn: 13.6939284	total: 8.65ms	remaining: 3.88s
1:	learn: 13.1853714	total: 15ms	remaining: 3.35s
2:	learn: 12.7540500	total: 27.1ms	remaining: 4.04s
3:	learn: 12.3692089	total: 35.7ms	remaining: 3.98s
4:	learn: 12.0379436	total: 43.5ms	remaining: 3.87s
5:	learn: 11.7517504	total: 49.3ms	remaining: 3.65s
6:	learn: 11.5085719	total: 57.1ms	remaining: 3.61s
7:	learn: 11.2927083	total: 63.4ms	remaining: 3.5s
8:	learn: 11.1010295	total: 75.4ms	remaining: 3.69s
9:	learn: 10.9341883	total: 82.4ms	remaining: 3.63s
10:	learn: 10.8004874	total: 93.3ms	remaining: 3.72s
11:	learn: 10.6775950	total: 102ms	remaining: 3.73s
12:	learn: 10.5617601	total: 112ms	remaining: 3.77s
13:	learn: 10.4699982	total: 122ms	remaining: 3.8s
14:	learn: 10.3830490	total: 129ms	remaining: 3.75s
15:	learn: 10.2998029	total: 138ms	remaining: 3.73s
16:	learn: 10.2227520	total: 145ms	remaining: 3.69s
17:	learn: 10.1556331	total: 152ms	remaining: 3.64s
18:	learn: 10.0931338	total: 158ms	remaining: 3.59s
19:	learn: 10.0

In [241]:
X_val = test_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

X_val = X_val.sort_values(by='DATETIME')
X_val['rain_1h__6'] = X_val['rain_1h'].shift(-6)
X_val['rain_1h__3'] = X_val['rain_1h'].shift(-3)

X_val['snow_1h__6'] = X_val['snow_1h'].shift(-6)
X_val['snow_1h__3'] = X_val['snow_1h'].shift(-3)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.sort_index()
X_val = X_val.select_dtypes(include=['number']).drop(columns=['DOWNTIME','minute','clouds_all'])

In [242]:
y_pred_final = model.predict(X_val)

In [243]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [244]:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print("                Model Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2 ): {r2}")

In [245]:
def generate_csv(input, y_pred, key='Final', filename='../outpu_final/TEST_cast_boost.csv'):
    output = input[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].copy()
    output['y_pred'] = y_pred
    output['KEY'] = [key] * len(y_pred)
    output.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [246]:
train_route = '../data/waiting_times_train.csv'
validation_route = '../data/waiting_times_X_test_val.csv'
test_route = '../data/waiting_times_X_test_final.csv'
weather_route = '../data/weather_data.csv'

train_file = pd.read_csv(train_route)
validation_file = pd.read_csv(validation_route)
test_file = pd.read_csv(test_route)
weather_file = pd.read_csv(weather_route)

In [247]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

X_train = X_train.sort_values(by='DATETIME')
X_train['rain_1h__6'] = X_train['rain_1h'].shift(-6)
X_train['rain_1h__3'] = X_train['rain_1h'].shift(-3)

X_train['snow_1h__6'] = X_train['snow_1h'].shift(-6)
X_train['snow_1h__3'] = X_train['snow_1h'].shift(-3)

print(X_train.head())

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values

X_train = X_train.sort_index()
y_train = X_train['WAIT_TIME_IN_2H']
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','clouds_all', 'humidity'])

                 DATETIME  ADJUST_CAPACITY  DOWNTIME  CURRENT_WAIT_TIME  \
5927  2018-10-01 12:15:00            224.5         0                 20   
8149  2018-10-01 12:15:00            756.0         0                 10   
12003 2018-10-01 12:15:00            153.0         0                 25   
13978 2018-10-01 12:30:00            153.0         0                 25   
23029 2018-10-01 12:30:00            224.5         0                 20   

       TIME_TO_PARADE_1  TIME_TO_PARADE_2  TIME_TO_NIGHT_SHOW  \
5927              315.0              -5.0               465.0   
8149              315.0              -5.0               465.0   
12003             315.0              -5.0               465.0   
13978             300.0             -20.0               450.0   
23029             300.0             -20.0               450.0   

       WAIT_TIME_IN_2H   temp  dew_point  ...  \
5927              20.0  13.01       5.43  ...   
8149              10.0  13.01       5.43  ...   
12003      

In [248]:
optimized_estimators = 1900
'''max_score = float('-inf')
for n in range(100, 5000, 100):
    model = XGBRegressor(n_estimators=n, learning_rate=0.1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
    score_mean = scores.mean()
    print(f"n_estimators: {n}, CV Mean RMSE: {-score_mean}")
    if score_mean > max_score:
        optimized_estimators = n
        max_score = score_mean'''



model = XGBRegressor(n_estimators=optimized_estimators, learning_rate=0.009, random_state=10)
scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
evaluate_model(y_train, y_pred)

-7.674991786485701 0.13117967004688835
                Model Evaluation Metrics:
Mean Absolute Error (MAE): 4.974875815857568
Root Mean Squared Error (RMSE): 6.620973309131593
R-squared (R2 ): 0.7850867208852705


In [249]:
X_val = test_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

X_val = X_val.sort_values(by='DATETIME')
X_val['rain_1h__6'] = X_val['rain_1h'].shift(-6)
X_val['rain_1h__3'] = X_val['rain_1h'].shift(-3)

X_val['snow_1h__6'] = X_val['snow_1h'].shift(-6)
X_val['snow_1h__3'] = X_val['snow_1h'].shift(-3)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.sort_index()
X_val = X_val.select_dtypes(include=['number']).drop(columns=['clouds_all','humidity'])

In [250]:
y_pred_final_xgb = model.predict(X_val)

In [251]:
import pandas as pd
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [252]:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print("                Model Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2 ): {r2}")

In [253]:
def generate_csv(input, y_pred, key='Final', filename='../outpu_final/TEST_cat_boost.csv'):
    output = input[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].copy()
    output['y_pred'] = y_pred
    output['KEY'] = [key] * len(y_pred)
    output.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [254]:
train_route = '../data/waiting_times_train.csv'
validation_route = '../data/waiting_times_X_test_val.csv'
test_route = '../data/waiting_times_X_test_final.csv'
weather_route = '../data/weather_data.csv'

train_file = pd.read_csv(train_route)
validation_file = pd.read_csv(validation_route)
test_file = pd.read_csv(test_route)
weather_file = pd.read_csv(weather_route)

In [255]:
X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train['minute'] = X_train['DATETIME'].dt.minute

X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

X_train = X_train.sort_values(by='DATETIME')
X_train['rain_1h__6'] = X_train['rain_1h'].shift(-6)
X_train['rain_1h__3'] = X_train['rain_1h'].shift(-3)

X_train['snow_1h__6'] = X_train['snow_1h'].shift(-6)
X_train['snow_1h__3'] = X_train['snow_1h'].shift(-3)

print(X_train.head())

# Fill NaNs by column
for col in X_train.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_train[col].mean()
    std = X_train[col].std()
    
    # Find number of NaNs
    n_nan = X_train[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_train.loc[X_train[col].isna(), col] = random_values

X_train = X_train.sort_index()
y_train = X_train['WAIT_TIME_IN_2H']
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','pressure', 'minute','dew_point'])

                 DATETIME  ADJUST_CAPACITY  DOWNTIME  CURRENT_WAIT_TIME  \
5927  2018-10-01 12:15:00            224.5         0                 20   
8149  2018-10-01 12:15:00            756.0         0                 10   
12003 2018-10-01 12:15:00            153.0         0                 25   
13978 2018-10-01 12:30:00            153.0         0                 25   
23029 2018-10-01 12:30:00            224.5         0                 20   

       TIME_TO_PARADE_1  TIME_TO_PARADE_2  TIME_TO_NIGHT_SHOW  \
5927              315.0              -5.0               465.0   
8149              315.0              -5.0               465.0   
12003             315.0              -5.0               465.0   
13978             300.0             -20.0               450.0   
23029             300.0             -20.0               450.0   

       WAIT_TIME_IN_2H   temp  dew_point  ...  \
5927              20.0  13.01       5.43  ...   
8149              10.0  13.01       5.43  ...   
12003      

In [256]:
optimized_estimators = 4500
'''max_score = float('-inf')
for n in range(100, 5000, 100):
    model = XGBRegressor(n_estimators=n, learning_rate=0.1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
    score_mean = scores.mean()
    print(f"n_estimators: {n}, CV Mean RMSE: {-score_mean}")
    if score_mean > max_score:
        optimized_estimators = n
        max_score = score_mean'''



model = LGBMRegressor(n_estimators=optimized_estimators, learning_rate=0.004, random_state=10)
scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
print(scores.mean(), scores.std())
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
evaluate_model(y_train, y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3511
[LightGBM] [Info] Number of data points in the train set: 29614, number of used features: 23
[LightGBM] [Info] Start training from score 23.677484
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3502
[LightGBM] [Info] Number of data points in the train set: 29614, number of used features: 23
[LightGBM] [Info] Start training from score 23.534646
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3508
[LightGBM] [Info] Number of data points in the train s

In [257]:
X_val = test_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)
X_val['year'] = X_val['DATETIME'].dt.year
X_val['month'] = X_val['DATETIME'].dt.month
X_val['day'] = X_val['DATETIME'].dt.day
X_val['hour'] = X_val['DATETIME'].dt.hour
X_val['minute'] = X_val['DATETIME'].dt.minute

X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

X_val = X_val.sort_values(by='DATETIME')
X_val['rain_1h__6'] = X_val['rain_1h'].shift(-6)
X_val['rain_1h__3'] = X_val['rain_1h'].shift(-3)

X_val['snow_1h__6'] = X_val['snow_1h'].shift(-6)
X_val['snow_1h__3'] = X_val['snow_1h'].shift(-3)

# Fill NaNs by column
for col in X_val.select_dtypes(include=['number']).columns:
    # Compute mean and std of non-NaN values
    mean = X_val[col].mean()
    std = X_val[col].std()
    
    # Find number of NaNs
    n_nan = X_val[col].isna().sum()
    
    # Generate random numbers for NaNs
    random_values = np.random.normal(loc=mean, scale=std, size=n_nan)
    
    # Fill NaNs
    X_val.loc[X_val[col].isna(), col] = random_values

X_val = X_val.sort_index()
X_val = X_val.select_dtypes(include=['number']).drop(columns=['pressure', 'minute','dew_point'])

In [258]:
y_pred_final_lgbm = model.predict(X_val)

In [259]:
y_pred = (y_pred_final + y_pred_final_xgb+ y_pred_final_lgbm) / 3
generate_csv(test_file, y_pred)

Predictions saved to ../outpu_final/TEST_cat_boost.csv
