In [621]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [622]:
def evaluate_model(y_true, y_pred):
    """Print MAE, RMSE and R-squared for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; passed together with ``y_true`` to the
        scikit-learn metric functions.

    Returns
    -------
    None
        The three metrics are only written to standard output.
    """
    # Evaluate the three metrics in the same order in which they are reported.
    mae, rmse, r2 = (
        metric(y_true, y_pred)
        for metric in (mean_absolute_error, root_mean_squared_error, r2_score)
    )

    # A single print call with a newline separator emits the same four lines.
    print(
        "                Model Evaluation Metrics:",
        f"Mean Absolute Error (MAE): {mae}",
        f"Root Mean Squared Error (RMSE): {rmse}",
        f"R-squared (R2 ): {r2}",
        sep="\n",
    )

In [623]:
def generate_csv(input, y_pred, key='Validation', filename='../output/random_forest_improved_predictions.csv'):
    """Write predictions next to their row identifiers as a CSV file.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame providing the ``DATETIME`` and ``ENTITY_DESCRIPTION_SHORT``
        columns. It is not modified; a copy of those two columns is used.
    y_pred : array-like
        One prediction per row of ``input``. A list or NumPy array is paired
        with the rows by position (pandas aligns a Series by index instead).
    key : scalar, default 'Validation'
        Value written to every row of the ``KEY`` column.
    filename : str, path-like or file-like
        Destination handed to ``DataFrame.to_csv``. When it is a path, missing
        parent directories are created first.

    Returns
    -------
    None
        The CSV is written and a confirmation line is printed.

    Raises
    ------
    KeyError
        If ``input`` lacks one of the two identifier columns.
    ValueError
        If the length of ``y_pred`` does not match the number of rows.
    """
    import os  # local import: keeps the cell self-contained

    output = input[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].copy()
    output['y_pred'] = y_pred
    # A scalar is broadcast to every row; no need to build a list first.
    output['KEY'] = key

    # ``to_csv`` does not create directories, so a fresh checkout without the
    # output folder would otherwise fail after the model has been trained.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    output.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [624]:
# Locations of the four input CSV files, relative to the notebook's working
# directory.
train_route, validation_route, test_route, weather_route = (
    '../data/waiting_times_train.csv',
    '../data/waiting_times_X_test_val.csv',
    '../data/waiting_times_X_test_final.csv',
    '../data/weather_data.csv',
)

# Read each file into its own DataFrame, in the same order as the paths above.
train_file, validation_file, test_file, weather_file = map(
    pd.read_csv,
    (train_route, validation_route, test_route, weather_route),
)

In [625]:
# Build the training feature matrix (X_train) and target (y_train).

# Seed for the random NaN imputation below, so that Restart & Run All
# reproduces the same training matrix and therefore the same scores.
TRAIN_IMPUTATION_SEED = 10
train_rng = np.random.default_rng(TRAIN_IMPUTATION_SEED)

X_train = train_file.copy()
X_train['DATETIME'] = pd.to_datetime(X_train['DATETIME'], errors='coerce')

# The weather frame is normalised here and reused by the validation cell.
weather_file['DATETIME'] = pd.to_datetime(weather_file['DATETIME'], errors='coerce')
weather_file = weather_file.fillna(0)

# Attach the weather columns to every row sharing the same timestamp.
X_train = pd.merge(X_train, weather_file, on='DATETIME', how='left')
X_train = pd.get_dummies(X_train, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=True, dtype=int)

# Calendar components of the timestamp as separate numeric features.
timestamps = X_train['DATETIME'].dt
X_train['year'] = timestamps.year
X_train['month'] = timestamps.month
X_train['day'] = timestamps.day
X_train['hour'] = timestamps.hour
X_train['minute'] = timestamps.minute

# Fixed fill values for missing parade times.
# NOTE(review): 250 and 0 are unexplained constants -- presumably stand-ins
# for "no parade"; confirm and move them to a config cell.
X_train['TIME_TO_PARADE_1'] = X_train['TIME_TO_PARADE_1'].fillna(250)
X_train['TIME_TO_PARADE_2'] = X_train['TIME_TO_PARADE_2'].fillna(0)

# Lead features: the rain/snow value found 3 and 6 rows later in time order.
# NOTE(review): shift() moves by ROWS, not by time, and several rows share a
# timestamp (see the head() output), so the look-ahead distance is not a fixed
# duration and ties are ordered arbitrarily -- confirm this is intended.
X_train = X_train.sort_values(by='DATETIME')
for weather_col in ('rain_1h', 'snow_1h'):
    X_train[f'{weather_col}__6'] = X_train[weather_col].shift(-6)
    X_train[f'{weather_col}__3'] = X_train[weather_col].shift(-3)

print(X_train.head())

# A row without a label cannot be learned from; imputing a random target
# would train the model on noise, so such rows are dropped instead.
X_train = X_train.dropna(subset=['WAIT_TIME_IN_2H'])

# Fill the remaining NaNs of each numeric feature with draws from a normal
# distribution fitted to that column's observed values. The target is excluded.
feature_cols = X_train.select_dtypes(include=['number']).columns.drop('WAIT_TIME_IN_2H')
for col in feature_cols:
    missing = X_train[col].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        continue
    X_train.loc[missing, col] = train_rng.normal(
        loc=X_train[col].mean(),
        scale=X_train[col].std(),
        size=n_missing,
    )

# Restore the row order produced by the merge, then split target and features.
X_train = X_train.sort_index()
y_train = X_train['WAIT_TIME_IN_2H']
X_train = X_train.select_dtypes(include=['number']).drop(columns=['WAIT_TIME_IN_2H','clouds_all', 'humidity'])

                 DATETIME  ADJUST_CAPACITY  DOWNTIME  CURRENT_WAIT_TIME  \
5927  2018-10-01 12:15:00            224.5         0                 20   
8149  2018-10-01 12:15:00            756.0         0                 10   
12003 2018-10-01 12:15:00            153.0         0                 25   
13978 2018-10-01 12:30:00            153.0         0                 25   
23029 2018-10-01 12:30:00            224.5         0                 20   

       TIME_TO_PARADE_1  TIME_TO_PARADE_2  TIME_TO_NIGHT_SHOW  \
5927              315.0              -5.0               465.0   
8149              315.0              -5.0               465.0   
12003             315.0              -5.0               465.0   
13978             300.0             -20.0               450.0   
23029             300.0             -20.0               450.0   

       WAIT_TIME_IN_2H   temp  dew_point  ...  \
5927              20.0  13.01       5.43  ...   
8149              10.0  13.01       5.43  ...   
12003      

In [626]:
# Hyper-parameters of the final model and of its cross-validation.
optimized_estimators = 1900
LEARNING_RATE = 0.009
RANDOM_STATE = 10
CV_FOLDS = 5
CV_SCORING = 'neg_root_mean_squared_error'

# Optional sweep over n_estimators; it fits many models under cross-validation,
# so it is off by default and the value above is used as-is.
# It uses the same learning rate as the final model, because the best number
# of trees found at one learning rate does not carry over to another.
RUN_ESTIMATOR_SEARCH = False
if RUN_ESTIMATOR_SEARCH:
    max_score = float('-inf')
    for n in range(100, 5000, 100):
        candidate = XGBRegressor(n_estimators=n, learning_rate=LEARNING_RATE, random_state=RANDOM_STATE)
        candidate_scores = cross_val_score(candidate, X_train, y_train, scoring=CV_SCORING, cv=CV_FOLDS)
        score_mean = candidate_scores.mean()
        print(f"n_estimators: {n}, CV Mean RMSE: {-score_mean}")
        # Scores are negated RMSE, so larger is better.
        if score_mean > max_score:
            optimized_estimators = n
            max_score = score_mean

# Cross-validated score of the chosen configuration (mean and spread of the
# negated RMSE over the folds), followed by a fit on the full training set.
model = XGBRegressor(n_estimators=optimized_estimators, learning_rate=LEARNING_RATE, random_state=RANDOM_STATE)
scores = cross_val_score(model, X_train, y_train, scoring=CV_SCORING, cv=CV_FOLDS)
print(scores.mean(), scores.std())
model.fit(X_train, y_train)

# These metrics are computed on the data the model was just fitted on, so they
# are in-sample; the cross-validation line above is the out-of-sample estimate.
y_pred = model.predict(X_train)
evaluate_model(y_train, y_pred)

-7.660283105824372 0.12363348343254574
                Model Evaluation Metrics:
Mean Absolute Error (MAE): 4.972195285533445
Root Mean Squared Error (RMSE): 6.608539742824717
R-squared (R2 ): 0.7858931369116848


In [627]:
# Build the validation feature matrix (X_val) with the same steps as training.

# Seed for the random NaN imputation below, so that re-running the notebook
# produces the same predictions file.
VAL_IMPUTATION_SEED = 10
val_rng = np.random.default_rng(VAL_IMPUTATION_SEED)

X_val = validation_file.copy()
X_val['DATETIME'] = pd.to_datetime(X_val['DATETIME'], errors='coerce')

X_val = pd.merge(X_val, weather_file, on='DATETIME', how='left')
# Encode every category here; the baseline category dropped during training is
# removed by the reindex at the end of this cell. Dropping "the first" category
# independently would pick a different baseline whenever the validation set
# does not contain exactly the same entities as the training set.
X_val = pd.get_dummies(X_val, columns=['ENTITY_DESCRIPTION_SHORT'], drop_first=False, dtype=int)

# Calendar components of the timestamp as separate numeric features.
timestamps = X_val['DATETIME'].dt
X_val['year'] = timestamps.year
X_val['month'] = timestamps.month
X_val['day'] = timestamps.day
X_val['hour'] = timestamps.hour
X_val['minute'] = timestamps.minute

# Same fixed fill values as in the training cell.
X_val['TIME_TO_PARADE_1'] = X_val['TIME_TO_PARADE_1'].fillna(250)
X_val['TIME_TO_PARADE_2'] = X_val['TIME_TO_PARADE_2'].fillna(0)

# Lead features: the rain/snow value found 3 and 6 rows later in time order.
# NOTE(review): shift() moves by rows, not by time -- see the training cell.
X_val = X_val.sort_values(by='DATETIME')
for weather_col in ('rain_1h', 'snow_1h'):
    X_val[f'{weather_col}__6'] = X_val[weather_col].shift(-6)
    X_val[f'{weather_col}__3'] = X_val[weather_col].shift(-3)

# Fill the NaNs of each numeric column with draws from a normal distribution
# fitted to that column's observed values.
for col in X_val.select_dtypes(include=['number']).columns:
    missing = X_val[col].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        continue
    X_val.loc[missing, col] = val_rng.normal(
        loc=X_val[col].mean(),
        scale=X_val[col].std(),
        size=n_missing,
    )

# Restore the row order of validation_file so predictions line up with it.
X_val = X_val.sort_index()
X_val = X_val.select_dtypes(include=['number']).drop(columns=['clouds_all','humidity'])

# Give the model exactly the columns it was fitted on, in the same order.
# Anything other than the training baseline dummy showing up below means the
# two feature sets have drifted apart and should be investigated.
absent_in_val = [c for c in X_train.columns if c not in X_val.columns]
unused_in_val = [c for c in X_val.columns if c not in X_train.columns]
if absent_in_val:
    print(f"Columns missing from validation data, filled with 0: {absent_in_val}")
if unused_in_val:
    print(f"Columns not used by the model, dropped: {unused_in_val}")
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

In [628]:
# Score the prepared validation features, then export the predictions using
# the defaults of generate_csv for the KEY value and the output path.
y_pred_final = model.predict(X_val)
generate_csv(input=validation_file, y_pred=y_pred_final)

Predictions saved to ../output/random_forest_improved_predictions.csv
