In [36]:
from typing import Any, Dict, Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

In [8]:
# Read the dataset from its parquet file (path is relative to the notebook's
# working directory); head() previews the first five rows as a sanity check.
dataset = pd.read_parquet("../lumen_dataset/boris_step_1.parquet")
dataset.head()

Unnamed: 0,room_cnt,day_of_week,day_of_year,weather,scaled_room_id,isEvent
0,1,2,1,0.5,0.982,0
1,3,2,1,0.5,0.038,0
2,4,2,1,0.5,1.0,0
3,11,2,1,0.5,0.241,0
4,3,2,1,0.5,0.011,0


## Dataset Setup

In [9]:
# 'room_cnt' is the regression target; every remaining column is a feature.
y = dataset['room_cnt']
X = dataset.drop(columns='room_cnt')

The `random_state` parameter makes the split reproducible: re-running the code with the same `random_state` always produces the same train/test split.

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Bare last expression -> the notebook renders the frame with its rich repr
# instead of the plain-text dump that print() produces.
X_train.head()

      day_of_week  day_of_year  weather  scaled_room_id  isEvent
1042            1          210      0.0           0.241        0
2661            2          181      1.0           1.000        0
2715            7          193      0.0           0.241        0
1258            2          253      0.0           0.337        0
756             6          152      0.0           0.982        0


## Model Training

Map each scaled room id back to its original room category id so that the printed predictions are easier to read.

In [23]:
# Reverse lookup: scaled room id (as stored in the feature matrix) -> original
# room category id. Listed as (room id, scaled id) pairs so the table reads in
# room-id order; the resulting dict is keyed by the scaled value.
scaled_to_normal_id = {
    scaled_id: room_id
    for room_id, scaled_id in (
        (1, 0.001),
        (2, 0.982),
        (3, 0.337),
        (4, 0.038),
        (5, 1.000),
        (6, 0.241),
        (7, 0.011),
        (11, 0.002),
    )
}

### Utils For Evaluation

In [33]:
def _normalise_room_key(value: Any, decimals: int = 3) -> Any:
    """Round a scaled room id to `decimals` places; return it unchanged if it is not numeric."""
    try:
        return round(float(value), decimals)
    except (TypeError, ValueError):
        return value


def print_predictions(
    model: BaseEstimator,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    num_predictions: int = 10,
    room_id_map: Optional[Dict[float, Any]] = None,
) -> None:
    """
    Print the model's predictions next to the actual 'room_cnt' values.

    For the first `num_predictions` rows of the test set, one line is printed
    with the input features, the room category (scaled_room_id translated back
    to the original id), the actual value and the prediction (2 decimals).

    Parameters:
    - model: The trained machine learning model; only its `predict` method is used.
    - X_test: The test features. Must contain the columns day_of_week,
      day_of_year, weather, scaled_room_id and isEvent.
    - y_test: The actual target values, aligned row by row with X_test.
    - num_predictions: The number of predictions to print. Default is 10.
    - room_id_map: Mapping from scaled room id to room category id. Defaults to
      the notebook-level `scaled_to_normal_id`. Keys are compared after rounding
      to 3 decimals, so they must be distinct at that precision.

    Returns:
    - None. The report is written to stdout.
    """
    if room_id_map is None:
        room_id_map = scaled_to_normal_id

    # Exact float equality is fragile for dict lookups: a scaled id that was
    # stored as float32 (or picked up any rounding noise on the way) no longer
    # equals the literal key and would silently show up as "Unknown".
    # Rounding both sides to the precision of the keys makes the match robust.
    rounded_lookup = {_normalise_room_key(scaled): room for scaled, room in room_id_map.items()}

    predictions = model.predict(X_test)

    print("Input Params and Predictions vs Actual Values for 'room_cnt'")
    print("-" * 70)
    for index, (input_params, true_value, prediction) in enumerate(zip(X_test.itertuples(index=False), y_test, predictions)):
        if index >= num_predictions:
            break
        room_category_id = rounded_lookup.get(_normalise_room_key(input_params.scaled_room_id), "Unknown")
        print(f"Input Params: Day of Week: {input_params.day_of_week}, Day of Year: {input_params.day_of_year}, Weather: {input_params.weather}, Room Category: {room_category_id}, Is Event: {input_params.isEvent}, Actual: {true_value}, Predicted: {prediction:.2f}")


In [31]:
def evaluate_metrics(model: BaseEstimator, X_test: pd.DataFrame, y_test: pd.Series) -> None:
    """
    Evaluate a regression model on the test set and print MAE, MSE, RMSE and R^2 as a table.

    Parameters:
    - model: The trained machine learning model; only its `predict` method is used.
    - X_test: The test features, expected to be a Pandas DataFrame.
    - y_test: The actual target values, expected to be a Pandas Series.

    Returns:
    - None. The table is written to stdout with values shown to 4 decimals.
    """
    predictions = model.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)  # back in the units of the target
    r2 = r2_score(y_test, predictions)

    metrics_df = pd.DataFrame({
        "Metric": ["Mean Absolute Error (MAE)", "Mean Squared Error (MSE)", "Root Mean Squared Error (RMSE)", "R^2 Score"],
        "Value": [mae, mse, rmse, r2]
    })

    # Apply the 4-decimal format only while rendering this table. Setting it
    # with pd.set_option would silently change how every later DataFrame in
    # the notebook is displayed, depending on whether this function had run.
    with pd.option_context('display.float_format', lambda x: '%.4f' % x):
        print(metrics_df.to_string(index=False))

In [40]:
def find_hyperparameters(model: BaseEstimator, param_grid: Dict[str, Any], X_train: pd.DataFrame, y_train: pd.Series, cv: int = 5) -> Tuple[Dict[str, Any], float]:
    """
    Run an exhaustive grid search and report the best hyperparameter combination.

    Every combination in `param_grid` is cross-validated with GridSearchCV,
    scored by negative mean squared error (closer to zero is better), using
    all available cores (n_jobs=-1).

    Parameters:
    - model: The estimator whose hyperparameters are tuned.
    - param_grid: Hyperparameter names mapped to the candidate values to try.
    - X_train: The training feature dataset.
    - y_train: The training target vector.
    - cv: Number of cross-validation folds. Default is 5.

    Returns:
    - A tuple (best hyperparameters, best score), where the score is the
      negative mean squared error reported by the search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    winner = (search.best_params_, search.best_score_)

    print("Best Hyperparameters:", winner[0])
    print("Best Score (Neg. Mean Squared Error):", winner[1])

    return winner

#### Linear Regression

In [14]:
# Baseline model. fit() returns the fitted estimator itself, so construction and
# training can be chained; the bare name keeps the estimator as the cell's output.
linear_regression_model = LinearRegression().fit(X_train, y_train)
linear_regression_model

In [32]:
# Test-set metrics for the linear baseline.
evaluate_metrics(model=linear_regression_model, X_test=X_test, y_test=y_test)

                        Metric    Value
     Mean Absolute Error (MAE)   8.0322
      Mean Squared Error (MSE) 153.0743
Root Mean Squared Error (RMSE)  12.3723
                     R^2 Score   0.2709


In [26]:
# Spot-check individual predictions of the linear baseline against the actual values.
print_predictions(model=linear_regression_model, X_test=X_test, y_test=y_test)

Input Params and Predictions vs Actual Values for 'room_cnt'
----------------------------------------------------------------------
Input Params: Day of Week: 4, Day of Year: 141, Weather: 0.0, Room Category: 7), Is Event: 0, Actual: 1, Predicted: 1.13
Input Params: Day of Week: 3, Day of Year: 16, Weather: 1.0, Room Category: 4), Is Event: 0, Actual: 2, Predicted: 0.92
Input Params: Day of Week: 5, Day of Year: 74, Weather: 0.0, Room Category: 7), Is Event: 0, Actual: 2, Predicted: 0.65
Input Params: Day of Week: 7, Day of Year: 69, Weather: 1.0, Room Category: 4), Is Event: 0, Actual: 2, Predicted: 1.35
Input Params: Day of Week: 1, Day of Year: 89, Weather: 1.0, Room Category: 6), Is Event: 0, Actual: 2, Predicted: 5.76
Input Params: Day of Week: 2, Day of Year: 64, Weather: 1.0, Room Category: 3), Is Event: 0, Actual: 17, Predicted: 7.63
Input Params: Day of Week: 1, Day of Year: 196, Weather: 1.0, Room Category: 4), Is Event: 1, Actual: 1, Predicted: 2.23
Input Params: Day of Week

#### Random Forest Regression

In [37]:
# Random forests are randomised (bootstrap samples, feature sub-sampling).
# Seeding the estimator keeps the grid-search ranking reproducible across runs,
# in line with the seeded train/test split.
random_forest_model = RandomForestRegressor(random_state=42)

In [38]:
# Search space for the random forest: every combination of the values below is tried.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],  # None = grow each tree until its leaves are pure
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [41]:
# Exhaustive search over param_grid: 3 * 4 * 3 * 3 = 108 candidates, each
# cross-validated with the default cv=5, i.e. 540 forest fits in total.
# Expect this cell to dominate the notebook's run time.
best_params, best_score = find_hyperparameters(random_forest_model, param_grid, X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best Score (Neg. Mean Squared Error): -112.37303483396242


In [42]:
# Refit a forest with the winning hyperparameters on the full training split.
# The seed makes the final model (and therefore the reported metrics)
# reproducible; best_params is unpacked last so it wins if it ever carries its
# own random_state.
random_forest_optimized_model = RandomForestRegressor(**{'random_state': 42, **best_params})
random_forest_optimized_model.fit(X_train, y_train)

In [43]:
# Test-set metrics for the tuned random forest.
evaluate_metrics(model=random_forest_optimized_model, X_test=X_test, y_test=y_test)

                        Metric   Value
     Mean Absolute Error (MAE)  5.6833
      Mean Squared Error (MSE) 89.5065
Root Mean Squared Error (RMSE)  9.4608
                     R^2 Score  0.5736


In [44]:
# Spot-check individual predictions of the tuned random forest against the actual values.
print_predictions(model=random_forest_optimized_model, X_test=X_test, y_test=y_test)

Input Params and Predictions vs Actual Values for 'room_cnt'
----------------------------------------------------------------------
Input Params: Day of Week: 4, Day of Year: 141, Weather: 0.0, Room Category: 7, Is Event: 0, Actual: 1, Predicted: 1.07
Input Params: Day of Week: 3, Day of Year: 16, Weather: 1.0, Room Category: 4, Is Event: 0, Actual: 2, Predicted: 1.46
Input Params: Day of Week: 5, Day of Year: 74, Weather: 0.0, Room Category: 7, Is Event: 0, Actual: 2, Predicted: 1.30
Input Params: Day of Week: 7, Day of Year: 69, Weather: 1.0, Room Category: 4, Is Event: 0, Actual: 2, Predicted: 1.53
Input Params: Day of Week: 1, Day of Year: 89, Weather: 1.0, Room Category: 6, Is Event: 0, Actual: 2, Predicted: 2.75
Input Params: Day of Week: 2, Day of Year: 64, Weather: 1.0, Room Category: 3, Is Event: 0, Actual: 17, Predicted: 10.12
Input Params: Day of Week: 1, Day of Year: 196, Weather: 1.0, Room Category: 4, Is Event: 1, Actual: 1, Predicted: 1.65
Input Params: Day of Week: 6, D

## Final Results

| Model             | MAE    | MSE      | RMSE    | R^2    |
|-------------------|--------|----------|---------|--------|
| Linear Regression | 8.0322 | 153.0743 | 12.3723 | 0.2709 |
| Random Forest     | 5.6833 | 89.5065  | 9.4608  | 0.5736 |

## Legend

#### Mean Absolute Error (MAE)

- **Formula**: $ MAE = \frac{1}{n} \sum_{i=1}^{n} |y_i - \hat{y}_i| $
- **What it tells us**: MAE measures the average magnitude of the errors in a set of predictions, without considering their direction. It's the average over the test sample of the absolute differences between prediction and actual observation where all individual differences have equal weight.
- **Interpretation**: A lower MAE indicates better model performance. It gives an idea of how wrong the predictions were on average.

#### Mean Squared Error (MSE)

- **Formula**: $ MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 $
- **What it tells us**: MSE measures the average of the squares of the errors, i.e., the average squared difference between the estimated values and the actual values. It places more weight on larger errors compared to smaller ones, due to the squaring of each term.
- **Interpretation**: A lower MSE indicates better model performance. However, because MSE is in squared units of the output variable, it can sometimes be hard to interpret directly.

#### Root Mean Squared Error (RMSE)

- **Formula**: $ RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2} $
- **What it tells us**: RMSE is the square root of the mean of the squared errors. The squaring process penalizes larger errors more heavily than smaller ones, and taking the square root brings the units back to the original units of the output variable.
- **Interpretation**: A lower RMSE indicates better model performance. It's easier to interpret than MSE because it's in the same units as the target variable.

#### Coefficient of Determination (R² Score)

- **Formula**: $ R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2} $
- **What it tells us**: R² score represents the proportion of the variance for the dependent variable that's explained by the independent variables in a regression model. It provides a measure of how well observed outcomes are replicated by the model, based on the proportion of total variation of outcomes explained by the model.
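- **Interpretation**: An R² score of 1 indicates perfect agreement between the actual and predicted values, while an R² score of 0 indicates that the model explains none of the variability of the response data around its mean (it does no better than always predicting the mean). R² can be negative when the model performs worse than that baseline. Higher values are generally better, though a very high R² on the training data that does not carry over to the test data indicates overfitting.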