- For some unseen locations, a general model is trained on the entire dataset made by combining the extracted counts from all the camera ids. This model is used to predict turning movement counts for unseen locations for which expected counts are not generated.

- For training the models, the same approach is used here as explained in `train.ipynb`. 

## Importing Dependencies

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Tuple

from sklearn.preprocessing import StandardScaler
from joblib import dump, load
from sklearn.model_selection import train_test_split

from cuml.linear_model import LinearRegression, Lasso, ElasticNet
from cuml.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
from joblib import parallel_backend
from ray.util.joblib import register_ray
register_ray()
import ray
from cuml.common.device_selection import set_global_device_type

np.random.seed(42)

### Connecting to a ray cluster

In [2]:
# Select the GPU as cuML's global device type.
set_global_device_type("GPU")
# "auto" attaches to an already-running Ray cluster.
ray.init(address="auto")

2024-08-24 18:25:07,568	INFO worker.py:1596 -- Connecting to existing Ray cluster at address: 10.19.3.211:6379...
2024-08-24 18:25:07,573	INFO worker.py:1772 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.10.12
Ray version:,2.34.0
Dashboard:,http://127.0.0.1:8265


In [3]:
# Both locations live under the same project folder:
#   dataset_dir -> CSV files that are combined into the training dataset
#   save_dir    -> where fitted scalers and models are written
_project_root = Path(r"/home/user1/codes/bangaluru_mobility_codes/submission/future_counts")
dataset_dir = _project_root / "data"
save_dir = _project_root / "model_training" / "results"

In [4]:
# Model names that take the GPU-supported tuning branch of the training loop.
acc_models = {
    "RandomForest",
    "LinearRegression",
    "Lasso",
    "ElasticNet",
    "XGBRegressor",
    "CatBoostingRegressor",
}
# Name of the column the models learn to predict.
target_column = "count"

In [5]:
# Hyper-parameter search spaces, keyed by the model names returned by
# `load_models()`. Each inner mapping is handed to RandomizedSearchCV as its
# parameter distributions, so the order of the candidate values is kept as-is.
params = {
    "RandomForest": {
        "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
        "max_features": ["sqrt", "log2"],
        "n_estimators": [2 ** exp for exp in range(3, 11)],  # 8, 16, ..., 1024
        "bootstrap": [False, True],
    },
    "GradientBoosting": {
        "loss": ["squared_error", "huber", "absolute_error", "quantile"],
        "learning_rate": [0.1, 0.01, 0.05, 0.001],
        "subsample": [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        "criterion": ["squared_error", "friedman_mse"],
        "max_features": ["sqrt", "log2"],
        "n_estimators": [2 ** exp for exp in range(3, 9)],  # 8, 16, ..., 256
    },
    "LinearRegression": {
        "algorithm": ["svd", "eig", "qr", "svd-qr", "svd-jacobi"],
    },
    "XGBRegressor": {
        "learning_rate": np.linspace(0.001, 0.2, num=8),
        "n_estimators": np.arange(8, 512, 50),
        "gamma": np.linspace(0, 5, num=10),
        "reg_alpha": np.logspace(-3, 1, num=10),
        "reg_lambda": np.logspace(-3, 1, num=10),
    },
    "CatBoostingRegressor": {
        "depth": [4, 6, 8, 10],
        "l2_leaf_reg": np.logspace(-2, 1, num=10),
        "learning_rate": np.linspace(0.001, 0.2, num=8),
        "iterations": np.arange(10, 500, 50),
    },
    "AdaBoostRegressor": {
        "learning_rate": np.linspace(0.001, 0.2, num=8),
        "loss": ["linear", "square", "exponential"],
        "n_estimators": np.arange(8, 512, 50),
    },
    "Lasso": {
        "alpha": np.logspace(-8, 8, num=100),
    },
    "ElasticNet": {
        "alpha": np.logspace(-4, 2, num=30),
        "l1_ratio": np.linspace(0, 1, num=30),
        "fit_intercept": [True, False],
    },
}

In [6]:
def load_dateset(path: str) -> pd.DataFrame:
    """
    Read a counts CSV and return it cleaned, down-cast and column-reordered.

    Rows containing any NaN are discarded. The categorical-like columns
    (`class`, `zone_in`, `zone_out`) are stored as int8 and the count columns
    (`count`, `last_15_min_count`, `last_30_min_count`) as int16 to reduce
    memory use. Finally the column at position 3 is moved to the end.

    Parameters:
    path (str): The file path to the CSV file containing the dataset.

    Returns:
    pd.DataFrame: the cleaned frame with columns in positional order
    [0, 1, 2, 4, 5, 3] of the CSV.
    """
    compact_dtypes = {
        "class": np.int8,
        "zone_in": np.int8,
        "zone_out": np.int8,
        "count": np.int16,
        "last_15_min_count": np.int16,
        "last_30_min_count": np.int16,
    }
    frame = pd.read_csv(path).dropna().astype(compact_dtypes)

    # Positional reorder: presumably puts the target (`count`) last, which
    # assumes it is the 4th column of the CSV -- confirm against the data files.
    return frame.iloc[:, [0, 1, 2, 4, 5, 3]]

In [7]:

def load_models() -> dict:
    """
    Build a fresh, unfitted instance of every candidate regressor.

    Returns:
        dict: a dictionary mapping each model name to its model object. The
        names match the keys of the hyper-parameter search spaces, and the
        insertion order is the order in which the models are trained.
    """
    return {
        "ElasticNet": ElasticNet(copy_X=False),
        "XGBRegressor": XGBRegressor(),
        "LinearRegression": LinearRegression(copy_X=False),
        "RandomForest": RandomForestRegressor(),
        "GradientBoosting": GradientBoostingRegressor(),
        # verbose=False silences CatBoost's per-iteration training log.
        "CatBoostingRegressor": CatBoostRegressor(verbose=False),
        "AdaBoostRegressor": AdaBoostRegressor(),
        "Lasso": Lasso(copy_X=False),
    }

In [8]:
def split_dataset(df: pd.DataFrame, cam_id: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Splits the dataset into training and testing sets, scales the input features, and saves the scaler into a pickled file.

    The split is 80/20, shuffled with a fixed seed. The scaler is fitted on the
    training features only and then applied to the test features. It is written
    to `save_dir / "scalers" / f"{cam_id}.pkl"`; the `scalers` directory is
    created if it does not exist yet.

    Parameters:
    -----------
    df (pd.DataFrame): The DataFrame containing the dataset to be split and scaled.
        Must contain the `target_column` column; every other column is treated as an input feature.

    cam_id (str): A unique identifier for the camera or dataset being processed. This ID is used to save the scaler with a specific name.

    Returns:
    --------
    Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
        `X_train, y_train, X_test, y_test`, in that order. The targets are 1-D arrays.
    """
    train_set, test_set = train_test_split(df, test_size=0.20, shuffle=True, random_state=42)

    # Fit on the training features only so no test-set statistics leak into the scaler.
    scaler = StandardScaler()
    input_feature_train_arr = scaler.fit_transform(train_set.drop(columns=[target_column]))
    input_feature_test_arr = scaler.transform(test_set.drop(columns=[target_column]))

    # Stacking features and target into one array gives both a common dtype,
    # so the target is returned in the same dtype as the scaled features.
    train_arr = np.c_[input_feature_train_arr, train_set[[target_column]].to_numpy()]
    test_arr = np.c_[input_feature_test_arr, test_set[[target_column]].to_numpy()]

    scaler_path = save_dir / "scalers" / f"{cam_id}.pkl"
    # Without this, a fresh results directory makes `dump` fail with FileNotFoundError.
    scaler_path.parent.mkdir(parents=True, exist_ok=True)
    dump(scaler, scaler_path)
    print(f"SC saved to {scaler_path}")

    X_train, y_train = train_arr[:, :-1], train_arr[:, -1]
    X_test, y_test = test_arr[:, :-1], test_arr[:, -1]

    return X_train, y_train, X_test, y_test



In [9]:
# Make a dataset for all the camera ids
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined frame (and therefore the seeded train/test split) reproducible.
csv_files = sorted(file for file in os.listdir(dataset_dir) if file.endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {dataset_dir}")

dfs = [load_dateset(dataset_dir / file) for file in csv_files]

df = pd.concat(dfs, axis=0, ignore_index=True)

### Train models

In [11]:
# Training a general model
cam_id = "general"
# Per-model results: name -> (test R2, test MAE, test RMSE, best hyper-parameters)
models_report = {
    cam_id: dict()
}

# Train and test split with scaled features
X_train, y_train, X_test, y_test = split_dataset(df, cam_id)

# Initialize the models
models = load_models()

model_dir = save_dir / "models" / cam_id
os.makedirs(model_dir, exist_ok=True)

for model_name, model in models.items():
    print(f"Now evaluating: {model_name}\n\n")

    para = params[model_name]

    if model_name in acc_models:
        # Hyperparameter tuning for GPU supported models: fewer parallel jobs,
        # each Ray task requesting a small GPU fraction.
        search_kwargs = dict(n_iter=25, n_jobs=10)
        backend_kwargs = dict(ray_remote_args=dict(num_gpus=0.01))
    else:
        # Hyperparameter tuning for non-GPU supported models
        search_kwargs = dict(n_iter=30, n_jobs=-1)
        backend_kwargs = dict()

    # refit=False: the best configuration is fitted explicitly below, so the
    # search does not need to train the winning model a second time.
    gs = RandomizedSearchCV(model, para, cv=5, verbose=4, refit=False, **search_kwargs)
    with parallel_backend('ray', **backend_kwargs):
        gs.fit(X_train, y_train)

    parameters = gs.best_params_

    # Fit the model with best parameters
    model.set_params(**parameters)
    model.fit(X_train, y_train)

    # Save a pickled copy of the model
    dump(model, model_dir / f"{model_name}.pkl")
    print(f"{model_name} saved to {model_dir}")

    # Evaluate the model: R2 on the raw predictions, MAE/RMSE on integer counts
    y_test_pred = model.predict(X_test)
    test_model_score = r2_score(y_test, y_test_pred)

    # NOTE(review): the int cast truncates toward zero rather than rounding --
    # confirm this matches how predictions are turned into counts at inference.
    y_test_pred = np.array(y_test_pred, dtype=int).reshape(-1, 1)
    errors = y_test_pred - y_test.reshape(-1, 1)

    test_mae = np.mean(np.absolute(errors))
    test_rmse = np.sqrt(np.mean(errors**2))

    models_report[cam_id][model_name] = (test_model_score, test_mae, test_rmse, parameters)


# Best = highest R2, ties broken by lower MAE and then lower RMSE.
print("Best model score and parameters:\n")
print(min(models_report[cam_id].items(), key=lambda kv: (-kv[1][0], kv[1][1], kv[1][2])))

print("\n\n\n")




SC saved to /home/user1/codes/bangaluru_mobility_codes/submission/future_counts/model_training/results/scalers/general.pkl
[I] [18:25:30.287608] Unused keyword parameter: copy_X during cuML estimator initialization
[I] [18:25:30.327929] Unused keyword parameter: copy_X during cuML estimator initialization
Now evaluating: ElasticNet


Fitting 5 folds for each of 25 candidates, totalling 125 fits
ElasticNet saved to /home/user1/codes/bangaluru_mobility_codes/submission/future_counts/model_training/results/models/general
Now evaluating: XGBRegressor


Fitting 5 folds for each of 25 candidates, totalling 125 fits
XGBRegressor saved to /home/user1/codes/bangaluru_mobility_codes/submission/future_counts/model_training/results/models/general
Now evaluating: LinearRegression




[36m(PoolActor pid=2537627)[0m [CV 1/5] END alpha=0.12689610031679222, fit_intercept=False, l1_ratio=0.4482758620689655;, score=0.879 total time=   0.3s
[36m(PoolActor pid=2537627)[0m [CV 1/5] END alpha=0.0002592943797404667, fit_intercept=False, l1_ratio=0.41379310344827586;, score=0.885 total time=   0.0s
[36m(PoolActor pid=2537626)[0m [CV 5/5] END alpha=0.07880462815669911, fit_intercept=True, l1_ratio=1.0;, score=0.955 total time=   0.0s
[36m(PoolActor pid=2540208)[0m [CV 1/5] END gamma=2.7777777777777777, learning_rate=0.2, n_estimators=308, reg_alpha=0.00774263682681127, reg_lambda=0.4641588833612777;, score=0.959 total time=   0.5s
[36m(PoolActor pid=2537629)[0m [CV 2/5] END alpha=0.00452035365636024, fit_intercept=False, l1_ratio=0.9655172413793103;, score=0.882 total time=   0.0s[32m [repeated 58x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/co



Fitting 5 folds for each of 5 candidates, totalling 25 fits
LinearRegression saved to /home/user1/codes/bangaluru_mobility_codes/submission/future_counts/model_training/results/models/general
Now evaluating: RandomForest


Fitting 5 folds for each of 25 candidates, totalling 125 fits


[36m(PoolActor pid=2544637)[0m   ret = func(*args, **kwargs)


[36m(PoolActor pid=2544639)[0m [CV 2/5] END bootstrap=False, criterion=friedman_mse, max_features=log2, n_estimators=32;, score=0.933 total time=   2.8s
[36m(PoolActor pid=2543466)[0m [CV 5/5] END .....................algorithm=svd;, score=0.955 total time=   0.1s[32m [repeated 4x across cluster][0m
[36m(PoolActor pid=2543465)[0m [CV 4/5] END ......................algorithm=qr;, score=0.954 total time=   0.0s[32m [repeated 4x across cluster][0m
[36m(PoolActor pid=2543465)[0m [CV 4/5] END ..............algorithm=svd-jacobi;, score=0.954 total time=   0.0s[32m [repeated 4x across cluster][0m
[36m(PoolActor pid=2543470)[0m [CV 4/5] END .....................algorithm=eig;, score=0.954 total time=   0.2s[32m [repeated 4x across cluster][0m
[36m(PoolActor pid=2543470)[0m [CV 4/5] END ..................algorithm=svd-qr;, score=0.954 total time=   0.0s[32m [repeated 4x across cluster][0m
[36m(PoolActor pid=2544637)[0m [CV 5/5] END bootstrap=True, criterion=absolute_erro

  ret = func(*args, **kwargs)


[36m(PoolActor pid=2544637)[0m Exception ignored in: <cyfunction RandomForestRegressor.__del__ at 0x7d46399ab850>
[36m(PoolActor pid=2544637)[0m Traceback (most recent call last):
[36m(PoolActor pid=2544637)[0m   File "randomforestregressor.pyx", line 321, in cuml.ensemble.randomforestregressor.RandomForestRegressor.__del__
[36m(PoolActor pid=2544637)[0m   File "randomforestregressor.pyx", line 337, in cuml.ensemble.randomforestregressor.RandomForestRegressor._reset_forest_data
[36m(PoolActor pid=2544637)[0m AttributeError: 'NoneType' object has no attribute 'free_treelite_model'
[36m(PoolActor pid=2544640)[0m   ret = func(*args, **kwargs)[32m [repeated 9x across cluster][0m


RandomForest saved to /home/user1/codes/bangaluru_mobility_codes/submission/future_counts/model_training/results/models/general
Now evaluating: GradientBoosting


Fitting 5 folds for each of 30 candidates, totalling 150 fits
GradientBoosting saved to /home/user1/codes/bangaluru_mobility_codes/submission/future_counts/model_training/results/models/general
Now evaluating: CatBoostingRegressor


Fitting 5 folds for each of 25 candidates, totalling 125 fits
CatBoostingRegressor saved to /home/user1/codes/bangaluru_mobility_codes/submission/future_counts/model_training/results/models/general
Now evaluating: AdaBoostRegressor


Fitting 5 folds for each of 30 candidates, totalling 150 fits
AdaBoostRegressor saved to /home/user1/codes/bangaluru_mobility_codes/submission/future_counts/model_training/results/models/general
Now evaluating: Lasso


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Lasso saved to /home/user1/codes/bangaluru_mobility_codes/submission/future_counts/model_

[36m(PoolActor pid=2549103)[0m [CV 1/5] END criterion=squared_error, learning_rate=0.001, loss=absolute_error, max_features=log2, n_estimators=8, subsample=0.8;, score=-0.076 total time=   0.3s
[36m(PoolActor pid=2544645)[0m [CV 3/5] END bootstrap=False, criterion=poisson, max_features=log2, n_estimators=1024;, score=0.934 total time=  43.4s[32m [repeated 3x across cluster][0m
[36m(PoolActor pid=2548418)[0m [CV 1/5] END criterion=squared_error, learning_rate=0.1, loss=absolute_error, max_features=sqrt, n_estimators=8, subsample=0.6;, score=-0.076 total time=   0.5s
[36m(PoolActor pid=2548435)[0m [CV 5/5] END criterion=friedman_mse, learning_rate=0.001, loss=absolute_error, max_features=log2, n_estimators=32, subsample=0.6;, score=-0.076 total time=   0.9s
[36m(PoolActor pid=2548426)[0m [CV 1/5] END criterion=squared_error, learning_rate=0.01, loss=squared_error, max_features=log2, n_estimators=8, subsample=0.8;, score=0.124 total time=   0.3s
[36m(PoolActor pid=2548436)[0