In [5]:
from typing import Dict

In [17]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import joblib
import mlflow

In [11]:
def rmsle(y_true, y_pred) -> float:
    """Root mean squared logarithmic error between targets and predictions.

    Both inputs are mapped through ``log1p`` before the squared differences
    are averaged, so the metric penalises relative rather than absolute error.

    Args:
        y_true: Array-like of ground-truth values (each must be > -1).
        y_pred: Array-like of predicted values, same shape as ``y_true``
            (each must be > -1).

    Returns:
        The RMSLE as a plain Python float.
    """
    log_true = np.log1p(np.asarray(y_true, dtype=float))
    log_pred = np.log1p(np.asarray(y_pred, dtype=float))
    # Cast so callers get a built-in float, matching the annotation.
    return float(np.sqrt(np.mean((log_pred - log_true) ** 2)))

def rmse(y_true, y_pred) -> float:
    """Root mean squared error between targets and predictions.

    Args:
        y_true: Ground-truth values, passed straight to ``mean_squared_error``.
        y_pred: Predicted values, passed straight to ``mean_squared_error``.

    Returns:
        The square root of the mean squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Wrap rmse as a scikit-learn scorer. greater_is_better=False flips the sign so
# that a smaller error ranks higher; grid_search negates best_score_ to undo it.
rmse_score = make_scorer(rmse, greater_is_better=False)

In [23]:
def add_key_prefix(d: Dict, prefix = 'best_') -> Dict:
    """Return a copy of ``d`` in which every key has ``prefix`` prepended.

    Args:
        d: Mapping whose keys can be concatenated onto ``prefix`` with ``+``.
        prefix: Text placed in front of each key.

    Returns:
        A new dict with the renamed keys and the original values, in the same
        iteration order as ``d``.
    """
    prefixed = {}
    for name, val in d.items():
        prefixed[prefix + name] = val
    return prefixed

In [31]:
def grid_search(ds: pd.DataFrame, random_state=None):
    """Grid-search a random forest on ``ds`` and record the result in MLflow.

    The target is ``log1p(ds['meter_reading'])``; features are every column
    after the first. 80% of the rows go to a 3-fold cross-validated grid
    search scored with ``rmse_score``; the remaining 20% are used to score the
    refitted best model. The best estimator is written to ``out/model.sav``
    and logged, together with the grid, the best parameters and both RMSE
    values, to a run in the ``baseline`` MLflow experiment.

    Args:
        ds: Frame containing a ``meter_reading`` column plus feature columns.
        random_state: Optional seed forwarded to ``train_test_split`` and
            ``RandomForestRegressor``. The default ``None`` leaves both
            unseeded.

    Returns:
        None. Results are available through MLflow and ``out/model.sav``.
    """
    # Function-scope import so this cell does not depend on an edit to the
    # notebook's import cell.
    import os

    # The target is modelled on the log1p scale.
    y = np.log1p(ds['meter_reading'])
    # NOTE(review): this takes every column after the first as a feature, so it
    # assumes 'meter_reading' is column 0. If it sits elsewhere the target
    # leaks into X -- confirm against the CSV layout.
    X = ds.iloc[:, 1:].values

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=random_state,
    )

    param_grid = dict(
        n_estimators=[20, 40, 60, 80, 100],
        max_depth=[4, 8, 12, None],
        # 1.0 means "all features", which is what 'auto' meant for a forest
        # regressor; newer scikit-learn releases reject the 'auto' spelling.
        max_features=[1.0, 'sqrt'],
    )

    # One path for both the dump and the artifact upload; create the directory
    # up front so joblib.dump cannot fail on a fresh checkout.
    model_path = 'out/model.sav'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    mlflow.set_experiment('baseline')
    # The context manager closes the run on exit, including on error, so no
    # explicit end_run() call is needed.
    with mlflow.start_run():

        mlflow.log_params(param_grid)

        regressor = GridSearchCV(
            RandomForestRegressor(random_state=random_state),
            param_grid=param_grid,
            cv=3,
            scoring=rmse_score,
            verbose=2,
            refit=True,
        )

        regressor.fit(X_train, y_train)

        best_model = regressor.best_estimator_
        best_param = add_key_prefix(regressor.best_params_)
        # rmse_score is negated (greater_is_better=False); flip it back.
        best_rmse = -regressor.best_score_
        # Score the refitted model on the rows held out of the search.
        val_rmse = float(rmse(y_val, best_model.predict(X_val)))

        joblib.dump(best_model, model_path)

        mlflow.log_params(best_param)
        mlflow.log_metrics(dict(
            rmse=best_rmse,
            val_rmse=val_rmse,
        ))
        mlflow.log_artifact(model_path)

In [14]:
# Training data, read from the current working directory. grid_search takes its
# 'meter_reading' column as the target and every column after the first as features.
dataset_train = pd.read_csv('dataset_train.csv')

In [28]:
# Run the search on a random 1% sample of the rows. The sample is unseeded, so
# each execution draws a different subset.
# NOTE(review): the output saved below reports 8 candidates (max_depth=2,
# n_estimators=10/20), which does not match the 40-combination grid currently
# defined in grid_search -- it looks stale; re-run to confirm.
grid_search(dataset_train.sample(frac=0.01))

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] max_depth=2, max_features=auto, n_estimators=10 .................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .. max_depth=2, max_features=auto, n_estimators=10, total=   2.6s
[CV] max_depth=2, max_features=auto, n_estimators=10 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s remaining:    0.0s


[CV] .. max_depth=2, max_features=auto, n_estimators=10, total=   2.0s
[CV] max_depth=2, max_features=auto, n_estimators=10 .................
[CV] .. max_depth=2, max_features=auto, n_estimators=10, total=   2.1s
[CV] max_depth=2, max_features=auto, n_estimators=20 .................
[CV] .. max_depth=2, max_features=auto, n_estimators=20, total=   3.3s
[CV] max_depth=2, max_features=auto, n_estimators=20 .................
[CV] .. max_depth=2, max_features=auto, n_estimators=20, total=   3.2s
[CV] max_depth=2, max_features=auto, n_estimators=20 .................
[CV] .. max_depth=2, max_features=auto, n_estimators=20, total=   3.3s
[CV] max_depth=2, max_features=sqrt, n_estimators=10 .................
[CV] .. max_depth=2, max_features=sqrt, n_estimators=10, total=   0.8s
[CV] max_depth=2, max_features=sqrt, n_estimators=10 .................
[CV] .. max_depth=2, max_features=sqrt, n_estimators=10, total=   0.9s
[CV] max_depth=2, max_features=sqrt, n_estimators=10 .................
[CV] .

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  1.0min finished
