In [1]:
# !pip install mlflow
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing


housing = fetch_california_housing()
print(housing)

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]]), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset:\n\nCalifornia Housing dataset\n-

In [4]:
#data prep
df = pd.DataFrame(housing.data,columns = housing.feature_names)
df['Price'] = housing.target
df.head

<bound method NDFrame.head of        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Lo

Train Test split, Model HyperParameter Tuning, MLFlow Experiments

In [7]:
from urllib.parse import urlparse
#divide data into independent and dependent features

X = df.drop(columns = ['Price'])
y = df['Price']

#split data into train test split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.20)
from mlflow.models import infer_signature
signature = infer_signature(X_train,y_train)

#Define hyper parameter grid
param_grid = {
    'n_estimators': [100,200],
    'max_depth': [5,10,None],
    'min_samples_split': [2,5],
    'min_samples_leaf': [1,2]
}


def hyperparameter_tuning(X_train,y_train, param_grid):
    rf = RandomForestRegressor()
    grid_search = GridSearchCV(estimator=rf,param_grid=param_grid,cv=3,n_jobs=-1,verbose=2, scoring='neg_mean_squared_error')
    grid_search.fit(X_train,y_train)
    return grid_search

with mlflow.start_run():
    # hyper parameter tuning
    grid_search = hyperparameter_tuning(X_train,y_train,param_grid)
    #Get the best model 
    best_model = grid_search.best_estimator_
    #eval the best model
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)

    #Log best params and metrics 
    mlflow.log_param('best_n_estimators', grid_search.best_params_['n_estimators'])
    mlflow.log_param('best_max_depth', grid_search.best_params_['max_depth'])
    mlflow.log_param('best_min_samples_split', grid_search.best_params_['min_samples_split'])
    mlflow.log_param('best_min_samples_leaf', grid_search.best_params_['min_samples_leaf'])
    mlflow.log_metric('mse',mse)
    mlflow.set_tracking_uri(uri='http://localhost:5000')
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_url_type_store !='file':
        mlflow.sklearn.log_model(best_model, "model",registered_model_name='Best RandomForest Model')
    else:
        mlflow.sklearn.log_model(best_model, 'model',signature=signature)

    print(f'Best HyperParameters :{grid_search.best_params_}')
    print(f'Mean Squared Error :{mse}')

Fitting 3 folds for each of 24 candidates, totalling 72 fits


Registered model 'Best RandomForest Model' already exists. Creating a new version of this model...
2025/12/19 22:47:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best RandomForest Model, version 2
Created version '2' of model 'Best RandomForest Model'.


Best HyperParameters :{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mean Squared Error :0.25475087899336774
üèÉ View run beautiful-hog-70 at: http://localhost:5000/#/experiments/0/runs/9a8b073baadc49278f3c6413ec6f9aee
üß™ View experiment at: http://localhost:5000/#/experiments/0
