In [None]:
import pandas as pd
import numpy as np
import mlflow
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import cross_val_score, KFold, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns   
import warnings
import pickle

# NOTE: with no category/module filter this silences every warning raised for the rest of
# the kernel session, not just a specific library's.
warnings.filterwarnings('ignore')

In [2]:
# Read the processed train and test splits (train first, then test)
train_df, test_df = map(
    pd.read_parquet,
    ('../data/processed/train.parquet', '../data/processed/test.parquet'),
)


In [3]:
# create sklearn pipeline with pre-processing of numerical and categorical features
def pipeline(train_df, target_col="resale_price", model=None):
    """Build an unfitted preprocessing + regression pipeline for ``train_df``.

    Feature columns are inferred from the dtypes of ``train_df`` after the
    target column has been removed:

    * numeric features: ``int64`` / ``float64`` columns (standard-scaled)
    * categorical features: ``object`` columns (one-hot encoded, unknown
      categories ignored at predict time)

    Columns of any other dtype are not passed to the ``ColumnTransformer`` and
    are therefore dropped by its default ``remainder='drop'``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; only its column names and dtypes are inspected here.
    target_col : str, default "resale_price"
        Target column, excluded from both feature lists.
    model : estimator, optional
        Final estimator of the pipeline. Defaults to a fresh
        ``LinearRegression()`` when not given.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and ``'model'``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``train_df``.
    """
    # Drop the target once so it can never leak into either feature list,
    # whatever its dtype is.
    feature_df = train_df.drop(columns=[target_col])
    numeric_features = feature_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = feature_df.select_dtypes(include=['object']).columns.tolist()
    print(f"Numeric features: {numeric_features}")
    print(f"Categorical features: {categorical_features}")

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    # Create the default estimator per call so separate pipelines never share one instance.
    final_estimator = LinearRegression() if model is None else model

    # Named differently from the function so the local does not shadow `pipeline` itself.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', final_estimator)
    ])
    return model_pipeline

In [4]:
# train model
def train_model(pipeline, train_df, target_col="resale_price", n_splits=5):
    """Cross-validate ``pipeline`` on ``train_df``, then fit it on all of ``train_df``.

    The cross-validated RMSE (mean ± standard deviation over the folds) is
    printed; the scores themselves are not returned.

    Parameters
    ----------
    pipeline : estimator
        Unfitted scikit-learn estimator/pipeline. It is fitted in place.
    train_df : pandas.DataFrame
        Training data containing the feature columns and ``target_col``.
    target_col : str, default "resale_price"
        Name of the target column.
    n_splits : int, default 5
        Number of ``TimeSeriesSplit`` folds.

    Returns
    -------
    estimator
        The same ``pipeline`` object, fitted on the full training data.
    """
    print("Training model")
    X = train_df.drop(columns=[target_col])
    y = train_df[target_col]

    # TimeSeriesSplit validates each fold on rows positioned after its training rows.
    # NOTE(review): this is only a real temporal split if train_df is already sorted
    # chronologically — confirm against the data preparation step.
    time_cv = TimeSeriesSplit(n_splits=n_splits)
    scores = cross_val_score(pipeline, X, y, cv=time_cv, scoring='neg_root_mean_squared_error',
                             verbose=1)

    # The scorer returns negated RMSE, so flip the sign of the mean for reporting.
    print(f"Cross-validated RMSE: {-scores.mean():.2f} ± {scores.std():.2f}")

    # Final fit on the full training set; this fitted object is what gets returned.
    pipeline.fit(X, y)

    return pipeline

In [None]:
# Point MLflow at the tracking server, select the experiment, and enable autologging.
# The tracking URI is set first so that set_experiment() looks up / creates the
# experiment on this server rather than on whatever backend was active before.
mlflow.set_tracking_uri("http://127.0.0.1:8080/")
# NOTE(review): the experiment is named "Bike ..." while the registered model is an HDB
# resale-price model — confirm the experiment name is intended.
mlflow.set_experiment("Bike Resale Price Prediction")
# Autologging must be enabled before any estimator is fitted for that fit to be captured.
mlflow.autolog()



2025/08/25 17:22:12 INFO mlflow.tracking.fluent: Experiment with name 'Bike Resale Price Prediction' does not exist. Creating a new experiment.
2025/08/25 17:22:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [8]:
# Train the model inside an MLflow run.
# Autologging is switched on BEFORE the run starts: enabling it after train_model() has
# returned (as before) is too late for the fit in this run to be logged on a fresh kernel.
mlflow.autolog()
with mlflow.start_run() as run:
    trained_pipeline = train_model(pipeline=pipeline(train_df), train_df=train_df)


Numeric features: ['flat_age_years', 'floor_area_sqm', 'days_from_earliest_data']
Categorical features: ['town', 'flat_type', 'flat_model_revised']
Training model


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.1s finished


Cross-validated RMSE: 86993.16 ± 24371.27


2025/08/25 17:25:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


🏃 View run handsome-yak-749 at: http://127.0.0.1:8080/#/experiments/738064824697976650/runs/d15a20a4efcc4a22aa5b44f03bd19f99
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/738064824697976650


In [None]:
# TODO: evaluate the final model on the held-out test set (skipped for now)

In [None]:
# To view the MLflow dashboard, run `mlflow server --host 127.0.0.1 --port 8080` in a terminal, then open http://127.0.0.1:8080

In [None]:
# Register the model logged by the training run in the MLflow Model Registry.
# The URI is built from the `run` object captured by `mlflow.start_run() as run`, so the
# cell keeps working after Restart & Run All instead of pointing at a stale, hard-coded run id.
mlflow.register_model(
    model_uri=f"runs:/{run.info.run_id}/model",  # "model" is the artifact path under that run
    name="HDBResalePricePrediction"
)

Successfully registered model 'HDBResalePricePrediction'.
2025/08/25 17:31:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: HDBResalePricePrediction, version 1
Created version '1' of model 'HDBResalePricePrediction'.


<ModelVersion: aliases=[], creation_timestamp=1756114277226, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1756114277226, metrics=None, model_id=None, name='HDBResalePricePrediction', params=None, run_id='d15a20a4efcc4a22aa5b44f03bd19f99', run_link='', source='models:/m-14650b173852448b9bf9f35229dd184a', status='READY', status_message=None, tags={}, user_id='', version='1'>

In [None]:
# Reload the model logged by the training run and export it for deployment.
# The run id comes from the `run` object of the training cell rather than a hard-coded
# value, so a re-run exports the model that was just trained.
model_uri = f"runs:/{run.info.run_id}/model"
loaded_model = mlflow.sklearn.load_model(model_uri)

# Smoke test: predict on the first five test rows, with the target column removed.
print("Predictions: ", loaded_model.predict(test_df.drop(columns=['resale_price']).iloc[:5]))

# Serialize the fitted pipeline; the ../models directory must already exist.
with open('../models/champion_model.pkl', 'wb') as f:
    pickle.dump(loaded_model, f)

Predictions:  [224093.93676205 234874.77441391 385558.81030504 362062.89839062
 412411.28106437]
