In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

import mlflow
from mlflow.models import infer_signature
import mlflow.sklearn
import mlflow.pyfunc

# Suppress some common warnings for cleaner output
warnings.filterwarnings("ignore")


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# ========== 1. DATA LOADING AND PREPARATION ==========
# Define URL and column names for the Auto-MPG dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = [
    "mpg", "cylinders", "displacement", "horsepower",
    "weight", "acceleration", "model_year", "origin", "car_name"
]

# Load the dataset. '?' is a placeholder for missing values.
df = pd.read_csv(url, delim_whitespace=True, names=column_names, na_values="?")

# Drop the 'car_name' column as it's not useful for the model
df = df.drop(columns=['car_name'])

# Define features (X) and target (y)
X = df.drop(columns=['mpg'])
y = df['mpg']

# ========== 2. PREPROCESSING PIPELINE ==========
# This section creates a reproducible data preprocessing pipeline.
# It's more robust than manual steps as it can be saved with the model.

# Define which columns are numerical and which are categorical
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Create a transformer for numerical features
# It will fill missing values with the median and then scale the data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Create a transformer for categorical features
# It will use one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the transformers into a single preprocessor using ColumnTransformer
# This applies the correct transformation to each column type
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'  # Drop any columns not specified
)

# ========== 3. SPLIT DATA ==========
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ========== 4. MODELS TO TRAIN ==========
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "RandomForest": RandomForestRegressor(n_estimators=30, random_state=42),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(n_neighbors=5)
}

# ========== 5. MLflow SETUP ==========
# Set the MLflow tracking server URI and experiment name
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("AutoMPG_Model_Training_v3_Refined")

best_rmse = float('inf')
best_model_uri = ""
registered_model_name = "BestAutoMPGModel"

# ========== 6. TRAIN & TRACK MODELS ==========
print("Starting MLflow experiment to train models...")
for name, model in models.items():
    with mlflow.start_run(run_name=name) as run:
        # Create a full pipeline that includes the preprocessor and the model
        full_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])

        # Train the full pipeline with the raw data. The pipeline handles all preprocessing steps.
        full_pipeline.fit(X_train, y_train)
        y_pred = full_pipeline.predict(X_test)

        # Calculate metrics 
        rmse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Log parameters and metrics to MLflow
        mlflow.log_param("model_type", name)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2_score", r2)
        mlflow.log_metric("mape", mape)

        # Infer and log the model signature for the full pipeline
        signature = infer_signature(X_train, full_pipeline.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=full_pipeline,
            artifact_path="model",
            input_example=X_train.head(5),
            signature=signature
        )

        print(f"{name} run completed with RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}, MAPE: {mape:.2f}")

        # Check if this is the best model so far
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_uri = f"runs:/{run.info.run_id}/model"
            print(f"  New best model found: {name}")

# ========== 7. REGISTER THE BEST MODEL ==========
if best_model_uri:
    print(f"\nRegistering the best model with RMSE: {best_rmse:.2f}")
    result = mlflow.register_model(
        model_uri=best_model_uri,
        name=registered_model_name
    )
    print(f"Registered model: {result.name} version {result.version}")
else:
    print("\nNo models were trained, cannot register.")

Starting MLflow experiment to train models...




LinearRegression run completed with RMSE: 8.20, MAE: 2.26, R2: 0.85, MAPE: 0.11
  New best model found: LinearRegression
🏃 View run LinearRegression at: http://localhost:5000/#/experiments/4/runs/9e546da64a6d4f19a78703599b0a3082
🧪 View experiment at: http://localhost:5000/#/experiments/4




Ridge run completed with RMSE: 8.21, MAE: 2.25, R2: 0.85, MAPE: 0.11
🏃 View run Ridge at: http://localhost:5000/#/experiments/4/runs/cc46a27ad47c4e1ba12e14cffbe13b87
🧪 View experiment at: http://localhost:5000/#/experiments/4




RandomForest run completed with RMSE: 5.02, MAE: 1.61, R2: 0.91, MAPE: 0.07
  New best model found: RandomForest
🏃 View run RandomForest at: http://localhost:5000/#/experiments/4/runs/6071b34f94b64571a52b6d402b67226e
🧪 View experiment at: http://localhost:5000/#/experiments/4




SVR run completed with RMSE: 6.80, MAE: 1.81, R2: 0.87, MAPE: 0.08
🏃 View run SVR at: http://localhost:5000/#/experiments/4/runs/f99b40a1377c4824accd10d48f3f0b04
🧪 View experiment at: http://localhost:5000/#/experiments/4


Registered model 'BestAutoMPGModel' already exists. Creating a new version of this model...


KNN run completed with RMSE: 5.40, MAE: 1.89, R2: 0.90, MAPE: 0.08
🏃 View run KNN at: http://localhost:5000/#/experiments/4/runs/7e914f9ce60d443ca8d88f1ad5a6cff7
🧪 View experiment at: http://localhost:5000/#/experiments/4

Registering the best model with RMSE: 5.02


2025/08/12 19:24:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: BestAutoMPGModel, version 12
Created version '12' of model 'BestAutoMPGModel'.


Registered model: BestAutoMPGModel version 12


In [3]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")

reg_model_name = result.name
version = result.version

# Fix: Use the variable 'version' not the string "version"
model_uri = f"models:/{reg_model_name}/{version}"

loaded_model = mlflow.sklearn.load_model(model_uri)
print("Loaded Model", loaded_model)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Loaded Model Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['cylinders', 'displacement',
                                                   'horsepower', 'weight',
                                                   'acceleration', 'model_year',
                                                   'origin']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
         

In [6]:
import joblib

# ========== 8. SAVE BEST MODEL LOCALLY ==========
if 'loaded_model' in locals():
    joblib.dump(loaded_model, 'bestAutoMPGModel.pkl')
    print("Best model pipeline saved as 'bestAutoMPGModel.pkl'")
else:
    print("No best model found to save.")


Best model pipeline saved as 'bestAutoMPGModel.pkl'


In [7]:
loaded_model.predict(X_test)

array([31.13333333, 29.95666667, 20.07      , 15.        , 14.28333333,
       25.16666667, 25.73      , 12.06666667, 18.16333333, 19.5       ,
       13.73333333, 33.88666667, 27.48333333, 14.73333333, 25.16666667,
       12.2       , 30.23      , 19.41333333, 15.44666667, 35.68      ,
       23.87      , 19.13333333, 31.51333333, 28.71666667, 16.27333333,
       38.38      , 25.36666667, 24.54666667, 19.77666667, 12.33333333,
       27.25666667, 34.21      , 18.01666667, 24.40333333, 37.80666667,
       13.6       , 21.92      , 18.51      , 14.31666667, 26.06333333,
       26.08      , 28.15      , 20.9       , 11.23333333, 23.2       ,
       34.98333333, 26.24666667, 22.62      , 24.46333333, 26.11333333,
       23.16666667, 33.6       , 33.25666667, 12.26666667, 27.10666667,
       13.03333333, 16.53      , 29.92      , 23.65666667, 19.51      ,
       14.13333333, 30.95      , 24.45333333, 20.03333333, 19.06666667,
       25.54333333, 22.85      , 34.81666667, 26.21666667, 14.45