In [18]:
import pandas as pd
# Load the Auto MPG CSV from the notebook's working directory.
df = pd.read_csv("./auto-mpg.csv")
# Preview the first rows (head() defaults to five).
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [24]:
!pip install mlflow





In [19]:
# Per-column count of missing values (isnull is an alias of isna).
print(df.isnull().sum(axis=0))

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64


In [12]:
# Dataset dimensions: one row per record, one column per field.
row = len(df.index)
col = len(df.columns)
print("number of records", row)
print("Number of columns", col)

number of records 398
Number of columns 9


In [13]:
# Count of missing (NaN) values in each column.
print("Missing values per column")
print(df.isna().sum())

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
print("Data type of each field")
df.info()


Missing values per column
mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64
Data type of each field
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB
None


In [28]:
import numpy as np
import pandas as pd

# Replace '?' placeholders with NaN so they are treated as missing values.
# The result is rebound to `df` instead of using inplace=True: the outcome is
# the same, but the statement stays chainable and avoids in-place mutation of
# a frame that earlier cells have already displayed.
df = df.replace("?", np.nan)


In [29]:
# Coerce horsepower to numeric (unparseable entries become NaN), then fill the
# gaps with the median of the coerced column.
df['horsepower'] = (
    pd.to_numeric(df['horsepower'], errors='coerce')
    .pipe(lambda hp: hp.fillna(hp.median()))
)

# Confirm the resulting dtype and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 28.1+ KB


In [30]:
# Modelling frame: everything except the "car name" column. `df` itself is
# left unchanged because drop() returns a new frame.
data = df.drop(columns=["car name"])
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1
394,44.0,4,97.0,52.0,2130,24.6,82,2
395,32.0,4,135.0,84.0,2295,11.6,82,1
396,28.0,4,120.0,79.0,2625,18.6,82,1


In [31]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Features and target: predict `mpg` from every other column in `data`.
X = data.drop('mpg', axis=1)
y = data['mpg']

# Preprocessing
numeric_features = ['displacement', 'horsepower', 'weight', 'acceleration']
# Column names must match the DataFrame exactly: the year column is named
# 'model year' (with a space), not 'model_year'.
categorical_features = ['cylinders', 'model year', 'origin']

# Fail fast on a misspelled column instead of erroring later, when the
# transformer is first fitted.
missing_columns = set(numeric_features + categorical_features) - set(X.columns)
assert not missing_columns, f"columns not found in X: {sorted(missing_columns)}"

# Scale the continuous columns and one-hot encode the discrete ones
# (drop='first' removes one level per categorical column).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Train-test split: 80/20 with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# Candidate regressors, in evaluation order. Each key is the label passed as
# the MLflow run name; estimators that accept a seed are given random_state=42.
models = dict([
    ('LinearRegression', LinearRegression()),
    ('RandomForest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('GradientBoosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('SVR', SVR()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('KNeighbors', KNeighborsRegressor()),
])



In [33]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

mlflow.set_experiment("AutoMPG_Regression")

# Running record of the candidate with the lowest test-set RMSE so far.
best_rmse = float('inf')
best_model_name = best_model = best_run_id = None

# NOTE(review): each estimator is fit directly on X_train; the `preprocessor`
# built earlier and the imported Pipeline are not applied in this loop.
for name, model in models.items():
    # One MLflow run per candidate, labelled with the candidate's key.
    with mlflow.start_run(run_name=name) as run:
        run_id = run.info.run_id

        # Fit on the training split, score on the held-out split.
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        mae = mean_absolute_error(y_test, preds)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        print(f"{name} - RMSE: {rmse:.3f}, MAE: {mae:.3f}")

        # Record the label, both metrics and the fitted estimator on the run.
        mlflow.log_param("model_name", name)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(model, name="model")

        # Strict '<' keeps the earliest candidate on ties.
        if rmse < best_rmse:
            best_rmse, best_model_name = rmse, name
            best_model, best_run_id = model, run_id

LinearRegression - RMSE: 2.863, MAE: 2.255




RandomForest - RMSE: 2.141, MAE: 1.577




GradientBoosting - RMSE: 2.347, MAE: 1.748




SVR - RMSE: 3.706, MAE: 2.791




Decision Tree - RMSE: 3.337, MAE: 2.223




KNeighbors - RMSE: 3.547, MAE: 2.769




In [36]:
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

import os

# Tracking server: honour MLFLOW_TRACKING_URI when it is set, otherwise fall
# back to the local server address this notebook was written against.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"))
mlflow.set_experiment("AutoMPG_Regression")

# Running record of the candidate with the lowest test-set RMSE so far.
best_rmse = float('inf')
best_model_name = None
best_model = None
best_run_id = None

# NOTE(review): each estimator is fit directly on X_train; the `preprocessor`
# built earlier and the imported Pipeline are not applied in this loop.
for name, model in models.items():
    # One MLflow run per candidate, labelled with the candidate's key.
    with mlflow.start_run(run_name=name) as run:
        run_id = run.info.run_id

        # Fit on the training split, predict on the held-out split.
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        # Metrics
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        mae = mean_absolute_error(y_test, preds)

        print(f"{name} - RMSE: {rmse:.3f}, MAE: {mae:.3f}")

        # Log parameters & metrics
        mlflow.log_param("model_name", name)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)

        # Infer the model signature from the training inputs and the model's
        # predictions on them.
        signature = infer_signature(X_train, model.predict(X_train))

        # Log the model under the name "model", using the same `name=` keyword
        # as the earlier training cell so both cells call the API the same way.
        mlflow.sklearn.log_model(
            sk_model=model,
            name="model",
            signature=signature
        )

        # Track best (strict '<' keeps the earliest candidate on ties).
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_name = name
            best_model = model
            best_run_id = run_id

# ---------- Register Best Model ----------
from mlflow.tracking import MlflowClient
client = MlflowClient()

# Without at least one completed run there is nothing to register; stop with a
# clear message instead of building a "runs:/None/model" URI.
if best_run_id is None:
    raise RuntimeError("No model run was recorded; nothing to register.")

model_uri = f"runs:/{best_run_id}/model"
model_name = "AutoMPG_BestModel"

model_version = mlflow.register_model(model_uri, model_name)
print(f"Best model ({best_model_name}) registered as {model_name}, version {model_version.version}")


2025/08/13 10:00:39 INFO mlflow.tracking.fluent: Experiment with name 'AutoMPG_Regression' does not exist. Creating a new experiment.


LinearRegression - RMSE: 2.863, MAE: 2.255




🏃 View run LinearRegression at: http://localhost:5000/#/experiments/2/runs/8696c3a0fb4c4c21b4d6e3fe64b07d48
🧪 View experiment at: http://localhost:5000/#/experiments/2
RandomForest - RMSE: 2.141, MAE: 1.577




🏃 View run RandomForest at: http://localhost:5000/#/experiments/2/runs/f5aae456a5824495a1784416219120a6
🧪 View experiment at: http://localhost:5000/#/experiments/2




GradientBoosting - RMSE: 2.347, MAE: 1.748




🏃 View run GradientBoosting at: http://localhost:5000/#/experiments/2/runs/3fbf1980544f4975ad94eb3813b25e04
🧪 View experiment at: http://localhost:5000/#/experiments/2
SVR - RMSE: 3.706, MAE: 2.791




🏃 View run SVR at: http://localhost:5000/#/experiments/2/runs/1e59e606fd1240eba1a49f60376d6bad
🧪 View experiment at: http://localhost:5000/#/experiments/2
Decision Tree - RMSE: 3.337, MAE: 2.223




🏃 View run Decision Tree at: http://localhost:5000/#/experiments/2/runs/34122403be2c470faf6c4accf56d354f
🧪 View experiment at: http://localhost:5000/#/experiments/2
KNeighbors - RMSE: 3.547, MAE: 2.769


Successfully registered model 'AutoMPG_BestModel'.


🏃 View run KNeighbors at: http://localhost:5000/#/experiments/2/runs/da3dca29db8c44688c3050a6a8f3e0d3
🧪 View experiment at: http://localhost:5000/#/experiments/2


2025/08/13 10:01:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: AutoMPG_BestModel, version 1
Created version '1' of model 'AutoMPG_BestModel'.


Best model (RandomForest) registered as AutoMPG_BestModel, version 1


In [39]:
import pickle

# Function to save model locally
def save_model_locally(model, filename):
    """Pickle ``model`` to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(model)

# Keep the output path in one place so the saved file and the confirmation
# message cannot drift apart.
# NOTE(review): the "lr" prefix is kept unchanged in case other code reads this
# file; the object saved is whichever candidate was selected as `best_model`.
model_path = "lrmodel.bin"
save_model_locally(best_model, model_path)
print(f"Best model saved locally as {model_path}")


Best model saved locally as lrmodel.bin
