In [None]:
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
import os

### Data Source: https://archive.ics.uci.edu/ml/datasets/Auto+MPG

In [None]:

# reading the dataset
#
# The file is read line by line with plain Python rather than
# pd.read_csv(sep='\n'): recent pandas releases reject a line terminator as
# the separator. Blank lines are skipped, as read_csv would do by default.
with open('data/auto-mpg.data', encoding='utf-8') as data_file:
    raw_lines = [line for line in data_file.read().splitlines() if line.strip()]

# one row per line, held in a single column labelled 0
df = pd.DataFrame({0: raw_lines})

# everything before the first tab stays in column 0; the remainder is the car name
# (n=1 keeps the result at two columns even if a name itself contains a tab)
df[[0, 'car_name']] = df[0].str.split('\t', n=1, expand=True)

df.head()


In [None]:

# labels for the whitespace-separated fields packed into column 0
columns = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
    'origin',
]

# explode the packed string into one column per field
split_fields = df[0].str.split(expand=True)
df[columns] = split_fields

# the packed source column is redundant once it has been split
del df[0]

# strip the double quotes surrounding each car name
df['car_name'] = df['car_name'].map(lambda name: name.replace('"', ''))

df.head()

In [None]:
# converting numeric columns to float type
#
# Every column except the free-text car name is parsed as a number. This
# includes the target `mpg`, which would otherwise stay as strings. Values that
# cannot be parsed become NaN, and any row containing one is dropped.
numeric_cols = [col for col in df.columns if col != 'car_name']
numeric_values = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
valid_rows = numeric_values.notna().all(axis=1)
clean_numeric = numeric_values.loc[valid_rows].astype(float)

# work on an explicit copy so the assignments below never write into a
# filtered view of the original frame
df = df.loc[valid_rows].copy()
for col in numeric_cols:
    df[col] = clean_numeric[col]

# separating dependent and independent variables
X = df[['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin']]
y = df['mpg']

In [None]:
# partition features and target into training and held-out test subsets;
# the fixed random_state keeps the partition reproducible between runs
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
def eval_metrics(actual, pred):
    """Score a set of predictions against the true target values.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(rmse, mae)``: the square root of the mean squared error and the
        mean absolute error.
    """
    squared_error = mean_squared_error(actual, pred)
    absolute_error = mean_absolute_error(actual, pred)
    return np.sqrt(squared_error), absolute_error

## The conventional way

In [None]:
# hyperparameters for this single baseline fit
alpha = 0.01
l1_ratio = 0.15

# build the elastic net regressor and train it on the training split
lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
lr.fit(train_X, train_y)

# predict on the held-out split and score the predictions
y_pred = lr.predict(test_X)
rmse, mae = eval_metrics(test_y, y_pred)

# report the configuration followed by its test-set errors
print(
    'Hyperparameters: Alpha =  {}, L1 Ratio = {} \n'.format(alpha, l1_ratio),
    'Model Performance on test set: RMSE = {}, MAE = {} \n'.format(rmse, mae),
    sep='\n',
)

In [None]:
# candidate values for each hyperparameter
alphas = [0.01, 0.02, 0.5]
l1_ratios = [0.15, 0.2, 0.5]

# every (alpha, l1_ratio) combination, alpha varying slowest
grid = [(a, r) for a in alphas for r in l1_ratios]

for alpha, l1_ratio in grid:
    # train an elastic net with this combination
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    lr.fit(train_X, train_y)

    # score its predictions on the held-out split
    y_pred = lr.predict(test_X)
    rmse, mae = eval_metrics(test_y, y_pred)

    # report the combination and its errors, then a separator line
    print(
        'Hyperparameters: Alpha =  {}, L1 Ratio = {} \n'.format(alpha, l1_ratio),
        'Model Performance on test set: RMSE = {}, MAE = {} \n'.format(rmse, mae),
        sep='\n',
    )
    print('-' * 50, '\n')

## Using MLflow

In [None]:
# defining a new experiment
experiment_name = 'PlainRegression'

# Look the experiment up first and create it only when it is missing.
# A blanket try/except around create_experiment would also swallow unrelated
# failures and then fail on the fallback lookup with a confusing error.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    # create_experiment returns the ID of the new experiment
    exp_id = mlflow.create_experiment(name=experiment_name)
else:
    exp_id = existing_experiment.experiment_id

In [None]:
# make sure the folder for saved figures exists; exist_ok avoids the separate
# "is it already there?" check and does not fail when the folder is present
os.makedirs('images', exist_ok=True)

# everything logged inside this block is recorded under a single MLflow run
# that belongs to the experiment identified by exp_id
with mlflow.start_run(experiment_id=exp_id):

    # simulating an EDA step: one box plot per training feature
    train_X.plot(kind='box', subplots=True, layout=(2,4), figsize=(16,9), title='Box plot of each feature')

    # saving the figure into the images folder
    plt.savefig('images/distribution_plot_all_features.png')

    # logging artifacts -> attaches the contents of the images folder to the run
    mlflow.log_artifacts('images')

    # defining alpha and l1 ratio
    alpha, l1_ratio = 0.02, 0.15

    # initiating an elastic net model
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)

    # fitting the model with the training split
    lr.fit(train_X, train_y)

    # making predictions on the held-out split
    y_pred = lr.predict(test_X)

    # obtaining the model performance
    rmse, mae = eval_metrics(test_y, y_pred)

    # logging the hyperparameters used for this run
    mlflow.log_param('alpha', alpha)
    mlflow.log_param('l1_ratio', l1_ratio)

    # logging the resulting error metrics
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('mae', mae)

    # storing the fitted model with the run for later use
    mlflow.sklearn.log_model(lr, "PlainRegression_Model")
    

## Hyperparameter Tuning using MLflow

In [None]:
# defining a new experiment
experiment_name = 'PlainRegression_HyperParameter_Search'

# Look the experiment up first and create it only when it is missing.
# A blanket try/except around create_experiment would also swallow unrelated
# failures and then fail on the fallback lookup with a confusing error.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    # create_experiment returns the ID of the new experiment
    exp_id = mlflow.create_experiment(name=experiment_name)
else:
    exp_id = existing_experiment.experiment_id

In [None]:
# candidate values for each hyperparameter
alphas = [0.01, 0.05, 0.1, 0.02, 0.03]
l1_ratios = [0.15, 0.1, 0.2, 0.25]

# every (alpha, l1_ratio) combination, alpha varying slowest
search_grid = [(a, r) for a in alphas for r in l1_ratios]

for alpha, l1_ratio in search_grid:
    # one MLflow run per combination, all tracked under the experiment defined above
    with mlflow.start_run(experiment_id=exp_id):

        # attach the contents of the images folder to this run
        mlflow.log_artifacts('images')

        # train an elastic net with this combination
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_X, train_y)

        # score its predictions on the held-out split
        y_pred = lr.predict(test_X)
        rmse, mae = eval_metrics(test_y, y_pred)

        # record the hyperparameters of this run
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)

        # record the resulting error metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
    