In [1]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
import pickle
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer

# for regression models
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

#performance
from sklearn.metrics import mean_squared_error

# To suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [160]:
# MLflow setup: set the tracking URI to the SQLite URL below and select
# "mlops-project" as the experiment for this session.
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("mlops-project")

<Experiment: artifact_location='/home/tapji/.conda/mlops-project/mlruns/1', creation_time=1689852646451, experiment_id='1', last_update_time=1689852646451, lifecycle_stage='active', name='mlops-project', tags={}>

### Importing dataset

In [117]:
def read_dataframe(filename: str):
    """Load the emissions CSV and return a cleaned frame ready for modelling.

    Steps, in order:
      1. Read the file as ISO-8859-1 text.
      2. Parse 'Total cost / 10000 miles' from a currency string to int.
      3. Drop the columns that are not used as features.
      4. Mean-impute 'WLTP CO2' and 'Particulates [No.] [mg/km]'.
      5. Convert object-dtype columns to pandas categoricals.
      6. Replace spaces in column names with underscores.

    Parameters
    ----------
    filename : str
        Path to the CSV file.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; column names contain no spaces.

    Raises
    ------
    KeyError
        If the cost column, an imputed column, or any column listed for
        dropping is missing from the file.
    ValueError
        If a cost value cannot be converted to int once '£' and ',' are removed.
    """
    df = pd.read_csv(filename, encoding='iso-8859-1')

    # Only the total cost survives the column drop below, so it is the only
    # currency column worth parsing. The two annual-cost columns are dropped
    # unparsed, so bad or missing values in them can no longer abort the load.
    cost_column = 'Total cost / 10000 miles'
    df[cost_column] = (
        df[cost_column]
        .str.replace('£', '', regex=False)   # literal match, not a regex
        .str.replace(',', '', regex=False)   # thousands separator
        .astype(int)
    )

    # Dropping irrelevant features
    irrelevant_columns = [
        'Manufacturer', 'Model', 'Description', 'Transmission',
        'Engine Power (Kw)', 'Engine Power (PS)',
        'Electric energy consumption Miles/kWh', 'wh/km',
        'Diesel VED Supplement', 'Testing Scheme', 'Euro Standard',
        'Maximum range (Miles)',
        'WLTP Imperial Low', 'WLTP Imperial Medium', 'WLTP Imperial High',
        'WLTP Imperial Extra High', 'WLTP Imperial Combined',
        'WLTP Imperial Combined (Weighted)',
        'WLTP Metric Low', 'WLTP Metric Medium', 'WLTP Metric High',
        'WLTP Metric Extra High', 'WLTP Metric Combined',
        'WLTP Metric Combined (Weighted)', 'WLTP CO2 Weighted',
        'Equivalent All Electric Range Miles',
        'Equivalent All Electric Range KM',
        'THC Emissions [mg/km]', 'Electric Range City Miles', 'RDE NOx Urban',
        'Powertrain', 'Annual fuel Cost 10000 Miles',
        'Electric Range City Km', 'Noise Level dB(A)',
        'RDE NOx Combined', 'Emissions CO [mg/km]', 'Emissions NOx [mg/km]',
        'THC + NOx Emissions [mg/km]',
        'Annual Electricity cost / 10000 miles', 'Maximum range (Km)',
    ]
    df = df.drop(columns=irrelevant_columns)

    # Imputing missing values with the column mean.
    # NOTE(review): the imputer is fitted on the whole file, i.e. before any
    # train/test split, so held-out rows influence the fill value.
    columns_to_impute = ['WLTP CO2', 'Particulates [No.] [mg/km]']
    imputer = SimpleImputer(strategy='mean')
    df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])

    # Converting object columns to the pandas categorical dtype. The labels
    # are kept as they are; no integer encoding is applied here.
    for feature in df.columns:
        if df[feature].dtype == 'object':
            df[feature] = pd.Categorical(df[feature])

    # Renaming columns: spaces become underscores
    df = df.rename(columns=lambda name: name.replace(' ', '_'))

    return df


In [118]:
# Load and clean the emissions CSV, then list the resulting columns and dtypes
df = read_dataframe('./data/emission_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4625 entries, 0 to 4624
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   Engine_Capacity             4625 non-null   int64   
 1   Fuel_Type                   4625 non-null   category
 2   WLTP_CO2                    4625 non-null   float64 
 3   Total_cost_/_10000_miles    4625 non-null   int64   
 4   Particulates_[No.]_[mg/km]  4625 non-null   float64 
dtypes: category(1), float64(2), int64(2)
memory usage: 149.5 KB


In [119]:
def train_test(df, test_size=0.30, random_state=1):
    """Build the OLS design matrix and split it into train and test parts.

    The target is the 'WLTP_CO2' column; every other column is a feature.
    Categorical features are one-hot encoded (first level dropped) and a
    constant column is added so the fitted model has an intercept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'WLTP_CO2' plus the feature columns.
    test_size : float, default 0.30
        Fraction of rows held out for testing (70:30 split by default).
    random_state : int, default 1
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # independent variables
    X = df.drop(["WLTP_CO2"], axis=1)
    # dependent variable
    y = df["WLTP_CO2"]

    # One-hot encode the categorical columns. dtype=float keeps the design
    # matrix purely numeric: pandas >= 2.0 returns bool dummies by default,
    # and a frame mixing bool with numeric columns converts to an object
    # array, which statsmodels refuses to fit.
    X = pd.get_dummies(X, drop_first=True, dtype=float)

    # Adding intercept to the dataset
    X = sm.add_constant(X)

    # Splitting X and y into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [120]:
# OLS inputs: dummy-encoded features plus a constant column, split into train/test
X_train, X_test, y_train, y_test = train_test(df)

In [121]:
# Training the model

def ols_regression(X_train, X_test, y_train, y_test):
    """Fit an OLS model on the training data and score it on both splits.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Design matrices. ``X_test`` may carry extra columns; they are ignored.
    y_train, y_test : array-like
        Target values for the corresponding design matrix.

    Returns
    -------
    tuple
        ``(rmse1, rmse2, olsres)``: the training RMSE, the test RMSE and the
        fitted statsmodels results object.

    Raises
    ------
    KeyError
        If ``X_test`` lacks a column that is present in ``X_train``.
    """
    olsres = sm.OLS(y_train, X_train).fit()

    # Select the training columns, in training order, into a new frame.
    # This discards test-only columns (as before) but no longer mutates the
    # caller's X_test, and it guarantees each coefficient is matched with
    # the column it was fitted on.
    X_test_aligned = X_test.loc[:, X_train.columns]

    # Making predictions on the test set
    y_pred = olsres.predict(X_test_aligned)
    rmse1 = np.sqrt(mean_squared_error(y_train, olsres.fittedvalues))
    rmse2 = np.sqrt(mean_squared_error(y_test, y_pred))

    # Train RMSE, test RMSE and the fitted results (call .summary() on it)
    return rmse1, rmse2, olsres

In [122]:
# Fit OLS and keep the train RMSE, test RMSE and the fitted results object
rmse1, rmse2, olres = ols_regression(X_train, X_test, y_train, y_test)

In [123]:
# OLS train RMSE followed by OLS test RMSE
print(rmse1, rmse2)

17.613735102497987 13.022063151691114


In [124]:
# Print the regression summary of the fitted OLS results object
print(olres.summary())

                            OLS Regression Results                            
Dep. Variable:               WLTP_CO2   R-squared:                       0.919
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     3657.
Date:                Fri, 21 Jul 2023   Prob (F-statistic):               0.00
Time:                        09:19:01   Log-Likelihood:                -13879.
No. Observations:                3237   AIC:                         2.778e+04
Df Residuals:                    3226   BIC:                         2.785e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

In [130]:
# Summarize df's columns, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4625 entries, 0 to 4624
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   Engine_Capacity             4625 non-null   int64   
 1   Fuel_Type                   4625 non-null   category
 2   WLTP_CO2                    4625 non-null   float64 
 3   Total_cost_/_10000_miles    4625 non-null   int64   
 4   Particulates_[No.]_[mg/km]  4625 non-null   float64 
dtypes: category(1), float64(2), int64(2)
memory usage: 149.5 KB


In [131]:
def lasso_prep(df):
    """Split the frame 70:30 and encode the features with a DictVectorizer.

    'WLTP_CO2' is the target. The vectorizer is fitted on the training rows
    only and then applied to the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding 'WLTP_CO2', 'Fuel_Type', 'Engine_Capacity',
        'Total_cost_/_10000_miles' and 'Particulates_[No.]_[mg/km]'.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the X parts are the
        DictVectorizer outputs and the y parts are the target slices.
    """
    # Feature columns fed to the vectorizer: the categorical one first,
    # then the numerical ones.
    feature_columns = ['Fuel_Type'] + [
        'Engine_Capacity',
        'Total_cost_/_10000_miles',
        'Particulates_[No.]_[mg/km]',
    ]

    # dependent variable / independent variables
    target = df["WLTP_CO2"]
    features = df.drop(["WLTP_CO2"], axis=1)

    # Splitting into train and test sets in a 70:30 ratio
    features_train, features_test, y_train, y_test = train_test_split(
        features, target, test_size=0.30, random_state=1)

    # Fit the encoding on the training records, reuse it for the test records
    vectorizer = DictVectorizer()
    X_train = vectorizer.fit_transform(
        features_train[feature_columns].to_dict(orient='records'))
    X_test = vectorizer.transform(
        features_test[feature_columns].to_dict(orient='records'))

    return X_train, X_test, y_train, y_test

In [132]:
# Rebinds X_train/X_test/y_train/y_test (previously the OLS split) to the
# DictVectorizer-encoded inputs used for Lasso
X_train, X_test, y_train, y_test = lasso_prep(df)

In [157]:
def lasso_regression(X_train, X_test, y_train, y_test, alpha=1.0):
    """Fit a Lasso model on the training data and score it on both splits.

    Parameters
    ----------
    X_train, X_test : array-like or sparse matrix
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Target values; flattened to 1-D before use.
    alpha : float, default 1.0
        Regularization strength passed to ``Lasso``. It used to be read from
        an undefined global; 1.0 is scikit-learn's own default.

    Returns
    -------
    tuple
        ``(rmse3, rmse4, lr)``: the training RMSE, the test RMSE and the
        fitted ``Lasso`` estimator.
    """
    # Convert the targets to 1-dimensional arrays
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    # Fit the Lasso model to the training data
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    # Making predictions on the test set
    y_pred = lr.predict(X_test)

    # Calculate RMSE for training set
    rmse3 = np.sqrt(mean_squared_error(y_train, lr.predict(X_train)))

    # Calculate RMSE for test set
    rmse4 = np.sqrt(mean_squared_error(y_test, y_pred))

    # Return the values computed here, not the notebook-level OLS results
    return rmse3, rmse4, lr

In [158]:
# Fit the Lasso model and unpack the returned RMSE pair and fitted estimator
rmse3, rmse4, lr = lasso_regression(X_train, X_test, y_train, y_test)

In [159]:
# Show the values unpacked from lasso_regression (rmse3, rmse4), not the
# OLS variables rmse1/rmse2
print(rmse3, rmse4)

17.615890600183416 13.036524382241952
