# Regression Models:

## Import Modules:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Random seed for reproducibility
seed_val = 42
np.random.seed(seed_val)

plt.style.use('fivethirtyeight')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

## Import Data:

In [3]:
df_train_import = pd.read_csv('Dataset1_train.csv')
df_train_import.head(3)

Unnamed: 0,Engine Size(L),Cylinders,Gearbox_Type,Gearbox_Number,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),Model Features_Flexible-fuel vehicle,Model Features_Four-wheel drive,...,Vehicle Class_SUBCOMPACT,Vehicle Class_SUV - SMALL,Vehicle Class_SUV - STANDARD,Vehicle Class_TWO-SEATER,Vehicle Class_VAN - CARGO,Vehicle Class_VAN - PASSENGER,Fuel Type_E,Fuel Type_X,Fuel Type_Z,CO2 Emissions(g/km)
0,4.0,6,1,5.0,14.7,10.3,12.7,22,0,0,...,0,0,0,0,0,0,0,1,0,299
1,2.4,4,1,6.0,9.8,6.7,8.4,34,0,0,...,0,0,0,0,0,0,0,1,0,193
2,2.0,4,1,8.0,11.3,7.1,9.4,30,0,0,...,1,0,0,0,0,0,0,0,1,221


In [4]:
y_train = df_train_import.pop("CO2 Emissions(g/km)")
y_train.head(3)

0    299
1    193
2    221
Name: CO2 Emissions(g/km), dtype: int64

In [5]:
X_train = df_train_import
X_train.head(3)

Unnamed: 0,Engine Size(L),Cylinders,Gearbox_Type,Gearbox_Number,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),Model Features_Flexible-fuel vehicle,Model Features_Four-wheel drive,...,Vehicle Class_STATION WAGON - SMALL,Vehicle Class_SUBCOMPACT,Vehicle Class_SUV - SMALL,Vehicle Class_SUV - STANDARD,Vehicle Class_TWO-SEATER,Vehicle Class_VAN - CARGO,Vehicle Class_VAN - PASSENGER,Fuel Type_E,Fuel Type_X,Fuel Type_Z
0,4.0,6,1,5.0,14.7,10.3,12.7,22,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2.4,4,1,6.0,9.8,6.7,8.4,34,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2.0,4,1,8.0,11.3,7.1,9.4,30,0,0,...,0,1,0,0,0,0,0,0,0,1


In [6]:
df_test = pd.read_csv('Dataset1_test.csv')
df_test.head(3)

Unnamed: 0,Engine Size(L),Cylinders,Gearbox_Type,Gearbox_Number,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),Model Features_Flexible-fuel vehicle,Model Features_Four-wheel drive,...,Vehicle Class_SUBCOMPACT,Vehicle Class_SUV - SMALL,Vehicle Class_SUV - STANDARD,Vehicle Class_TWO-SEATER,Vehicle Class_VAN - CARGO,Vehicle Class_VAN - PASSENGER,Fuel Type_E,Fuel Type_X,Fuel Type_Z,CO2 Emissions(g/km)
0,6.2,8,0,7.0,18.2,12.5,15.6,18,0,0,...,0,0,0,1,0,0,0,0,1,368
1,3.6,6,1,6.0,14.8,9.9,12.6,22,0,0,...,0,1,0,0,0,0,0,1,0,290
2,4.2,8,0,6.0,20.5,11.7,16.6,17,0,0,...,0,0,0,1,0,0,0,0,1,382


In [7]:
y_test = df_test.pop("CO2 Emissions(g/km)")
y_test.head(3)

0    368
1    290
2    382
Name: CO2 Emissions(g/km), dtype: int64

In [8]:
X_test = df_test
X_test.head(3)

Unnamed: 0,Engine Size(L),Cylinders,Gearbox_Type,Gearbox_Number,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),Model Features_Flexible-fuel vehicle,Model Features_Four-wheel drive,...,Vehicle Class_STATION WAGON - SMALL,Vehicle Class_SUBCOMPACT,Vehicle Class_SUV - SMALL,Vehicle Class_SUV - STANDARD,Vehicle Class_TWO-SEATER,Vehicle Class_VAN - CARGO,Vehicle Class_VAN - PASSENGER,Fuel Type_E,Fuel Type_X,Fuel Type_Z
0,6.2,8,0,7.0,18.2,12.5,15.6,18,0,0,...,0,0,0,0,1,0,0,0,0,1
1,3.6,6,1,6.0,14.8,9.9,12.6,22,0,0,...,0,0,1,0,0,0,0,0,1,0
2,4.2,8,0,6.0,20.5,11.7,16.6,17,0,0,...,0,0,0,0,1,0,0,0,0,1


## Scale the Data:

In [9]:
# Scale the Data
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Set Up Models and Gridsearches:

### Linear Regression Lasso

In [10]:
lrl_model_name = "Lasso_Regressor"

lrl_params = {"alpha":np.logspace(0.001,100,60)}

lrl_model = Lasso(random_state=seed_val)

lrl_gridsearch = GridSearchCV(lrl_model, lrl_params,n_jobs=-1)

### Linear Regression Ridge Regression

In [11]:
lrr_model_name = "Ridge_Regressor"

lrr_params = {"alpha":np.logspace(0.001,100,60),
             "solver":['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}

lrr_model = Ridge(random_state=seed_val)

lrr_gridsearch = GridSearchCV(lrr_model, lrr_params,n_jobs=-1)

### Linear Regression Elastic Net

In [12]:
lren_model_name = "Elastic_Net_Regressor"

lren_params = {"alpha":np.logspace(0.001,100,90),
              "l1_ratio":np.arange(0.0,1.1,0.1)}

lren_model = ElasticNet(random_state=seed_val)

lren_gridsearch = GridSearchCV(lren_model, lren_params,n_jobs=-1)

### Support Vector Machines

In [13]:
svr_model_name = "Support_Vector_Regressor"

svr_params = {"C": [0.01,1,3],
              "epsilon": [0.001,0.01,1]}

svr_model = SVR(kernel='linear')

svr_gridsearch = GridSearchCV(svr_model, svr_params,n_jobs=-1)

### Decision Tree Regressor

In [14]:
dt_model_name = "Decision_Tree_Regressor"

dt_params = {"criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
            "max_depth": [None,3,5,7,9,10,15]}

dt_model = DecisionTreeRegressor(random_state=seed_val)

dt_gridsearch = GridSearchCV(dt_model, dt_params,n_jobs=-1)

### Random Forrests Regressor

In [15]:
rf_model_name = "Random_Forrests_Regressor"

rf_params = {"n_estimators":[10,25,50,75,100],
            "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
            "max_depth": [None,3,5,7,9,10,15]}

rf_model = RandomForestRegressor(random_state=seed_val)

rf_gridsearch = GridSearchCV(rf_model, rf_params,n_jobs=-1)

In [16]:
#define  selector using random forest regressor
selector = RFE(estimator=RandomForestRegressor(n_estimators=100, random_state=42))
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

columns = df_train_import.columns
feature_ranking = dict(zip(columns, selector.ranking_))
print(f"Number of features selected: {selector.n_features_}")
# Create dataframe to view ranking
feature_ranking = pd.DataFrame(feature_ranking.items(), columns=['Feature', 'Ranking'])
feature_ranking

Number of features selected: 15


Unnamed: 0,Feature,Ranking
0,Engine Size(L),1
1,Cylinders,1
2,Gearbox_Type,1
3,Gearbox_Number,1
4,Fuel Consumption City (L/100 km),1
5,Fuel Consumption Hwy (L/100 km),1
6,Fuel Consumption Comb (L/100 km),1
7,Fuel Consumption Comb (mpg),1
8,Model Features_Flexible-fuel vehicle,3
9,Model Features_Four-wheel drive,2


## Conduct Gridsearches:

In [17]:
lrl_gridsearch.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [18]:
lrr_gridsearch.fit(X_train, y_train)

184 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
184 fits failed with the following error:
Traceback (most recent call last):
  File "/home/hindsonjf/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/hindsonjf/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hindsonjf/miniconda3/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py", line 1251, in fit
    return super().fit(X, y, sample_weight=sample_weight)
           ^^^^^^^^^^^

In [19]:
lren_gridsearch.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [20]:
svr_gridsearch.fit(X_train, y_train)

In [21]:
dt_gridsearch.fit(X_train, y_train)

In [22]:
rf_gridsearch.fit(X_train_selected, y_train)

## Compute Results and Predictions:

### Cross Validation Scores Table

In [23]:
list_model_names = [lrl_model_name,lrr_model_name,lren_model_name,svr_model_name,dt_model_name,rf_model_name]

scoring_list = ['r2','neg_mean_squared_error','neg_mean_absolute_error','neg_mean_absolute_percentage_error']

for scoring_methods in scoring_list:
    scores = np.array([
        cross_val_score(lrl_gridsearch.best_estimator_, X_train, y_train, cv=20, scoring=scoring_methods,n_jobs=-1),
        cross_val_score(lrr_gridsearch.best_estimator_, X_train, y_train, cv=20, scoring=scoring_methods,n_jobs=-1),
        cross_val_score(lren_gridsearch.best_estimator_, X_train, y_train, cv=20, scoring=scoring_methods,n_jobs=-1),
        cross_val_score(svr_gridsearch.best_estimator_, X_train, y_train, cv=20, scoring=scoring_methods,n_jobs=-1),
        cross_val_score(dt_gridsearch.best_estimator_, X_train, y_train, cv=20, scoring=scoring_methods,n_jobs=-1),
        cross_val_score(rf_gridsearch.best_estimator_, X_train_selected, y_train, cv=20, scoring=scoring_methods,n_jobs=-1),
        ])
    if scoring_methods == 'r2':
        df_cv_scores = pd.DataFrame(scores, index=list_model_names)
        df_cv_scores.to_csv(f"regression_CV_{scoring_methods}_scores.csv",index=True)
    else:
        scores = scores * -1.0
        df_cv_scores = pd.DataFrame(scores, index=list_model_names)
        df_cv_scores.to_csv(f"regression_CV_{scoring_methods}_scores.csv",index=True)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [24]:
best_scores = []
test_scores = []

model_list = [lrl_gridsearch,lrr_gridsearch,lren_gridsearch,svr_gridsearch,dt_gridsearch]

for model in model_list:
    best_scores.append(model.best_estimator_.score(X_train, y_train))
    test_scores.append(model.best_estimator_.score(X_test, y_test))

# Add Random Forests: rf_gridsearch
best_scores.append(rf_gridsearch.best_estimator_.score(X_train_selected, y_train))
test_scores.append(rf_gridsearch.best_estimator_.score(X_test_selected, y_test))

dict_scores = {'Model':list_model_names,
              'Best Train Score': best_scores,
              'Test Scores': test_scores}

df_other_scores =  pd.DataFrame(data=dict_scores)
df_other_scores

Unnamed: 0,Model,Best Train Score,Test Scores
0,Lasso_Regressor,0.985582,0.985504
1,Ridge_Regressor,0.993382,0.993554
2,Elastic_Net_Regressor,0.985582,0.985504
3,Support_Vector_Regressor,0.992134,0.992234
4,Decision_Tree_Regressor,0.999515,0.995569
5,Random_Forrests_Regressor,0.999253,0.996806


### Best Estimator Predictions:

In [25]:
for model,name in zip(model_list,list_model_names[:-1]):
    d = {'y': model.predict(X_test)}
    df_predict = pd.DataFrame(data=d)
    df_predict.to_csv(f"regression_prediction_{name}.csv",index=False)

# Add Random Forests: rf_gridsearch
drf = {'y': rf_gridsearch.predict(X_test_selected)}
df_predictrf = pd.DataFrame(data=drf)
df_predictrf.to_csv(f"regression_prediction_{rf_model_name}.csv",index=False)

## Export Results:

In [26]:
df_other_scores.to_csv(f"other_regression_scores.csv",index=False)