<a href="https://colab.research.google.com/github/vglykos/Cars-RFE-regression/blob/main/Cars_rfe_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Author Notes

---


## 1) I modified the following Kaggle notebook
### https://www.kaggle.com/code/jnikhilsai/cross-validation-with-linear-regression/notebook
## 2) Although the code is neat (good job Nikhil Sai), I found a couple of errors (data leakage), which I fixed.
## 3) I also tried more regression models and tuned their parameters with GridSearchCV / RandomizedSearchCV.
## 4) An interesting point is how to wrap the RFE selector around the model (lr and ridge).
## 5) I didn't use RFE with Lasso because Lasso's regularization already performs feature selection. (To be honest, I tried RFE with Lasso and got almost the same results as with Lasso alone, so I didn't include it in this notebook.)
## 6) I didn't change anything in the feature engineering code; it is copy-pasted from the original notebook.
## 7) The code was written in Google Colab and the data were imported from my Google Drive.

In [None]:
# import all libraries
# Standard library
import re
import warnings # suppress warnings

# Numerics and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn: preprocessing, feature selection, models, model selection, pipelines
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale, RobustScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer

# plot_confusion_matrix was removed from scikit-learn in version 1.2, so a plain
# import makes this whole cell fail on current releases. Guard it so the name
# stays bound either way (None when unavailable).
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive (Colab only) so files under "My Drive" can be read from disk.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the car-price CSV from the mounted Drive into the `cars` DataFrame.
# The path is specific to the author's Drive layout; edit DATA_PATH to run elsewhere.
DATA_PATH = '/content/drive/My Drive/Colab Notebooks/CV-LR-Housing_Cars-Kaggle/CarPrice_Assignment.csv'
cars = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
cars.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [None]:
# Print each column's name, non-null count and dtype.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

## Data preparation

In [None]:
# Data preparation starts here (continued in the following cells).

# symboling is stored as integers but is treated as a category: cast it to object
# dtype so it is selected together with the other categorical columns later on.
symboling_as_object = cars['symboling'].astype('object')
cars['symboling'] = symboling_as_object
cars.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [None]:
# create new column: car_company
# The company is taken as the first match in CarName of: word characters, an
# optional single hyphen, then more word characters (e.g. "alfa-romero", "audi").
p = re.compile(r'\w+-?\w+')


def _first_match(car_name):
    """Return the first substring of `car_name` matched by pattern `p`."""
    return p.findall(car_name)[0]


cars['car_company'] = cars['CarName'].map(_first_match)
cars.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,car_company
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0,audi
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0,audi


In [None]:
# replacing misspelled car_company names
# Each key is a variant found in the data; the value is the canonical spelling.
company_spelling_fixes = {
    "vw": 'volkswagen',
    "vokswagen": 'volkswagen',
    "porcshce": 'porsche',
    "toyouta": 'toyota',
    "Nissan": 'nissan',
    "maxda": 'mazda',
}
cars['car_company'] = cars['car_company'].replace(company_spelling_fixes)

In [None]:
# CarName has served its purpose (car_company was derived from it); remove the column.
cars = cars.drop(columns=['CarName'])

In [None]:
# Separate the predictors (X) from the target (y = price).
feature_columns = [
    'symboling', 'fueltype', 'aspiration', 'doornumber',
    'carbody', 'drivewheel', 'enginelocation', 'wheelbase', 'carlength',
    'carwidth', 'carheight', 'curbweight', 'enginetype', 'cylindernumber',
    'enginesize', 'fuelsystem', 'boreratio', 'stroke', 'compressionratio',
    'horsepower', 'peakrpm', 'citympg', 'highwaympg',
    'car_company',
]
X = cars[feature_columns]
y = cars['price']

In [None]:
# creating dummy variables for categorical variables
# First step: isolate the object-dtype columns of X, which are the ones to be encoded.
cars_categorical = X.select_dtypes(include=['object'])
cars_categorical.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem,car_company
0,3,gas,std,two,convertible,rwd,front,dohc,four,mpfi,alfa-romero
1,3,gas,std,two,convertible,rwd,front,dohc,four,mpfi,alfa-romero
2,1,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi,alfa-romero
3,2,gas,std,four,sedan,fwd,front,ohc,four,mpfi,audi
4,2,gas,std,four,sedan,4wd,front,ohc,five,mpfi,audi


In [None]:
# convert into dummies
# drop_first=True drops the first level of every categorical variable, so each
# variable contributes (levels - 1) indicator columns.
# dtype=int pins the indicators to numeric 0/1: the default dummy dtype depends on
# the installed pandas (boolean True/False from pandas 2.0 onwards).
cars_dummies = pd.get_dummies(cars_categorical, drop_first=True, dtype=int)
cars_dummies.head()

Unnamed: 0,symboling_-1,symboling_0,symboling_1,symboling_2,symboling_3,fueltype_gas,aspiration_turbo,doornumber_two,carbody_hardtop,carbody_hatchback,...,car_company_nissan,car_company_peugeot,car_company_plymouth,car_company_porsche,car_company_renault,car_company_saab,car_company_subaru,car_company_toyota,car_company_volkswagen,car_company_volvo
0,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Replace the raw categorical columns in X with their dummy encoding:
# keep the remaining columns and append the indicator columns on the right.
categorical_names = list(cars_categorical.columns)
X = pd.concat([X.drop(columns=categorical_names), cars_dummies], axis=1)

# Now we create and fit the models

In [None]:
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
split_options = dict(train_size=0.7, test_size=0.3, random_state=20)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# ColumnTransformer-based scaler. The pattern is useful when only some columns
# need a given transformation; here every column of X_train is listed, so all
# features go through RobustScaler.
# NOTE(review): the model pipelines further down create their own RobustScaler
# step and do not appear to use `preprocessor` -- confirm whether it is still needed.
cols = X_train.columns

transformer = Pipeline([('scaler', RobustScaler())])
preprocessor = ColumnTransformer([('trans', transformer, cols)])

preprocessor

ColumnTransformer(transformers=[('trans',
                                 Pipeline(steps=[('scaler', RobustScaler())]),
                                 Index(['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
       'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
       'peakrpm', 'citympg', 'highwaympg', 'symboling_-1', 'symboling_0',
       'symboling_1', 'symboling_2', 'symboling_3', 'fueltype_gas',
       'aspiration_t...
       'car_company_dodge', 'car_company_honda', 'car_company_isuzu',
       'car_company_jaguar', 'car_company_mazda', 'car_company_mercury',
       'car_company_mitsubishi', 'car_company_nissan', 'car_company_peugeot',
       'car_company_plymouth', 'car_company_porsche', 'car_company_renault',
       'car_company_saab', 'car_company_subaru', 'car_company_toyota',
       'car_company_volkswagen', 'car_company_volvo'],
      dtype='object'))])

In [None]:
# Shuffled 10-fold cross-validation splitter; the fixed random_state keeps the
# fold assignment identical from run to run.
folds = KFold(n_splits = 10, shuffle = True, random_state = 10)

# Hyper-parameter search (GridSearchCV / RandomizedSearchCV) with multiple regression models (LinearRegression, Ridge and Lasso)

In [None]:
# One pipeline per model. Every pipeline scales the features with RobustScaler
# first, so the scaler is re-fitted on whatever data the pipeline is fitted on.

# linear regression: RFE wraps the estimator and is the final pipeline step
lr = RFE(LinearRegression())
pipe_lr = Pipeline([
    ("scaler", RobustScaler()),
    ("rfe", lr),
])

# ridge regression: same layout, with Ridge inside the RFE wrapper
ridge = RFE(Ridge())
pipe_ridge = Pipeline([
    ("scaler", RobustScaler()),
    ("rfe", ridge),
])

# lasso (No RFE): the estimator is used directly as the final step
pipe_lasso = Pipeline([
    ("scaler", RobustScaler()),
    ("lasso", Lasso()),
])



In [None]:
# Search spaces for the three hyper-parameter searches.
# Keys follow the "<pipeline step>__<parameter>" convention; parameters of the
# estimator wrapped by RFE are reached through "rfe__estimator__<parameter>".

# RFE + LinearRegression: step sizes 2..39
lr_params = dict(rfe__step=[*range(2, 40)])

# RFE + Ridge: step sizes 1..39 plus the regularisation strength
ridge_params = dict(
    rfe__step=[*range(1, 40)],
    rfe__estimator__alpha=[0.1, 1, 10, 100, 200],
    rfe__estimator__max_iter=[100000],
)

# Lasso: regularisation strength only
lasso_params = dict(
    lasso__alpha=[0.1, 1, 9, 10, 15, 20, 30, 40, 50, 100],
    lasso__max_iter=[100000],
)

In [None]:
# Instantiate, fit and report one hyper-parameter search per pipeline.
#
# RandomizedSearchCV evaluates a random sample of each search space instead of
# every combination, so it takes less time to run than GridSearchCV.
# For the exhaustive variant, build each search as
#     GridSearchCV(estimator=<pipeline>, param_grid=<params>, **search_kwargs)

# Seed for the candidate sampling. Without it a different set of candidates is
# drawn on every run, so best_params_ and the reported scores are not reproducible.
SEARCH_SEED = 10

# Settings shared by all three searches.
search_kwargs = dict(scoring='r2', cv=folds, verbose=1, n_jobs=-1)

#Instantiate RandomizedSearchCV
lr_grid_search = RandomizedSearchCV(estimator=pipe_lr, param_distributions=lr_params,
                                    random_state=SEARCH_SEED, **search_kwargs)

ridge_grid_search = RandomizedSearchCV(estimator=pipe_ridge, param_distributions=ridge_params,
                                       random_state=SEARCH_SEED, **search_kwargs)

lasso_grid_search = RandomizedSearchCV(estimator=pipe_lasso, param_distributions=lasso_params,
                                       random_state=SEARCH_SEED, **search_kwargs)


#Fit models
grids = [lr_grid_search, ridge_grid_search, lasso_grid_search]
for search in grids:
    search.fit(X_train, y_train)


#print results
grid_dict = {0: 'Scale-RFE-LinearRegression',
             1: 'Scale-RFE-Ridge',
             2: 'Scale-Lasso'}

for i, model in enumerate(grids):
    # The searches were created with scoring='r2', so .score() reports R^2 for
    # both lines below (the "Accuracy" wording in the label is not a
    # classification accuracy).
    print('{} Training set: {:.3f}\n'.format(grid_dict[i],
                                             model.score(X_train, y_train)))
    print('{} Testing set Accuracy: {:.3f}\n'.format(grid_dict[i],
                                                     model.score(X_test, y_test)))
    print('{} Best Params: {}\n\n\n'.format(grid_dict[i], model.best_params_))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Scale-RFE-LinearRegression Training set: 0.965

Scale-RFE-LinearRegression Testing set Accuracy: 0.767

Scale-RFE-LinearRegression Best Params: {'rfe__step': 8}



Scale-RFE-Ridge Training set: 0.957

Scale-RFE-Ridge Testing set Accuracy: 0.845

Scale-RFE-Ridge Best Params: {'rfe__step': 31, 'rfe__estimator__max_iter': 100000, 'rfe__estimator__alpha': 1}



Scale-Lasso Training set: 0.967

Scale-Lasso Testing set Accuracy: 0.852

Scale-Lasso Best Params: {'lasso__max_iter': 100000, 'lasso__alpha': 9}



