# Regression models

No apply Tuning parameters 

## 1. Importing the libraries

In [1]:
import numpy as np
np.set_printoptions(threshold=0)
import matplotlib.pyplot as plt
import pandas as pd

## 2. Importing the dataset

In [2]:
dataset = pd.read_csv('../dataSet/main_data.csv')
dataset

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,GT86,2016,Manual,24089,Petrol,265.0,36.2,2.0,16000
1,GT86,2017,Manual,18615,Petrol,145.0,36.2,2.0,15995
2,GT86,2015,Manual,27469,Petrol,265.0,36.2,2.0,13998
3,GT86,2017,Manual,14736,Petrol,150.0,36.2,2.0,18998
4,GT86,2017,Manual,36284,Petrol,145.0,36.2,2.0,17498
...,...,...,...,...,...,...,...,...,...
6733,IQ,2011,Automatic,30000,Petrol,20.0,58.9,1.0,5500
6734,Urban Cruiser,2011,Manual,36154,Petrol,125.0,50.4,1.3,4985
6735,Urban Cruiser,2012,Manual,46000,Diesel,125.0,57.6,1.4,4995
6736,Urban Cruiser,2011,Manual,60700,Petrol,125.0,50.4,1.3,3995


#### we can see clearly that indexes at 0, 2, and 4 are object

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6738 entries, 0 to 6737
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         6738 non-null   object 
 1   year          6738 non-null   int64  
 2   transmission  6738 non-null   object 
 3   mileage       6738 non-null   int64  
 4   fuelType      6738 non-null   object 
 5   tax           6738 non-null   float64
 6   mpg           6738 non-null   float64
 7   engineSize    6738 non-null   float64
 8   price         6738 non-null   int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 473.9+ KB


### 2.1 check the shape

In [4]:
dataset.shape

(6738, 9)

### 2.4 Get independent/feature variables

In [5]:
X = dataset.iloc[:, :-1].values
print(X)
print(X.shape)

[[' GT86' 2016 'Manual' ... 265.0 36.2 2.0]
 [' GT86' 2017 'Manual' ... 145.0 36.2 2.0]
 [' GT86' 2015 'Manual' ... 265.0 36.2 2.0]
 ...
 [' Urban Cruiser' 2012 'Manual' ... 125.0 57.6 1.4]
 [' Urban Cruiser' 2011 'Manual' ... 125.0 50.4 1.3]
 [' Urban Cruiser' 2011 'Manual' ... 125.0 50.4 1.3]]
(6738, 8)


In [6]:
type(X)

numpy.ndarray

### 2.5 Get dependent variables

In [7]:
y = dataset.iloc[:, -1].values
print(y)
print(y.shape)

[16000 15995 13998 ...  4995  3995  4495]
(6738,)


## 3. Encoding the categorical variables

### source: https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/

We are using OneHotEncoder/ColumnTransformer to create Dummy Variables

A dummy variable is a variable created to assign numerical value to levels of categorical variables.

Dummy variables are variables that are either 0 or 1

In [8]:
print(X)
print(X.shape)
print(type(X))

[[' GT86' 2016 'Manual' ... 265.0 36.2 2.0]
 [' GT86' 2017 'Manual' ... 145.0 36.2 2.0]
 [' GT86' 2015 'Manual' ... 265.0 36.2 2.0]
 ...
 [' Urban Cruiser' 2012 'Manual' ... 125.0 57.6 1.4]
 [' Urban Cruiser' 2011 'Manual' ... 125.0 50.4 1.3]
 [' Urban Cruiser' 2011 'Manual' ... 125.0 50.4 1.3]]
(6738, 8)
<class 'numpy.ndarray'>


### Export X features for deploying later

In [9]:
# Run only 1 time

# X_deploy = pd.DataFrame(X)
# X_deploy.to_csv('X_deploy.csv', index=False)

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, 2, 4])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))
X = ct.fit_transform(X).toarray()

In [11]:
dataset['model'].unique()

array([' GT86', ' Corolla', ' RAV4', ..., ' Verso-S', ' IQ',
       ' Urban Cruiser'], dtype=object)

 #### Example when encoding successfully
 
 0.0 0.0 0.0 0.0 0.0 0.0 1.0 equivalent to "GT86"
 
 0.0 0.0 0.0 0.0 0.0 1.0 0.0 equivalent to "Corolla"

In [12]:
print(X)
print(X.shape)
print(type(X))

[[  0.    0.    0.  ... 265.   36.2   2. ]
 [  0.    0.    0.  ... 145.   36.2   2. ]
 [  0.    0.    0.  ... 265.   36.2   2. ]
 ...
 [  0.    0.    0.  ... 125.   57.6   1.4]
 [  0.    0.    0.  ... 125.   50.4   1.3]
 [  0.    0.    0.  ... 125.   50.4   1.3]]
(6738, 31)
<class 'numpy.ndarray'>


## 4. Splitting the dataset into the Training set and Test set

test set = 20% of dataset

random_state=None: we get different train and test sets across different executions and the shuffling process is out of control. 

random_state=0 , we get the same train and test sets across different executions.

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = None)

In [14]:
print(X_train)
print(X_train.shape)

[[  0.    0.    1.  ... 145.   55.4   1. ]
 [  0.    0.    0.  ... 140.   49.6   2.5]
 [  0.    0.    0.  ... 150.   58.9   1.5]
 ...
 [  0.    0.    0.  ... 145.   47.9   1.5]
 [  0.    0.    0.  ...  30.   55.    1.3]
 [  0.    0.    0.  ... 140.   49.6   2.5]]
(5390, 31)


In [15]:
print(X_test)
print(X_test.shape)

[[  0.    0.    0.  ... 145.   47.9   1.5]
 [  0.    0.    0.  ... 135.   74.3   1.8]
 [  1.    0.    0.  ...   0.   78.5   1.8]
 ...
 [  0.    0.    1.  ... 145.   56.5   1. ]
 [  0.    0.    0.  ... 145.   47.9   1.5]
 [  0.    0.    0.  ...   0.   86.    1.5]]
(1348, 31)


In [16]:
print(y_train)
print(y_train.shape)
type(y_train)

[10491 30480 10498 ... 12426  8764 33950]
(5390,)


numpy.ndarray

In [17]:
print(y_test)
print(y_test.shape)

[16995 23995 12999 ...  8995 11949 12088]
(1348,)


### 4.1 Save train data for deploy

In [18]:
# Run only for 1 time

# X_train_deploy = pd.DataFrame(X_train)
# y_train_deploy = pd.DataFrame(y_train)

# X_train_deploy.to_csv('X_train_deploy.csv', index=False)
# y_train_deploy.to_csv('y_train_deploy.csv', index=False)

### 4.2 Save test data for deploy

In [19]:
# run only for 1 time

# X_test_deploy = pd.DataFrame(X_test)
# y_test_deploy = pd.DataFrame(y_test)

# X_test_deploy.to_csv('X_test_deploy.csv', index=False)
# y_test_deploy.to_csv('y_test_deploy.csv', index=False)

## -- Feature Scaling for SVR --
### source: https://towardsdatascience.com/all-about-feature-scaling-bcc0ad75cb35

In [20]:
#convert 1D to 2D array for transform
y_train_SVR = y_train.reshape(len(y_train),1)
y_test_SVR = y_test.reshape(len(y_test),1)

print(y_train_SVR)
print(y_test_SVR)

[[10491]
 [30480]
 [10498]
 ...
 [12426]
 [ 8764]
 [33950]]
[[16995]
 [23995]
 [12999]
 ...
 [ 8995]
 [11949]
 [12088]]


In [21]:
from sklearn.preprocessing import StandardScaler
sc_X_SVR = StandardScaler()
sc_y_SVR = StandardScaler()
X_train_SVR = sc_X_SVR.fit_transform(X_train)
y_train_SVR = sc_y_SVR.fit_transform(y_train_SVR)

print(y_train_SVR)

[[-0.31780898]
 [ 2.81658607]
 [-0.31671134]
 ...
 [-0.01438938]
 [-0.58861294]
 [ 3.36070288]]


## 5. Train and Build model

### 5.1 Multiple Linear Regression approach
### y = b0 + b1x1 + b2x2 + ... + bNxN

### source: https://en.wikipedia.org/wiki/Linear_regression#Simple_and_multiple_linear_regression
### source: https://www.simplilearn.com/what-is-backward-elimination-technique-in-machine-learning-article#:~:text=What%20is%20backward%20elimination%20in,is%20removed%20from%20the%20model.
### P value: https://www.investopedia.com/terms/p/p-value.asp, https://www.simplypsychology.org/p-value.html


Using 1 of 5 methods to find the best independent/features variable under the hood
    
    - All in
    - backward elimination
    - forward elimination
    - bidirectional elimination
    - Score comparision

Using Ordinary Least Squares Algorithms to find the best linear regression
    

Training the Multiple Linear Regression model on the Training set

In [22]:
from sklearn.linear_model import LinearRegression
regressor_linear = LinearRegression()
regressor_linear.fit(X_train, y_train)

LinearRegression()

### 5.2 Support Vector Regression (SVR) approach 

### source: https://files.core.ac.uk/pdf/2612/81523322.pdf

### source: https://data-flair.training/blogs/svm-kernel-functions/

Training the Polynomial Regression model on the whole dataset

In [23]:
from sklearn.svm import SVR
regressor_SVR = SVR(kernel = 'rbf')
regressor_SVR.fit(X_train_SVR, y_train_SVR)

  y = column_or_1d(y, warn=True)


SVR()

### 5.3 Decision Tree Regression approach

### source: https://www.section.io/engineering-education/entropy-information-gain-machine-learning/

### source: https://towardsdatascience.com/entropy-how-decision-trees-make-decisions-2946b9c18c8

### source: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

In [24]:
from sklearn.tree import DecisionTreeRegressor
regressor_Decision_Tree = DecisionTreeRegressor(random_state = None)
regressor_Decision_Tree.fit(X_train, y_train)

DecisionTreeRegressor()

### 5.4 Random Forest Regression approach

Step 1: Pick at random K data points from the Training set.

Step 2: Build the Decision Tree associated to these K data points.

Step 3: Choose the number Ntree of trees you want to build and repeat STEPS 1 & 2

Step 4: For a new data point, make each one of your Ntree trees predict the value of Y for the data point in question, and assign the new data point the average across all of the predicted Y values.

### source: https://towardsdatascience.com/basic-ensemble-learning-random-forest-adaboost-gradient-boosting-step-by-step-explained-95d49d1e2725

### source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [25]:
from sklearn.ensemble import RandomForestRegressor
regressor_random_forest = RandomForestRegressor(n_estimators = 10, random_state = None)
regressor_random_forest.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10)

## 6. Predicting the Test set results

In [26]:
#print options
np.set_printoptions(threshold=np.inf)

### 6.1 Multiple Linear Regression results

In [27]:
y_pred_linear = regressor_linear.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_linear.reshape(len(y_pred_linear),1), y_test.reshape(len(y_test),1)),1))

[[14031.49 16995.  ]
 [22936.9  23995.  ]
 [14211.01 12999.  ]
 [12054.97 11295.  ]
 [22212.88 19400.  ]
 [ 8559.42  7000.  ]
 [12613.55 12000.  ]
 [20915.95 19997.  ]
 [ 9809.92  8790.  ]
 [10690.97 10495.  ]
 [10116.47  9995.  ]
 [18892.91 15490.  ]
 [ 5035.78  5999.  ]
 [22398.76 22075.  ]
 [ 7641.55  7949.  ]
 [ 3894.51  4990.  ]
 [14190.85 13295.  ]
 [ 9968.16 10791.  ]
 [ 9110.59  7999.  ]
 [ 9993.84  9495.  ]
 [ 7914.3   6298.  ]
 [20628.01 22975.  ]
 [10926.26 12500.  ]
 [15596.82 15995.  ]
 [18440.75 17995.  ]
 [ 6935.84  5899.  ]
 [12467.34 12795.  ]
 [13658.8  15369.  ]
 [22672.91 22498.  ]
 [11202.46 10995.  ]
 [ 6821.62  6720.  ]
 [ 9028.52  7990.  ]
 [11261.52 10493.  ]
 [13385.84 12790.  ]
 [ 6649.46  6430.  ]
 [ 8856.73  7758.  ]
 [10175.08  9795.  ]
 [22502.94 20689.  ]
 [16481.56 15450.  ]
 [ 8649.44  8000.  ]
 [ 4972.99  5498.  ]
 [ 9689.04  9995.  ]
 [ 7268.17  6190.  ]
 [ 8182.97  8000.  ]
 [12601.68 12771.  ]
 [ 3635.73  6500.  ]
 [ 5721.87  5850.  ]
 [22370.62 23

### 6.2 Support Vector Regression (SVR) results

In [28]:
y_pred_SVR = sc_y_SVR.inverse_transform(regressor_SVR.predict(sc_X_SVR.transform(X_test)).reshape(-1,1))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_SVR.reshape(len(y_pred_SVR),1), y_test.reshape(len(y_test_SVR),1)),1))

[[14123.6  16995.  ]
 [23691.69 23995.  ]
 [13684.49 12999.  ]
 [12179.81 11295.  ]
 [20897.01 19400.  ]
 [ 8074.09  7000.  ]
 [12541.1  12000.  ]
 [20214.47 19997.  ]
 [ 9443.94  8790.  ]
 [10247.23 10495.  ]
 [ 9787.66  9995.  ]
 [17111.44 15490.  ]
 [ 5887.86  5999.  ]
 [21263.78 22075.  ]
 [ 7564.57  7949.  ]
 [ 5303.03  4990.  ]
 [13670.61 13295.  ]
 [ 9687.25 10791.  ]
 [ 9392.77  7999.  ]
 [ 9417.92  9495.  ]
 [ 7651.91  6298.  ]
 [22069.33 22975.  ]
 [10374.12 12500.  ]
 [15180.94 15995.  ]
 [17967.05 17995.  ]
 [ 7119.27  5899.  ]
 [12088.17 12795.  ]
 [13663.24 15369.  ]
 [23144.25 22498.  ]
 [10181.99 10995.  ]
 [ 6739.78  6720.  ]
 [ 8973.57  7990.  ]
 [10761.12 10493.  ]
 [13355.35 12790.  ]
 [ 6660.86  6430.  ]
 [ 8411.93  7758.  ]
 [ 9559.86  9795.  ]
 [21446.64 20689.  ]
 [16257.1  15450.  ]
 [ 8729.5   8000.  ]
 [ 6080.95  5498.  ]
 [ 9382.89  9995.  ]
 [ 6966.98  6190.  ]
 [ 7505.42  8000.  ]
 [12526.76 12771.  ]
 [ 5217.29  6500.  ]
 [ 6146.22  5850.  ]
 [21665.09 23

### 6.3 Decision Tree Regression results

In [29]:
y_pred_Decision_Tree = regressor_Decision_Tree.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_Decision_Tree.reshape(len(y_pred_Decision_Tree),1), y_test.reshape(len(y_test),1)),1))

[[17589.   16995.  ]
 [23450.   23995.  ]
 [16245.   12999.  ]
 [12995.   11295.  ]
 [21500.   19400.  ]
 [ 7000.    7000.  ]
 [11330.   12000.  ]
 [20997.   19997.  ]
 [ 9950.    8790.  ]
 [10495.   10495.  ]
 [10831.67  9995.  ]
 [14392.   15490.  ]
 [ 5995.    5999.  ]
 [22490.   22075.  ]
 [ 8500.    7949.  ]
 [ 4495.    4990.  ]
 [15495.   13295.  ]
 [ 8998.   10791.  ]
 [ 9000.    7999.  ]
 [ 9695.    9495.  ]
 [ 6800.    6298.  ]
 [21999.   22975.  ]
 [11360.   12500.  ]
 [14498.   15995.  ]
 [20495.   17995.  ]
 [ 7392.    5899.  ]
 [13000.   12795.  ]
 [14695.   15369.  ]
 [22991.   22498.  ]
 [ 9998.   10995.  ]
 [ 6600.    6720.  ]
 [ 9000.    7990.  ]
 [ 9995.   10493.  ]
 [14295.   12790.  ]
 [ 6500.    6430.  ]
 [ 8475.    7758.  ]
 [ 9495.    9795.  ]
 [20450.   20689.  ]
 [15995.   15450.  ]
 [ 8985.    8000.  ]
 [ 6295.    5498.  ]
 [ 7998.    9995.  ]
 [ 8000.    6190.  ]
 [ 7195.    8000.  ]
 [14490.   12771.  ]
 [ 5695.    6500.  ]
 [ 6998.    5850.  ]
 [23000.   23

### 6.4 Random Forest Regression results

In [30]:
y_pred_random_forest = regressor_random_forest.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred_random_forest.reshape(len(y_pred_random_forest),1), y_test.reshape(len(y_test),1)),1))

[[15722.7  16995.  ]
 [23392.8  23995.  ]
 [15046.5  12999.  ]
 [11385.8  11295.  ]
 [21368.6  19400.  ]
 [ 7250.6   7000.  ]
 [11429.6  12000.  ]
 [19686.4  19997.  ]
 [ 9981.5   8790.  ]
 [10485.   10495.  ]
 [10880.18  9995.  ]
 [16526.9  15490.  ]
 [ 6077.    5999.  ]
 [22291.3  22075.  ]
 [ 8498.    7949.  ]
 [ 5313.    4990.  ]
 [14771.   13295.  ]
 [ 8983.6  10791.  ]
 [ 9208.    7999.  ]
 [ 9496.    9495.  ]
 [ 6745.4   6298.  ]
 [21811.   22975.  ]
 [11025.   12500.  ]
 [15496.9  15995.  ]
 [18444.4  17995.  ]
 [ 7044.9   5899.  ]
 [12766.7  12795.  ]
 [14373.5  15369.  ]
 [23087.6  22498.  ]
 [ 9853.2  10995.  ]
 [ 6607.9   6720.  ]
 [ 8922.2   7990.  ]
 [10148.2  10493.  ]
 [13724.2  12790.  ]
 [ 6543.5   6430.  ]
 [ 8266.    7758.  ]
 [ 9505.5   9795.  ]
 [20966.6  20689.  ]
 [15763.7  15450.  ]
 [ 8823.    8000.  ]
 [ 6064.5   5498.  ]
 [ 8450.8   9995.  ]
 [ 7508.6   6190.  ]
 [ 7436.1   8000.  ]
 [12886.5  12771.  ]
 [ 5779.3   6500.  ]
 [ 6872.5   5850.  ]
 [21898.6  23

## 7. Evaluating the Model Performance

### source: https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

R2 score is used to evaluate the performance of a linear regression model. It is the amount of the variation in the output dependent attribute which is predictable from the input independent variable(s) <=> larger -> better

The max_error() function computes the maximum residual error. A metric that captures the worst-case error between the predicted value and the true value. <=> smaller -> better

Mean Absolute Error calculates the average difference between the calculated values and actual values. It is also known as scale-dependent accuracy as it calculates error in observations taken on the same scale <=> smaller -> better

The Mean Squared Error (MSE) or Mean Squared Deviation (MSD) of an estimator measures the average of error squares i.e. the average squared difference between the estimated values and true value. It is a risk function, corresponding to the expected value of the squared error loss. It is always non – negative and values close to zero are better.

In [31]:
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error

### 7.1 Multiple Linear Regression evaluating

In [32]:
print("r2_score: ", r2_score(y_test, y_pred_linear))
print("max_error: ", max_error(y_test, y_pred_linear))
print("mean_absolute_error: ", mae(y_test, y_pred_linear))
print("mean_squared_error: ", mean_squared_error(y_test, y_pred_linear))

r2_score:  0.9313975555109449
max_error:  14013.427593231201
mean_absolute_error:  1073.5765178635143
mean_squared_error:  2647119.35357546


### 7.2 Support Vector Regression (SVR) evaluating

In [33]:
print("r2_score: ", r2_score(y_test_SVR, y_pred_SVR))
print("max_error: ", max_error(y_test_SVR, y_pred_SVR))
print("mean_absolute_error: ", mae(y_test_SVR, y_pred_SVR))
print("mean_squared_error: ", mean_squared_error(y_test_SVR, y_pred_SVR))

r2_score:  0.965860657247365
max_error:  10137.499759674822
mean_absolute_error:  785.4765736558304
mean_squared_error:  1317313.334705793


### 7.3 Decision Tree Regression evaluating

In [34]:
print("r2_score: ", r2_score(y_test, y_pred_Decision_Tree))
print("max_error: ", max_error(y_test, y_pred_Decision_Tree))
print("mean_absolute_error: ", mae(y_test, y_pred_Decision_Tree))
print("mean_squared_error: ", mean_squared_error(y_test, y_pred_Decision_Tree))

r2_score:  0.952015104685084
max_error:  8757.0
mean_absolute_error:  946.3100272007914
mean_squared_error:  1851562.9583384849


### 7.4 Random Forest Regression evaluating

In [35]:
print("r2_score: ", r2_score(y_test, y_pred_random_forest))
print("max_error: ", max_error(y_test, y_pred_random_forest))
print("mean_absolute_error: ", mae(y_test, y_pred_random_forest))
print("mean_squared_error: ", mean_squared_error(y_test, y_pred_random_forest))

r2_score:  0.9660568891880978
max_error:  7751.4000000000015
mean_absolute_error:  807.3378398627007
mean_squared_error:  1309741.456298655


In [36]:
regressor_random_forest.score(X_test,y_test)

0.9660568891880978

## Save the model

In [37]:
# run only 1 time 

# from joblib import dump, load
# dump(regressor_random_forest, 'regressor_random_forest.joblib') 

['regressor_random_forest.joblib']