<h2>Predicting Fuel Efficiency of Vehicles - Part 3 </h2>

<h3>Selecting and Training Models</h3>

1. Select and Train a few Algorithms(Linear Regression, Decision Tree, RandomForest)
2. Evaluation using Mean Squared Error
3. Model Evaluation using Cross Validation
4. Hyperparameter Tuning using GridSearchCV
5. Check Feature Importance
6. Evaluate the Final System on test data
7. Saving the Model

In [3]:
##importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer



import warnings
warnings.filterwarnings('ignore')

In [5]:
# reading the .data file using pandas

cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [6]:
##segregate the feature and target variable
data = strat_train_set.drop("MPG", axis=1)
data_labels = strat_train_set["MPG"].copy()
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [8]:
##preprocess the Origin column in data  
def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})  #renaming 1, 2, 3 for India, USA, Germany
    return df

In [11]:
##creating custom attribute adder class  (It works the same way like part 2)
acc_ix, hpower_ix, cyl_ix = 4,2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        
        return np.c_[X, acc_on_cyl]

In [12]:
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline


def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    #print(list(num_attrs)) I can delete this
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

<h3>From raw data to processed data in 2 steps</h3>

In [13]:
##from raw data to processed data in 2 steps
preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [14]:
prepared_data[0]  #the first rows of the prepared data

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

<h3>Selecting and Training Models</h3>
1. Linear Regression <br>
2. Decision Tree <br>
3. Random Forest <br>
4. SVM regressor <br>

In [16]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)#it gives a linear regression object

LinearRegression()

In [17]:
##testing the predictions
sample_data = data.iloc[:5]  #5 rows for a sample data
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_data_prepared)) 
#if I want to print the name of the labels, activate #print(list(num_attrs)) in num_pipeline_transformer above

Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [18]:
print("Actual Labels of samples: ", list(sample_labels))  #this is how linear regression perform:

Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


<h4>Mean Squared Error</h4>

In [19]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.9590402225760872

<h3>Decision Tree</h3>


In [22]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [23]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

But no model is perfect, this means that our model has overfit the data to a great extent. (the model is highly overfitting the data, it fits well the training (seen) data but it performs poorly on unseen data which is has not seen yet)

We won't be touching out test data until we finalize our model. So, how do we check for what's happening?

Model Evaluation using Cross Validation
Scikit-Learn’s K-fold cross-validation feature randomly splits the training set into K distinct subsets called folds, 
then it trains and evaluates the model K times, picking a different fold for evaluation every time and training on the 
other K-1 folds.

The result is an array containing the K evaluation scores:

In [24]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg,   #this function will give me 10 scores
                         prepared_data, 
                         data_labels, 
                         scoring="neg_mean_squared_error",  #this is an argument (neg_mean_squared_error), it accepts an argument
                         cv = 10) #10 cross validations or 10 folds
tree_reg_rmse_scores = np.sqrt(-scores)

In [26]:
tree_reg_rmse_scores  #10 scores root mean square error values for the 10 folds

array([3.1257999 , 3.23684839, 3.08043219, 3.36841951, 2.28992358,
       2.99134167, 3.09652022, 4.39349661, 4.12928175, 2.71602176])

In [27]:
tree_reg_rmse_scores.mean()  #the mean of the 10 scores Result: 3.2428

3.242808558763806

In [28]:
scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring="neg_mean_squared_error", cv = 10)
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores   #the same for linear regression

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [29]:
lin_reg_rmse_scores.mean() 

3.0757081793709324

<h3>Random Forest model</h3>

In [30]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                         prepared_data,
                                         data_labels,
                                         scoring='neg_mean_squared_error',
                                         cv = 10)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()  
#2.558591, so far random forest es the best performer

2.5585913052822704

<h3>Support Vector Machine Regressor</h3>

In [31]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')  
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels,
                                scoring='neg_mean_squared_error',
                                cv = 10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()  #Resp.: 3.09, rforest still being the best model

3.08659162080283

<h3>Hyperparameter Tuning using GridSearchCV</h3>

In [32]:
from sklearn.model_selection import GridSearchCV  #to fine-tune the hiperparameters of the random f.regressor. This to 
                                                  # improve the performance of r.f.model.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                          )

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [33]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [34]:
cv_scores = grid_search.cv_results_

##printing all the parameters along with their scores
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score), params)
    #Res.: I am going to choose 2.64 (the lowest value, no 2.55 from above. This is the improved r.forest)

3.5715192218398215 {'max_features': 2, 'n_estimators': 3}
3.1733763358529687 {'max_features': 2, 'n_estimators': 10}
2.9582545279429935 {'max_features': 2, 'n_estimators': 30}
3.2884828756823277 {'max_features': 4, 'n_estimators': 3}
2.9157676481198873 {'max_features': 4, 'n_estimators': 10}
2.7541346184672575 {'max_features': 4, 'n_estimators': 30}
3.1558401868513934 {'max_features': 6, 'n_estimators': 3}
2.818444901591381 {'max_features': 6, 'n_estimators': 10}
2.6435815144546435 {'max_features': 6, 'n_estimators': 30}
2.8216031890490223 {'max_features': 8, 'n_estimators': 3}
2.7937427545550846 {'max_features': 8, 'n_estimators': 10}
2.731538156637186 {'max_features': 8, 'n_estimators': 30}
3.473805822892421 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.822809813277452 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.364310680886484 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.928852924891443 {'bootstrap': False, 'max_features': 3, 'n_estim

<h3>Checking Feature importance</h3>

In [35]:
# feature importances 

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances #ampliar

array([0.19300645, 0.16976561, 0.16414755, 0.26108427, 0.01822394,
       0.12227059, 0.02604562, 0.04004234, 0.00202216, 0.00194882,
       0.00144264])

In [36]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.0260456199701643),
 ('acc_on_cyl', 0.04004234272187999),
 ('Weight', 0.2610842693240977),
 ('Model Year', 0.12227059426783451),
 ('Horsepower', 0.16414754588211078),
 ('Displacement', 0.1697656104126077),
 ('Cylinders', 0.19300645425296942),
 ('Acceleration', 0.018223944070466944)]

<h3>Evaluating the entire system on Test Data</h3>

In [37]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("MPG", axis=1)
y_test = strat_test_set["MPG"].copy()

X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [38]:
final_rmse  #Resp.: 3.0797, we can iterate to improve this result

3.079721127346148

<h3>Creating a function to cover this entire flow</h3>

In [44]:
def predict_mpg(config, model):
    
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config          #By default:true
    
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    #print(prepared_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [45]:
##checking it on a random sample
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model) 
#Res. 3 predictions for 3 rows, it should be close to what I have seen in the training data

array([32.56      , 17.64666667, 21.64      ])

<h3>Save the Model</h3>

In [46]:
import pickle

In [47]:
##saving the model
with open("model.bin", 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [48]:
##loading the model from the saved file
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)

predict_mpg(vehicle_config, model)

array([32.56      , 17.64666667, 21.64      ])

In [None]:
#These results are the same as the predictions in 45th line because of the same vehicle configuration

In [58]:
import requests   #install requests packages using cmd, pip install requests

In [None]:
import requests

url = "http://localhost:9696/"  #"http://0.0.0.0:9696/" 
r = requests.post(url, json = vehicle_config)
r.text.strip()