In [116]:
#Importing necessary libraries

from sklearn.ensemble         import BaggingRegressor
from sklearn.metrics          import mean_squared_error
from sklearn.model_selection  import train_test_split
from sklearn.model_selection  import GridSearchCV
from math                     import sqrt

import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt

In [117]:
#Reading Dataset

lv_cars_df = pd.read_csv('/Users/ujjwalrohit/Dataset for machine Learning/auto-mpg.csv.xls')


In [118]:
#Displaying the characteristics of the car data set

print("Dimension of the data set",lv_cars_df.shape)
print("Names of the variables in the data set are:\n",lv_cars_df.columns)
print("The information of the data set is:\n",lv_cars_df.info())

Dimension of the data set (398, 9)
Names of the variables in the data set are:
 Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB
The information of the data set is:
 None


In [119]:
#Drop the car name , model yearin and Originthe dataset


lv_cars_df = lv_cars_df.drop(['model year','car name','origin','horsepower'],axis = 1)


print(lv_cars_df.head())

    mpg  cylinders  displacement  weight  acceleration
0  18.0          8         307.0    3504          12.0
1  15.0          8         350.0    3693          11.5
2  18.0          8         318.0    3436          11.0
3  16.0          8         304.0    3433          12.0
4  17.0          8         302.0    3449          10.5


In [120]:
#Check null values and N/A in the data set

print("The Null values in the data set are:\n",lv_cars_df.isnull().sum())
print("The NA values in the data set are:\n",lv_cars_df.isna().sum())

The Null values in the data set are:
 mpg             0
cylinders       0
displacement    0
weight          0
acceleration    0
dtype: int64
The NA values in the data set are:
 mpg             0
cylinders       0
displacement    0
weight          0
acceleration    0
dtype: int64


In [121]:
#Using a random seed function for generating the same data set

np.random.seed(3000)

lv_training,lv_test = train_test_split(lv_cars_df,test_size = 0.3,random_state = 0)

lv_x_train = lv_training.drop(['mpg'],axis = 1)
lv_y_train = lv_training['mpg']

lv_x_test = lv_test.drop(['mpg'],axis = 1)
lv_y_test = lv_test['mpg']

In [122]:

print(lv_x_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 278 entries, 230 to 172
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cylinders     278 non-null    int64  
 1   displacement  278 non-null    float64
 2   weight        278 non-null    int64  
 3   acceleration  278 non-null    float64
dtypes: float64(2), int64(2)
memory usage: 10.9 KB
None


In [123]:
#Creating a bagging model

print("------------------------BAGGING MODEL-------------------")

lv_car_bag = BaggingRegressor(random_state=0)
lv_car_bag.fit(lv_x_train,lv_y_train)

------------------------BAGGING MODEL-------------------


BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=0, verbose=0,
                 warm_start=False)

In [124]:
#Calculate RMSE for the model

print('RMSE value for bagging model :',sqrt(mean_squared_error(lv_y_test,lv_car_bag.predict(lv_x_test))
                                           ))

RMSE value for bagging model : 4.347631179734545


In [125]:
# Creating a new bagging model with best parameters

print("---------------BAGGING MODEL WITH BEST PARAMETERS----------------")

lv_grid = {'n_estimators' : [10,20,30],
          'max_samples' : [0.5,0.8,1.0],
          'max_features' : [0.5,0.7,1.0]}

lv_car_bag_grid = BaggingRegressor()

lv_car_bag_cv = GridSearchCV(estimator=lv_car_bag_grid,param_grid=lv_grid,cv=5)
lv_car_bag_cv.fit(lv_x_train,lv_y_train)

print("Best Parameters:\n",lv_car_bag_cv.best_params_)

---------------BAGGING MODEL WITH BEST PARAMETERS----------------
Best Parameters:
 {'max_features': 0.7, 'max_samples': 0.8, 'n_estimators': 20}


In [126]:
#Creating the model with best score

lv_car_best = BaggingRegressor(n_estimators= lv_car_bag_cv.best_params_['n_estimators'],
                              max_samples=lv_car_bag_cv.best_params_['max_samples'],
                              max_features=lv_car_bag_cv.best_params_['max_features'])

In [127]:
#Evaluating the model considering best parameters

lv_car_best.fit(lv_x_train,lv_y_train)

print("Accuracy on training set with best parameters:",lv_car_best.score(lv_x_train,
                                                        lv_y_train))

print("Accuracy on test data set with best parameters:",lv_car_best.score(lv_x_test,
                                                                         lv_y_test))

Accuracy on training set with best parameters: 0.9072093705983153
Accuracy on test data set with best parameters: 0.7593011078871925


In [128]:
#Calculate RMSE for the new model

print("RMSE value for new bagging model:",sqrt(mean_squared_error(lv_y_test,
                                                lv_car_best.predict(lv_x_test))))

RMSE value for new bagging model: 3.921876126945267
