## In this notebook we shall perform various feature engineering techniques and create models:

In [1]:
# importing the required libraries:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, LabelEncoder, OrdinalEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR


In [3]:
# importing the data onto which we shall perform feature engineering:

data = pd.read_csv('fedata.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,0,19,female,27.9,0,yes,southwest,16884.92
1,1,18,male,33.8,1,no,southeast,1725.55
2,2,28,male,33.0,3,no,southeast,4449.46
3,3,33,male,22.7,0,no,northwest,21984.47
4,4,32,male,28.9,0,no,northwest,3866.86


In [4]:
# dropping the unwanted columns before we proceed further:

data = data.drop(['Unnamed: 0'], axis=1)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [5]:
# checking for null values and duplicates before proceeding.

data.isnull().sum()

# there are not null values

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [6]:
# checking duplicates
data.duplicated().sum()

# there are no duplicates

0

## Saperating X and y.

In [7]:
# Now we shall saperate the data as X and y (ie dependent and independent variables).
# Then we shall perform train test split on the data.

X = data.drop(['expenses'], axis=1)
y = data['expenses']

In [8]:
# checking the shape of the X and y data to ensure the right number of rows:

X.shape

(1337, 6)

In [9]:
y.shape

# we can observe that, there are 1337 rows in both X and y.

(1337,)

## Splitting the data as train and test.

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# checking the rows and columns for the split data.

print('Shape of X_train is:', X_train.shape)
print('Shape of y_train is:', y_train.shape)
print('Shape of X_test is:', X_test.shape)
print('Shape of y_test is:', y_test.shape)

# The columns and rows for the train and test sets are matching

Shape of X_train is: (1069, 6)
Shape of y_train is: (1069,)
Shape of X_test is: (268, 6)
Shape of y_test is: (268,)


## Feature Encoding and Scaling:

In [12]:
all_columns = list(X)
numeric_columns = ['age','bmi', 'children']
categorical_columns = [x for x in all_columns if x not in numeric_columns]

(numeric_columns), (categorical_columns)

(['age', 'bmi', 'children'], ['sex', 'smoker', 'region'])

In [13]:
# Here we shall transform the numeric and categorical features:

# Scaling numeric data using standard scaler
numeric_features_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Tending to categorical variables using OneHotEncoder
categorical_features_pipeline = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder()),
]
)

In [14]:
preprocessor = ColumnTransformer(
    [
        ("Numeric Pipeline",numeric_features_pipeline, numeric_columns),
        ("Categorical Features Pipeline",categorical_features_pipeline, categorical_columns)
]
)

In [15]:
# Fitting and transforming the preprocessor to our data:

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

## Model Selection:

- In this section, we shall consider various regression models with their default parameters.
- Then, we will select the top 3 models based on performance and perform hyperparameter tuning.

In [16]:
# creating a function to calculate the metrics:

def evaluate_regression(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse,r2_square

In [17]:
# creating a list of models:

models = {
    "Linear Regression": LinearRegression(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, max_depth=5),
    "AdaBoost Regressor": AdaBoostRegressor(), 
    "SVR": SVR()
}

In [18]:
# creating a function to evaluate the models and return a report:


def evaluate_models(X_train,X_test,y_train,y_test, models):
    '''

    '''
    
    models_list = []
    r2_list = []

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train,y_train) # Train Model

        # predictions on train and test set:
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # evaluate train and test models:
        # train
        model_train_mae, model_train_rmse, model_train_r2 = evaluate_regression(y_train, y_train_pred)

        #test
        model_test_mae, model_test_rmse, model_test_r2 = evaluate_regression(y_test, y_test_pred)

        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
        print("- R2 Score: {:.4f}".format(model_train_r2))

        print('----------------------------------')

        print('Model performance for Test set')
        print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
        print("- R2 Score: {:.4f}".format(model_test_r2))
        r2_list.append(model_test_r2)

        print('='*35)
        print('\n')

    report=pd.DataFrame(list(zip(models_list, r2_list)), columns=['Model Name', 'r2_score']).sort_values(by=['r2_score'], ascending=False)

    return report



In [19]:
base_report = evaluate_models(X_train, X_test, y_train, y_test, models)

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 6080.7835
- Mean Absolute Error: 4181.8256
- R2 Score: 0.7299
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5956.6326
- Mean Absolute Error: 4177.2676
- R2 Score: 0.8069


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 4831.8432
- Mean Absolute Error: 2914.4757
- R2 Score: 0.8295
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6337.6034
- Mean Absolute Error: 3806.2637
- R2 Score: 0.7814


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6351.9729
- Mean Absolute Error: 2911.9573
- R2 Score: 0.7804


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 1872.0907
- Mean Absolu

In [20]:
base_report

Unnamed: 0,Model Name,r2_score
5,CatBoosting Regressor,0.893173
3,Random Forest Regressor,0.878974
6,AdaBoost Regressor,0.865831
4,XGBRegressor,0.858965
0,Linear Regression,0.80691
1,K-Neighbors Regressor,0.781421
2,Decision Tree,0.780429
7,SVR,-0.132424


- From the base report and model performance report, we can observe that CatBoost, RandomForest and XGBoost are the best performing models.

- We shall perform, hyperparameter tuning for the above 3 models.

## Hyperparameter Tuning:

In [29]:
#Initialize Hyperparams:
rf_params = {"max_depth":[5,8,10,15],
             "max_features":[2,3,4,5,"auto"],
             "min_samples_split":[2,8,15,20],
             "n_estimators":[100,200,500]

}

xgboost_params = {"learning_rate":[0.1,0.01],
                  "max_depth":[5,8,12,20,30],
                  "n_estimators":[100,200,300],
                  "colsample_bytree":[0.5,0.8,1,0.3,0.4]

}

cat_params = {"learning_rate": [0.1, 0.01, 0.6, 0.5],
              "max_depth": [4, 5, 6, 8, 12]

}

In [31]:
# Creating Models list for Hyperparameter tuning:
randomcv_models = [("RF", RandomForestRegressor(), rf_params),
                   ('XGBoost', XGBRegressor(), xgboost_params),
                   ('CatBoost', CatBoostRegressor(verbose=False), cat_params)
                   ]

In [32]:
# Fitting the model and extracting best params:

# importing the required library
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

print()



Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=200; total time=   0.6s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=200; total time=   0.5s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=200; total time=   0.5s
[CV] END max_depth=5, max_features=auto, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, max_features=auto, min_samples_split=15, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, max_features=auto, min_samples_split=15, n_estimators=100; total time=   0.2s
[CV] END max_depth=15, max_features=2, min_samples_split=20, n_estimators=500; total time=   1.0s
[CV] END max_depth=15, max_features=2, min_samples_split=20, n_estimators=500; total time=   1.9s
[CV] END max_depth=15, max_features=2, min_samples_split=20, n_estimators=500; total time=   1.1s
[CV] END max_depth=10, max_features=5, min_samples_spli



Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END .....................learning_rate=0.1, max_depth=4; total time=   2.0s
[CV] END .....................learning_rate=0.1, max_depth=4; total time=   1.7s
[CV] END .....................learning_rate=0.1, max_depth=4; total time=   1.7s
[CV] END .....................learning_rate=0.1, max_depth=5; total time=   2.1s
[CV] END .....................learning_rate=0.1, max_depth=5; total time=   2.2s
[CV] END .....................learning_rate=0.1, max_depth=5; total time=   2.1s
[CV] END .....................learning_rate=0.1, max_depth=6; total time=   2.8s
[CV] END .....................learning_rate=0.1, max_depth=6; total time=   2.9s
[CV] END .....................learning_rate=0.1, max_depth=6; total time=   2.8s
[CV] END .....................learning_rate=0.1, max_depth=8; total time=   6.1s
[CV] END .....................learning_rate=0.1, max_depth=8; total time=   6.1s
[CV] END .....................learning_rate=0.1,

In [33]:
print()

#printing best params
for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])


---------------- Best Params for RF -------------------
{'n_estimators': 100, 'min_samples_split': 15, 'max_features': 'auto', 'max_depth': 5}
---------------- Best Params for XGBoost -------------------
{'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 1}
---------------- Best Params for CatBoost -------------------
{'max_depth': 4, 'learning_rate': 0.01}


In [34]:
# reInitialize models with best parameters
models = {
    "Random Forest Regressor": RandomForestRegressor(**model_param['RF']),
    "XGBRegressor": XGBRegressor(**model_param['XGBoost']), 
    "CatBoosting Regressor": CatBoostRegressor(**model_param['CatBoost'],verbose=False)
}

In [35]:
retrained_report = evaluate_models(X_train, X_test, y_train, y_test, models)

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 4112.7086
- Mean Absolute Error: 2271.9461
- R2 Score: 0.8765
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4273.5135
- Mean Absolute Error: 2464.6254
- R2 Score: 0.9006


XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 3997.7700
- Mean Absolute Error: 2021.0685
- R2 Score: 0.8833
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4479.1105
- Mean Absolute Error: 2270.7348
- R2 Score: 0.8908


CatBoosting Regressor
Model performance for Training set
- Root Mean Squared Error: 4175.6719
- Mean Absolute Error: 2290.0451
- R2 Score: 0.8726
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4249.0184
- Mean Absolute Error: 2427.3099
- R2 Score: 0.9017




In [36]:
retrained_report

Unnamed: 0,Model Name,r2_score
2,CatBoosting Regressor,0.901749
0,Random Forest Regressor,0.900613
1,XGBRegressor,0.89082


## We get the Best Model as CatBoost Regressor with 90.17% Score.