## In this notebook we apply several feature engineering techniques and build models.

In [1]:
# importing the required libraries:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, LabelEncoder, OrdinalEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR


In [3]:
# Load the dataset on which feature engineering will be performed.
# NOTE(review): the preview shows an 'Unnamed: 0' column whose values match the row
# index - presumably the index was written out when this CSV was saved; confirm.

data = pd.read_csv('fedata.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,0,19,female,27.9,0,yes,southwest,16884.92
1,1,18,male,33.8,1,no,southeast,1725.55
2,2,28,male,33.0,3,no,southeast,4449.46
3,3,33,male,22.7,0,no,northwest,21984.47
4,4,32,male,28.9,0,no,northwest,3866.86


In [4]:
# Drop the redundant 'Unnamed: 0' column (it mirrors the row index in the preview)
# before going further.
# errors='ignore' keeps this cell safe to re-run: `data` is reassigned under the same
# name, so on a second run the column is already gone and a plain drop would raise
# KeyError.

data = data.drop(columns=['Unnamed: 0'], errors='ignore')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [5]:
# Count the missing values in each column before proceeding.

data.isna().sum()

# Every column reports 0, so there are no null values.

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [7]:
# Count the rows that are exact duplicates of an earlier row.
data.duplicated().sum()

# The count is 0, so there are no duplicates.

0

## Separating X and y.

In [8]:
# Separate the data into X (the independent variables) and y (the dependent variable,
# 'expenses'). The train/test split is performed on these afterwards.

X = data.drop(columns=['expenses'])
y = data.loc[:, 'expenses']

In [9]:
# Check the shape of X to confirm the number of rows and feature columns.

X.shape

(1337, 6)

In [10]:
y.shape

# X and y both have 1337 rows, so the row counts match.

(1337,)

## Splitting the data as train and test.

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Report the dimensions of every split.

for split_name, split in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'Shape of {split_name} is:', split.shape)

# Row counts agree within each X/y pair, and both X splits have the same columns.

Shape of X_train is: (1069, 6)
Shape of y_train is: (1069,)
Shape of X_test is: (268, 6)
Shape of y_test is: (268,)


## Feature Encoding and Scaling:

In [15]:
# Numeric features are listed explicitly; every remaining column of X is treated
# as categorical.
all_columns = X.columns.tolist()
numeric_columns = ['age', 'bmi', 'children']
categorical_columns = [column for column in all_columns if column not in numeric_columns]

numeric_columns, categorical_columns

(['age', 'bmi', 'children'], ['sex', 'smoker', 'region'])

In [16]:
# Per-type preprocessing pipelines for the numeric and categorical features.

# Numeric features: scale with StandardScaler.
numeric_features_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical features: encode with OneHotEncoder.
categorical_features_pipeline = Pipeline(steps=[('one_hot_encoder', OneHotEncoder())])

In [17]:
# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("Numeric Pipeline", numeric_features_pipeline, numeric_columns),
        ("Categorical Features Pipeline", categorical_features_pipeline, categorical_columns),
    ]
)

In [18]:
# Fit the preprocessor on the training split, then apply the fitted transformer to
# both splits. The test split is only transformed, never used for fitting.
#
# X_train / X_test are overwritten with the transformed output, which is no longer a
# DataFrame. The preprocessor selects columns by name, so feeding that output back in
# on a re-run of this cell would fail. To keep the cell re-runnable, the untransformed
# frames are stashed the first time through (and again whenever the split cell has
# been re-run), and the fit always starts from those raw frames.
if isinstance(X_train, pd.DataFrame):
    X_train_raw, X_test_raw = X_train, X_test

X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)