# Feature Engineering

### 1. Importing Necessary Libraries

In [24]:
# importing the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns",None)
import os

import scipy.stats as stats
import pylab

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib


In [2]:
# Loading the data set
df = pd.read_csv("C:\\Users\\yozil\\Desktop\\My projects\\10. End_to_End_Heart_Attack_Risk_Prediction\data\\raw data\\heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# shape of the dataset
df.shape

(303, 14)

## 2. Train Test Split 

In [4]:
target = "output"

In [5]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(df.drop(target, axis = 1),
                                                    df[target],
                                                    test_size = 0.1,
                                                    random_state = 42)

In [6]:
# shapes of each split
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((272, 13), (272,), (31, 13), (31,))

## 3. Feature Engineering
### 3.1 Numerical Variables
#### 3.1.1 Gaussian Transforamtion
Here we apply
1. reciprocal transformation on trtbps variable
2. yeo-johnson transformation on chol variable


In [7]:
# let's apply a reciprocal transformation on the train and test sets trtbps variable
x_train["trtbps"] = 1/x_train["trtbps"]
x_test["trtbps"] = 1/x_test["trtbps"]

In [8]:
# let's apply a yeo johnson transformation on the train and test sets of the chol variable.
x_train["chol"], param = stats.yeojohnson(x_train["chol"])
x_train.head()
print(param)

-0.28607589196926814


In [10]:
# now let's apply the above transformation on the test set
x_test["chol"] = stats.yeojohnson(x_test["chol"], lmbda = param)

In [12]:
# let's confirm there are no missing values in the above two columns
df[["trtbps","chol"]].isnull().sum()

trtbps    0
chol      0
dtype: int64

#### 3.1.2 Feature Scaling

In [14]:
# now let's apply feature scalling 
# first initialize the scaler
scaler = StandardScaler()

In [15]:
# now let's fit our scaler with the training data
scaler.fit(x_train)

In [17]:
# now let's transform both x_train and x_test
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test), columns = x_test.columns)


In [18]:
# let's visualize the training set
x_train.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,-5.889159,0.01204,-1.892439,-678.802971,-85.427619,-1.587641,-2.971017,-6.854053,2.399345,-0.257607,-0.744638,-0.462742,-1.927131
1,-5.877109,0.01204,-1.892439,599.625707,-54.135478,-1.587641,0.629003,-6.934519,2.399345,-0.333054,-3.463139,0.465772,-1.927131
2,-6.081967,0.01204,-1.892439,1976.395053,-84.814153,-1.587641,-2.971017,-6.833421,2.399345,0.5723,-3.463139,-1.391256,-4.592261
3,-5.828907,-4.584667,0.99279,-1190.174443,-74.378031,-1.587641,0.629003,-6.903571,-2.169825,0.270515,-6.18164,-1.391256,-4.592261
4,-5.865058,0.01204,-1.892439,-88.758966,-60.097026,-1.587641,-2.971017,-6.835484,-2.169825,-0.634838,-3.463139,-0.462742,-1.927131


In [20]:
x_test.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,0.288158,0.685763,-0.955447,-1.140737,0.698476,-0.415227,-1.025418,-1.74126,1.445998,-0.383843,-0.684974,0.255069,-2.136684
1,0.507707,0.685763,1.986608,-1.953507,0.903244,-0.415227,-1.025418,0.393608,-0.691564,-0.731282,-0.684974,-0.708525,1.128361
2,0.288158,0.685763,1.005923,-1.140737,-3.545544,2.408319,0.871954,1.029526,-0.691564,-0.731282,0.963813,0.255069,1.128361
3,0.178384,-1.458231,-0.955447,-0.315836,2.500423,-0.415227,-1.025418,-0.015197,1.445998,0.745333,-0.684974,1.218663,1.128361
4,1.825003,-1.458231,1.005923,1.371461,0.50049,2.408319,-1.025418,-0.923651,-0.691564,-0.905002,0.963813,0.255069,-0.504161


## 4. Saving the Datasets

In [23]:
# let's save the training and test sets for bothe the feature and target variables
x_train.to_csv("C:\\Users\\yozil\\Desktop\\My projects\\10. End_to_End_Heart_Attack_Risk_Prediction\\data\\processed data\\x_train.csv")
x_test.to_csv("C:\\Users\\yozil\\Desktop\\My projects\\10. End_to_End_Heart_Attack_Risk_Prediction\\data\\processed data\\x_test.csv")
y_train.to_csv("C:\\Users\\yozil\\Desktop\\My projects\\10. End_to_End_Heart_Attack_Risk_Prediction\\data\\processed data\\y_train.csv")
y_test.to_csv("C:\\Users\\yozil\\Desktop\\My projects\\10. End_to_End_Heart_Attack_Risk_Prediction\\data\\processed data\\y_test.csv")

## 5. Saving the Scaler

In [25]:
# saving the scaler
joblib.dump(scaler,"C:\\Users\\yozil\\Desktop\\My projects\\10. End_to_End_Heart_Attack_Risk_Prediction\\models\\scaler.joblib" )

['C:\\Users\\yozil\\Desktop\\My projects\\10. End_to_End_Heart_Attack_Risk_Prediction\\models\\scaler.joblib']