### Import modules

In [10]:
import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline


import featuretools as ft
import featuretools.variable_types as vtypes

import pickle

### Method 1: Catboost encoding plus normalization only

In [3]:
# Read the Kaggle cardiovascular dataset as saved after EDA (identical to the
# data set produced by data wrangling), using the 'id' column as the row index.
full_data = pd.read_csv(
    "cardiovascular_data_after_data_wrangling.csv",
    index_col='id',
)

# Preview the first rows.
full_data.head()

Unnamed: 0_level_0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,50.391781,0.494708,110,80,0.494708,0.494708,0.494708,0.494708,0.494708,0,21.96712
1,55.419178,0.494708,140,90,0.494708,0.247354,0.247354,0.247354,0.247354,1,34.927679
2,51.663014,0.747354,130,70,0.747354,0.498236,0.498236,0.498236,0.494708,1,23.507805
3,48.282192,0.247354,150,100,0.247354,0.623677,0.623677,0.623677,0.498236,1,28.710479
4,47.873973,0.831569,100,60,0.498236,0.698942,0.698942,0.698942,0.747354,0,23.011177


In [4]:
# Separate the 'cardio' target column from the remaining feature columns.
y = full_data['cardio']
X = full_data.drop(columns=['cardio'])

# Report both shapes, one per line.
print(f"X shape is {X.shape}", f"y shape is {y.shape}", sep="\n")

X shape is (68588, 10)
y shape is (68588,)


### Split into train and test sets, then fit the standard scaler on the train data and apply it to the test data to prevent information leakage

In [7]:
# Split into train and test sets.
# The scaler is fitted on the training data only and then applied to the test
# data, so that no test-set statistics leak into the preprocessing.
split_ratio = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=42)

sc = StandardScaler()

# Learn the scaling parameters from the training data and scale it.
X_train = sc.fit_transform(X_train)

# Reuse the parameters learned on the training data for the test data.
X_test = sc.transform(X_test)

# Persist the scaled splits. The `with` block closes the file on exit, so no
# explicit close() call is needed afterwards.
with open('train_test_data_from_preprocessing.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)


### Method 2: fit PCA on the train data, apply it to the test data, and use the principal components as features

In [11]:
# Split into train and test sets.
split_ratio = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=42)

# Build the pipeline: standard scaling followed by PCA.
# n_components='mle' asks PCA to choose the number of components itself.
scaler = StandardScaler()
pca = PCA(n_components='mle')
pipeline = make_pipeline(scaler, pca)

# Fit the pipeline on the training data only, then apply it to the test data.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

print(f"After PCA with mle, X_train shape is {X_train.shape}")
print(f"After PCA with mle, X_test shape is {X_test.shape}")

# Save the result to a pkl file. The `with` block closes the file on exit, so
# no explicit close() call is needed afterwards.
with open('train_test_data_from_preprocessing_plus_PCA.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)


After PCA with mle, X_train shape is (54870, 9)
After PCA with mle, X_test shape is (13718, 9)


### Method 3: Synthesize new features from the original dataset using 'featuretools'
<b>New features are generated on the train data, and the same feature definitions are then applied to the test data</b>

In [5]:
# Load the original data as saved before CatBoost encoding.
full_data = pd.read_csv("all_filtered_data_before_catboost.csv", index_col='id')

X = full_data.drop(columns=['cardio'])
y = full_data.cardio

print(f"X shape is {X.shape}")
print(f"y shape is {y.shape}")

# Train/test split.
split_ratio = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=42)

# Build the EntitySet on the training data.
# The index is replaced by 0..n-1 and exposed as an explicit 'id' column, which
# the EntitySet uses as the dataframe index. Rebinding to the frame returned by
# reset_index (instead of mutating the split output in place) keeps this cell
# re-runnable and avoids a possible SettingWithCopyWarning on the assignment.
X_train = X_train.reset_index(drop=True)
X_train['id'] = X_train.index

es = ft.EntitySet(id="cardio_data")
es.add_dataframe(dataframe_name='X_train', dataframe=X_train, index='id')

es

Entityset: cardio_data
  DataFrames:
    X_train [Rows: 54870, Columns: 11]
  Relationships:
    No relationships

In [6]:
# Display the Woodwork schema of the 'X_train' dataframe held in the EntitySet.
es["X_train"].ww.schema

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
age,Double,['numeric']
gender,Categorical,['category']
ap_hi,Integer,['numeric']
ap_lo,Integer,['numeric']
cholesterol,Categorical,['category']
gluc,Categorical,['category']
smoke,Categorical,['category']
alco,Categorical,['category']
active,Categorical,['category']
BMI,Double,['numeric']


In [7]:
# Add new dataframes to the EntitySet to create new features: each listed
# column of X_train is split out into its own dataframe keyed by that column.
# Each pair is (new dataframe name, X_train column used as its index); the
# order is the order in which the dataframes are created.
normalization_plan = [
    ("phystical", "gender"),
    ("wine", "alco"),
    ("smoke", "smoke"),
    ("active", "active"),
    ("gluc", "gluc"),
    ("cholesterol", "cholesterol"),
]

for new_name, index_column in normalization_plan:
    es = es.normalize_dataframe(
        base_dataframe_name="X_train",
        new_dataframe_name=new_name,
        index=index_column,
    )

es

Entityset: cardio_data
  DataFrames:
    X_train [Rows: 54870, Columns: 11]
    phystical [Rows: 2, Columns: 1]
    wine [Rows: 2, Columns: 1]
    smoke [Rows: 2, Columns: 1]
    active [Rows: 2, Columns: 1]
    gluc [Rows: 3, Columns: 1]
    cholesterol [Rows: 3, Columns: 1]
  Relationships:
    X_train.gender -> phystical.gender
    X_train.alco -> wine.alco
    X_train.smoke -> smoke.smoke
    X_train.active -> active.active
    X_train.gluc -> gluc.gluc
    X_train.cholesterol -> cholesterol.cholesterol

In [8]:
# Generate new features on the train data with Deep Feature Synthesis.
# The 'id' column of X_train is excluded from feature generation.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name='X_train',
    ignore_columns={'X_train': ['id']},
    max_depth=3,
    n_jobs=-1,
    verbose=3,
)

Built 214 features
EntitySet scattered to 8 workers in 2 seconds                                   
Elapsed: 00:02 | Progress: 100%|████████████████████████████████████████████████


In [9]:
# Encode the generated features; no extra "unknown" column is added
# (include_unknown=False). Returns the encoded matrix and the matching
# encoded feature definitions.
feature_matrix_enc, features_enc = ft.encode_features(
    feature_matrix=feature_matrix,
    features=feature_names,
    include_unknown=False,
)

# Work on an independent copy as the new training features.
X_train = feature_matrix_enc.copy()

In [10]:
# Check the new train features and labels.

print(X_train.head())
# Overwrite the label index with the feature-matrix index. This is a positional
# relabel, not a join.
# NOTE(review): assumes the rows of X_train are still in the same order as
# y_train — confirm.
y_train.index = X_train.index
print(y_train)

          age  ap_hi  ap_lo        BMI  phystical.MAX(X_train.BMI)  \
id                                                                   
0   51.783562    120     80  33.331832                   68.308315   
1   49.391781    117     73  22.790329                   68.308315   
2   60.093151    140     80  37.015532                   68.308315   
3   60.087671    100     70  25.661152                   65.381084   
4   60.205479    110     70  25.209201                   65.381084   

    phystical.MAX(X_train.age)  phystical.MAX(X_train.ap_hi)  \
id                                                             
0                    64.934247                         240.0   
1                    64.934247                         240.0   
2                    64.934247                         240.0   
3                    64.967123                         240.0   
4                    64.967123                         240.0   

    phystical.MAX(X_train.ap_lo)  phystical.MEAN(X_train.BMI

In [13]:
# Fill NAs in each feature with that feature's median value.
X_train = X_train.fillna(X_train.median())

In [15]:
# Repeat the same exercise to prepare the test dataset.

# Give the test frame the same 0..n-1 'id' column that was created for the
# training frame. Without it, registering the frame with index='id' relies on
# state left behind by a cell that is not part of this notebook.
X_test = X_test.reset_index(drop=True)
X_test['id'] = X_test.index

# Create the EntitySet for the test data.
es_tst = ft.EntitySet(id='cardio_data')

# NOTE: the dataframe is registered under the name 'X_train' even though it
# holds the test rows — presumably so that the feature definitions built on the
# training EntitySet, which refer to a dataframe called 'X_train', resolve
# against this EntitySet too. Verify before renaming.
es_tst.add_dataframe(dataframe_name='X_train', dataframe=X_test, index='id')

# Split out the same columns, under the same dataframe names and in the same
# order, as was done for the training EntitySet.
# Each pair is (new dataframe name, column used as its index).
normalization_plan_tst = [
    ("phystical", "gender"),
    ("wine", "alco"),
    ("smoke", "smoke"),
    ("active", "active"),
    ("gluc", "gluc"),
    ("cholesterol", "cholesterol"),
]

for new_name, index_column in normalization_plan_tst:
    es_tst = es_tst.normalize_dataframe(
        base_dataframe_name="X_train",
        new_dataframe_name=new_name,
        index=index_column,
    )

es_tst



Entityset: cardio_data
  DataFrames:
    X_train [Rows: 13718, Columns: 11]
    phystical [Rows: 2, Columns: 1]
    wine [Rows: 2, Columns: 1]
    smoke [Rows: 2, Columns: 1]
    active [Rows: 2, Columns: 1]
    gluc [Rows: 3, Columns: 1]
    cholesterol [Rows: 3, Columns: 1]
  Relationships:
    X_train.gender -> phystical.gender
    X_train.alco -> wine.alco
    X_train.smoke -> smoke.smoke
    X_train.active -> active.active
    X_train.gluc -> gluc.gluc
    X_train.cholesterol -> cholesterol.cholesterol

In [16]:
# Compute the encoded feature definitions (features_enc) against the test
# EntitySet to obtain the test feature matrix.
feature_matrix_tst = ft.calculate_feature_matrix(
    entityset=es_tst,
    features=features_enc,
)

In [17]:
# Fill NAs in the test features with the medians of the *training* features,
# matched by column name. Imputation statistics are taken from the training
# data, in line with how the scaler and PCA are fitted on train only, so the
# test data is transformed with exactly the values a model was trained against.
feature_matrix_tst = feature_matrix_tst.fillna(X_train.median())

In [18]:
# From here on X_test is an independent copy of the test feature matrix.
X_test = feature_matrix_tst.copy()

In [19]:
# Check the new test features and labels.
print(X_test.head())
# Overwrite the label index with the feature-matrix index values. This is a
# positional relabel, not a join.
# NOTE(review): assumes the rows of X_test are still in the same order as
# y_test — confirm.
y_test.index = X_test.index.tolist()
print(y_test.head())

          age  gender = F  gender = M  ap_hi  ap_lo  cholesterol = normal  \
id                                                                          
0   49.904110       False        True    120     80                 False   
1   54.210959        True       False    120     80                  True   
2   50.227397        True       False    120     80                  True   
3   50.000000        True       False    110     70                  True   
4   55.400000       False        True    160    100                  True   

    cholesterol = above normal  cholesterol = well above normal  \
id                                                                
0                        False                             True   
1                        False                            False   
2                        False                            False   
3                        False                            False   
4                        False                            

In [20]:
# Select the columns present in both the new train and test sets as the common
# features. The result is a list in X_train's column order rather than a set:
# the order is then the same on every run, and the list can be used directly as
# a DataFrame column indexer (recent pandas versions reject set indexers).
shared_columns = [col for col in X_train.columns if col in X_test.columns]

In [21]:
# Restrict both matrices to the shared features, in one common column order.
# The selection is materialised as a list in X_train's column order because
# pandas does not accept a set as a column indexer, and a fixed order keeps the
# saved data identical from run to run.
common_columns = [col for col in X_train.columns if col in shared_columns]
X_test = X_test[common_columns]
X_train = X_train[common_columns]

In [22]:
# Report the final shapes of the train and test feature matrices after
# restricting both to the shared columns.
print(f"X_train final shape is {X_train.shape}")
print(f"X_test final shape is {X_test.shape}")

X_train final shape is (54870, 226)
X_test final shape is (13718, 226)


In [13]:
# Save the new train and test data to a pkl file. The `with` block closes the
# file on exit, so no explicit close() call is needed afterwards.
with open('train_test_data_from_featuretools.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)



