In [73]:
import pandas as pd
import numpy as np
import re

from optbinning import OptimalBinning

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [74]:
df_train = pd.read_csv("./data/train.csv")
df_train.shape

(891, 12)

In [75]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## **Feature Engineering**

### Binning Feature Age

In [76]:
x = df_train['Age'].values
y = df_train['Survived']

In [77]:
optb = OptimalBinning(name='Age', dtype="numerical", solver="cp")

optb.fit(x, y)
optb.status

'OPTIMAL'

In [78]:
binning_table = optb.binning_table
binning_table.build()

Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 6.50)",47,0.05275,14,33,0.702128,-1.330738,0.09447,0.011008
1,"[6.50, 17.50)",66,0.074074,38,28,0.424242,-0.167906,0.002125,0.000265
2,"[17.50, 21.50)",91,0.102132,65,26,0.285714,0.443003,0.018772,0.002327
3,"[21.50, 26.50)",115,0.129068,72,43,0.373913,0.042178,0.000228,2.9e-05
4,"[26.50, 30.75)",92,0.103255,56,36,0.391304,-0.031455,0.000103,1.3e-05
5,"[30.75, 47.50)",214,0.24018,127,87,0.406542,-0.095009,0.002191,0.000274
6,"[47.50, inf)",89,0.099888,52,37,0.41573,-0.132962,0.001791,0.000224
7,Special,0,0.0,0,0,0.0,0.0,0.0,0.0
8,Missing,177,0.198653,125,52,0.293785,0.403782,0.030542,0.003792
Totals,,891,1.0,549,342,0.383838,,0.15022,0.017931


In [79]:
def age_binning(data):
    if data < 0 and data > 6.5:
        return 'toddler'
    elif data > 6.5 and data < 17.5:
        return 'children'
    elif data > 17.5 and data < 21.5:
        return 'teenager'
    elif data > 21.5 and data < 26.5:
        return 'young adult'
    elif data > 26.5 and data < 30.75:
        return 'middle adult'
    elif data > 30.75 and data < 47.5:
        return 'old adult'
    else :
        return 'retirement'

In [80]:
df_train['Age Class'] = df_train['Age'].apply(age_binning)

### Binning Feature Fare

In [81]:
x = df_train['Fare'].values
y = df_train['Survived']

In [82]:
optb = OptimalBinning(name='Fare', dtype="numerical", solver="cp", max_n_bins = 3)

optb.fit(x, y)
optb.status

'OPTIMAL'

In [83]:
binning_table = optb.binning_table
binning_table.build()

Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 10.48)",339,0.380471,272,67,0.19764,0.927822,0.27792,0.033545
1,"[10.48, 74.38)",455,0.510662,254,201,0.441758,-0.239258,0.029922,0.003731
2,"[74.38, inf)",97,0.108866,23,74,0.762887,-1.641859,0.286471,0.032261
3,Special,0,0.0,0,0,0.0,0.0,0.0,0.0
4,Missing,0,0.0,0,0,0.0,0.0,0.0,0.0
Totals,,891,1.0,549,342,0.383838,,0.594313,0.069538


In [84]:
def fare_binning(data):
    if data < 10.48:
        return 'cheap'
    elif data > 10.48 and data < 74.38:
        return 'medium'
    else :
        return 'expensive'

In [85]:
df_train['Fare Class'] = df_train['Fare'].apply(fare_binning)
df_train['Fare Class'].head()

0     cheap
1    medium
2     cheap
3    medium
4     cheap
Name: Fare Class, dtype: object

In [86]:
df_train['Fare Class'].value_counts()

Fare Class
medium       455
cheap        339
expensive     97
Name: count, dtype: int64

### Create Feature Title

In [87]:
def create_title(data):
    if data in ['Mr', 'Miss', 'Mrs', 'Master']:
        return data
    else:
        return 'Other'

In [88]:
df_train['Title'] = df_train['Name'].apply(lambda x : x.split(", ")[1].split(".")[0])
df_train['Title'] = df_train['Title'].apply(create_title)
df_train['Title'].head()

0      Mr
1     Mrs
2    Miss
3     Mrs
4      Mr
Name: Title, dtype: object

### Create Feature Is Alone

Passanger alone if SibSp and Parch = 0

In [89]:
df_train['Is Alone'] = (df_train['SibSp'] == 0) & (df_train['Parch'] == 0)
df_train['Is Alone'].head()

0    False
1    False
2     True
3    False
4     True
Name: Is Alone, dtype: bool

### Create Feature Total Peers (SibSp + Parch)

In [90]:
df_train['Total Peers'] = df_train['SibSp'] + df_train['Parch']
df_train['Total Peers'].head()

0    1
1    1
2    0
3    1
4    0
Name: Total Peers, dtype: int64

### Drop Unused Feature

- PassangerId : Because this feature are unique for each observation
- Cabin : Because this feature have more than 70% missing value.
- Name
- Ticket

In [91]:
df_train = df_train.drop(columns=['PassengerId', 'Cabin', 'Name', 'Ticket', 'Age', 'Fare Class'])
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Age Class,Title,Is Alone,Total Peers
0,0,3,male,1,0,7.25,S,young adult,Mr,False,1
1,1,1,female,1,0,71.2833,C,old adult,Mrs,False,1
2,1,3,female,0,0,7.925,S,young adult,Miss,True,0
3,1,1,female,1,0,53.1,S,old adult,Mrs,False,1
4,0,3,male,0,0,8.05,S,old adult,Mr,True,0


### Encoding and Fill Missing Value Pipeline

In [92]:
def preprocessor_pipeline(num_cols, ohe_cols, ordinal_cols, ordinal_mappings):
    
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median"))
    ])

    ohe_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    ordinal_transformers = []
    for i, col in enumerate(ordinal_cols):
        mapping = ordinal_mappings[i]
        ordinal_pipeline = Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("encoder", OrdinalEncoder(categories=[mapping]))
                ])
        ordinal_transformers.append((col, ordinal_pipeline, [col]))

    transformer = [
        ("numeric_enc", numeric_pipeline, num_cols),
        ("ohe_enc", ohe_pipeline, ohe_cols)
    ] + ordinal_transformers

    preprocessor = ColumnTransformer(transformer)

    return preprocessor

In [93]:
num_cols = ['Fare', 'SibSp', 'Parch', 'Total Peers']
ohe_cols = ['Sex', 'Embarked', 'Title', 'Is Alone']
ordinal_cols = ['Pclass', 'Age Class']

ordinal_mappings = [np.sort(df_train['Pclass'].unique()).tolist(),
                    ['toddler', 'children', 'teenager', 'young adult', 'middle adult', 'old adult', 'retirement']]

In [94]:
preprocessor = preprocessor_pipeline(num_cols, ohe_cols, ordinal_cols, ordinal_mappings)

In [96]:
X = df_train.drop(columns=['Survived'])
y = df_train['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((712, 10), (179, 10), (712,), (179,))

In [97]:
after_transform = preprocessor.fit_transform(X_train, y_train)

new_cat_cols = preprocessor.named_transformers_['ohe_enc'].named_steps['encoder'].get_feature_names_out(ohe_cols)
all_cols = np.concatenate([num_cols,  new_cat_cols, ordinal_cols])

temp_data = pd.DataFrame(after_transform, columns=all_cols)

In [98]:
temp_data.head()

Unnamed: 0,Fare,SibSp,Parch,Total Peers,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Is Alone_False,Is Alone_True,Pclass,Age Class
0,7.75,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,6.0
1,31.275,4.0,2.0,6.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0
2,17.8,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,2.0
3,7.775,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,6.0
4,151.55,1.0,2.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0


### **Feature Selection**

In [99]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(temp_data, y_train)

feature_importance = model.feature_importances_

In [100]:
# Train Score
model.score(temp_data, y_train)

0.9578651685393258

In [102]:
after_transform = preprocessor.transform(X_val)

new_cat_cols = preprocessor.named_transformers_['ohe_enc'].named_steps['encoder'].get_feature_names_out(ohe_cols)
all_cols = np.concatenate([num_cols,  new_cat_cols, ordinal_cols])

temp_data_val = pd.DataFrame(after_transform, columns=all_cols)

# Test Score
model.score(temp_data_val, y_val)

0.8212290502793296

In [103]:
# Feature Importance
pd.DataFrame({"feature_name" : temp_data.columns, "feature_imp_score":feature_importance}).sort_values('feature_imp_score', ascending=False)

Unnamed: 0,feature_name,feature_imp_score
0,Fare,0.278612
17,Age Class,0.109268
4,Sex_female,0.099438
11,Title_Mr,0.097545
5,Sex_male,0.096489
16,Pclass,0.084493
3,Total Peers,0.051835
1,SibSp,0.03741
2,Parch,0.0267
10,Title_Miss,0.022717


If use *Age* feature it will get a huge gap of overfitting in training and testing data rather than without *Age*. This is because *Age* corralated with feature *Age Class*. However if only use *Age* feature it will also get a huge gap of overfitting. So based on experiment rather than use *Age*, we use *Age Class*.

Then, if we use *Fare Class* we also get a huge gap of overfitting rather than use *Fare*. So we prefer select *Fare*

### **Final Feature**

In [104]:
df_train.columns

Index(['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked',
       'Age Class', 'Title', 'Is Alone', 'Total Peers'],
      dtype='object')

In [105]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Age Class,Title,Is Alone,Total Peers
0,0,3,male,1,0,7.25,S,young adult,Mr,False,1
1,1,1,female,1,0,71.2833,C,old adult,Mrs,False,1
2,1,3,female,0,0,7.925,S,young adult,Miss,True,0
3,1,1,female,1,0,53.1,S,old adult,Mrs,False,1
4,0,3,male,0,0,8.05,S,old adult,Mr,True,0


In [106]:
# Save processed train data
df_train.to_csv('data/data_processed/train_processed.csv', index = False)

In [107]:
# Preprocess Test data

df_test = pd.read_csv("data/test.csv")

df_test['Age Class'] = df_test['Age'].apply(age_binning)
df_test['Fare Class'] = df_test['Fare'].apply(fare_binning)
df_test['Title'] = df_test['Name'].apply(lambda x : x.split(", ")[1].split(".")[0])
df_test['Title'] = df_test['Title'].apply(create_title)
df_test['Is Alone'] = (df_test['SibSp'] == 0) & (df_test['Parch'] == 0)
df_test['Total Peers'] = df_test['SibSp'] + df_test['Parch']

df_test = df_test.drop(columns=['PassengerId', 'Cabin', 'Name', 'Ticket', 'Age', 'Fare Class'])
df_test.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Age Class,Title,Is Alone,Total Peers
0,3,male,0,0,7.8292,Q,old adult,Mr,True,0
1,3,female,1,0,7.0,S,old adult,Mrs,False,1
2,2,male,0,0,9.6875,Q,retirement,Mr,True,0
3,3,male,0,0,8.6625,S,middle adult,Mr,True,0
4,3,female,1,1,12.2875,S,young adult,Mrs,False,2


In [108]:
# Save processed test data
df_test.to_csv('data/data_processed/test_processed.csv', index = False)