# Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2

import warnings
# Suppress every Python warning for the rest of the session; this also hides
# warnings raised by pandas and scikit-learn in later cells.
warnings.filterwarnings("ignore")

# Apply seaborn's default plot styling.
sns.set()

# Importing Data

In [2]:
# Load the preprocessed training set; train_backup is kept as the untouched original.
train_backup = pd.read_csv("../Data/preprocessed_train_df.csv")
# Work on a copy so that column edits made to train_df later do not alter train_backup.
train_df = train_backup.copy()

In [3]:
# Preview the first five rows of the loaded training data.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family,Cabin_Names
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,1,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,1,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,0,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,1,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,0,


# Feature Engineering

**Features DataFrame**

In [4]:
# Start an empty feature frame that shares train_df's row index, so every
# column added below aligns row-for-row with the source data.
train_features_df = pd.DataFrame(index = train_df.index)

**Targets DataFrame**

In [5]:
# Target vector: the Survived column (a Series, despite the _df suffix).
train_targets_df = train_df["Survived"]

## Categorical Feature Encoding

### Sex

**One Hot Encoding**

In [6]:
# One-hot encode Sex once, then copy both indicator columns into the feature frame.
sex_dummies = pd.get_dummies(train_df["Sex"], prefix="Sex", prefix_sep=":")
for dummy_col in ("Sex:female", "Sex:male"):
    train_features_df[dummy_col] = sex_dummies[dummy_col]

**Mean Encoding**

In [7]:
# Mean of Survived per Sex category (the values used for the mean encoding in the next cell).
train_df.groupby(by = "Sex")["Survived"].mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [8]:
# Mean encoding: replace each Sex category with the mean of Survived over its rows.
# NOTE(review): the means come from the same rows that are being encoded.
sex_survival_rate = train_df.groupby(by="Sex")["Survived"].mean()
train_features_df["Sex_mean_enc"] = train_df["Sex"].map(sex_survival_rate)

**Feature Selection**

In [9]:
# List the candidate Sex features before scoring them.
train_features_df.columns

Index(['Sex:female', 'Sex:male', 'Sex_mean_enc'], dtype='object')

In [10]:
# Score every current feature against the target with the chi-squared statistic.
# k="all" keeps all features, so the selector is used only to obtain the scores.
fs = SelectKBest(score_func=chi2, k="all").fit(train_features_df, train_targets_df)
fs.scores_

array([170.34812709,  92.70244698,  47.85148767])

In [11]:
# Keep only Sex:female; discard the other two Sex encodings.
train_features_df = train_features_df.drop(columns=["Sex:male", "Sex_mean_enc"])

### Pclass

**One Hot Encoding**

In [12]:
# One-hot encode Pclass once, then add one indicator column per class.
pclass_dummies = pd.get_dummies(train_df["Pclass"], prefix="Pclass", prefix_sep=":")
for dummy_col in ("Pclass:1", "Pclass:2", "Pclass:3"):
    train_features_df[dummy_col] = pclass_dummies[dummy_col]

**Mean Encoding**

In [13]:
# Mean of Survived per Pclass value (the values used for the mean encoding in the next cell).
train_df.groupby(by = "Pclass")["Survived"].mean()

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

In [14]:
# Mean encoding: replace each Pclass value with the mean of Survived over its rows.
pclass_survival_rate = train_df.groupby(by="Pclass")["Survived"].mean()
train_features_df["Pclass_mean_enc"] = train_df["Pclass"].map(pclass_survival_rate)

**Feature Selection**

In [15]:
# List the retained Sex feature plus the candidate Pclass features before scoring.
train_features_df.columns

Index(['Sex:female', 'Pclass:1', 'Pclass:2', 'Pclass:3', 'Pclass_mean_enc'], dtype='object')

In [16]:
# Chi-squared score of each current feature against the target; k="all" keeps
# every feature, so only the scores are of interest here.
fs = SelectKBest(score_func=chi2, k="all").fit(train_features_df, train_targets_df)
fs.scores_

array([170.34812709,  55.17515099,   6.16076687,  41.55307089,
         7.3207358 ])

In [17]:
# Keep Pclass:1 and Pclass:3; discard Pclass:2 and the mean encoding.
train_features_df = train_features_df.drop(columns=["Pclass:2", "Pclass_mean_enc"])

### Embarked

**One Hot Encoding**

In [18]:
# One-hot encode Embarked once, then add the indicators in the order Q, S, C.
embarked_dummies = pd.get_dummies(train_df["Embarked"], prefix="Embarked", prefix_sep=":")
for dummy_col in ("Embarked:Q", "Embarked:S", "Embarked:C"):
    train_features_df[dummy_col] = embarked_dummies[dummy_col]

**Mean Encoding**

In [19]:
# Mean of Survived per Embarked value (the values used for the mean encoding in the next cell).
train_df.groupby(by = "Embarked")["Survived"].mean()

Embarked
C    0.553571
Q    0.389610
S    0.339009
Name: Survived, dtype: float64

In [20]:
# Mean encoding: replace each Embarked value with the mean of Survived over its rows.
embarked_survival_rate = train_df.groupby(by="Embarked")["Survived"].mean()
train_features_df["Embarked_mean_enc"] = train_df["Embarked"].map(embarked_survival_rate)

**Feature Selection**

In [21]:
# List the retained features plus the candidate Embarked features before scoring.
train_features_df.columns

Index(['Sex:female', 'Pclass:1', 'Pclass:3', 'Embarked:Q', 'Embarked:S',
       'Embarked:C', 'Embarked_mean_enc'],
      dtype='object')

In [22]:
# Chi-squared score of each current feature against the target; k="all" keeps
# every feature, so only the scores are of interest here.
fs = SelectKBest(score_func=chi2, k="all").fit(train_features_df, train_targets_df)
fs.scores_

array([1.70348127e+02, 5.51751510e+01, 4.15530709e+01, 1.08467891e-02,
       5.48920482e+00, 2.04644013e+01, 4.66203240e-01])

In [23]:
# Keep Embarked:S and Embarked:C; discard Embarked:Q and the mean encoding.
train_features_df = train_features_df.drop(columns=["Embarked:Q", "Embarked_mean_enc"])

### Cabin_Names

**Filling Null Values**

In [24]:
# Give rows without a cabin letter their own explicit "Missing" category.
# This overwrites the column in train_df in place; re-running the cell is harmless.
train_df["Cabin_Names"] = train_df["Cabin_Names"].fillna("Missing")

**One Hot Encoding**

In [25]:
# One-hot encode Cabin_Names and append every indicator column to the features.
cabin_dummies = pd.get_dummies(train_df["Cabin_Names"], prefix="Cabin_Names", prefix_sep=":")
train_features_df = pd.concat([train_features_df, cabin_dummies], axis=1)

**Mean Encoding**

In [26]:
# Mean of Survived per Cabin_Names value (the values used for the mean encoding in the next cell).
train_df.groupby(by = "Cabin_Names")["Survived"].mean()

Cabin_Names
A          0.466667
B          0.744681
C          0.593220
D          0.757576
E          0.750000
F          0.615385
G          0.500000
Missing    0.299854
T          0.000000
Name: Survived, dtype: float64

In [27]:
# Mean encoding: replace each Cabin_Names value with the mean of Survived over its rows.
cabin_survival_rate = train_df.groupby(by="Cabin_Names")["Survived"].mean()
train_features_df["Cabin_Names_mean_enc"] = train_df["Cabin_Names"].map(cabin_survival_rate)

**Label Encoding**

In [28]:
# Inspect the Cabin_Names column after nulls were replaced with "Missing".
train_df["Cabin_Names"]

0      Missing
1            C
2      Missing
3            C
4      Missing
        ...   
886    Missing
887          B
888    Missing
889          C
890    Missing
Name: Cabin_Names, Length: 891, dtype: object

In [29]:
# Label encoding: "Missing" maps to 0 and the cabin letters to 1..8 in the listed order.
cabin_codes = {
    cabin_letter: code
    for code, cabin_letter in enumerate(["Missing", "A", "B", "C", "D", "E", "F", "G", "T"])
}
train_features_df["Cabin_Names_label_enc"] = train_df["Cabin_Names"].map(cabin_codes)

**Feature Selection**

In [30]:
# Chi-squared score of each current feature against the target, printed by column name.
# NOTE(review): the label-encoded column takes values 0..8 while the indicators are 0/1;
# confirm that chi2 scores are comparable across features on different scales.
fs = SelectKBest(score_func=chi2, k="all").fit(train_features_df, train_targets_df)
scores = fs.scores_

for column, score in zip(train_features_df.columns, scores):
    print(f"{column}: {score}")

Sex:female: 170.34812709177143
Pclass:1: 55.17515099223469
Pclass:3: 41.553070892984856
Embarked:S: 5.489204823203513
Embarked:C: 20.46440126956736
Cabin_Names:A: 0.43511647972390016
Cabin_Names:B: 25.875580562847652
Cabin_Names:C: 10.93672950088475
Cabin_Names:D: 19.48964624676445
Cabin_Names:E: 18.14063848144953
Cabin_Names:F: 2.9469701997743414
Cabin_Names:G: 0.22821397756686804
Cabin_Names:Missing: 20.4883143501965
Cabin_Names:T: 0.6229508196721312
Cabin_Names_mean_enc: 6.800274074369643
Cabin_Names_label_enc: 240.47140498835452


In [31]:
# Keep the previously retained features plus the Cabin_Names label encoding;
# all Cabin_Names indicators and the mean encoding are left out.
selected_columns = [
    "Sex:female",
    "Pclass:1",
    "Pclass:3",
    "Embarked:S",
    "Embarked:C",
    "Cabin_Names_label_enc",
]
train_features_df = train_features_df[selected_columns]

In [32]:
# Preview the feature frame after the Cabin_Names selection.
train_features_df.head()

Unnamed: 0,Sex:female,Pclass:1,Pclass:3,Embarked:S,Embarked:C,Cabin_Names_label_enc
0,0,0,1,1,0,0
1,1,1,0,0,1,3
2,1,0,1,1,0,0
3,1,1,0,1,0,3
4,0,0,1,1,0,0


### Title

**One Hot Encoding**

In [33]:
# One-hot encode Title and append every indicator column to the features.
title_dummies = pd.get_dummies(train_df["Title"], prefix="Title", prefix_sep=":")
train_features_df = pd.concat([train_features_df, title_dummies], axis=1)

**Mean Encoding**

In [34]:
# Mean of Survived per Title (the values used for the mean encoding in the next cell).
train_df.groupby(by = "Title")["Survived"].mean()

Title
Master.    0.575000
Miss.      0.699454
Mr.        0.156673
Mrs.       0.792000
Rare.      0.423077
Name: Survived, dtype: float64

In [35]:
# Mean encoding: replace each Title with the mean of Survived over its rows.
title_survival_rate = train_df.groupby(by="Title")["Survived"].mean()
train_features_df["Title_mean_enc"] = train_df["Title"].map(title_survival_rate)

**Label Encoding**

In [36]:
# Label encoding for Title. The codes follow the ascending order of the per-title
# survival rates displayed in the mean-encoding output (Mr. lowest, Mrs. highest).
title_codes = {"Mr.": 0, "Rare.": 1, "Master.": 2, "Miss.": 3, "Mrs.": 4}
train_features_df["Title_label_enc"] = train_df["Title"].map(title_codes)

**Feature Selection**

In [37]:
# Chi-squared score of each current feature against the target, printed by column name.
fs = SelectKBest(score_func=chi2, k="all").fit(train_features_df, train_targets_df)
scores = fs.scores_

for column, score in zip(train_features_df.columns, scores):
    print(f"{column}: {score}")

Sex:female: 170.34812709177143
Pclass:1: 55.17515099223469
Pclass:3: 41.553070892984856
Embarked:S: 5.489204823203513
Embarked:C: 20.46440126956736
Cabin_Names_label_enc: 240.47140498835452
Title:Master.: 6.180424935289041
Title:Miss.: 77.07681862544027
Title:Mr.: 112.80578451710022
Title:Mrs.: 88.05039171699741
Title:Rare.: 0.1692606358266412
Title_mean_enc: 55.887893366471445
Title_label_enc: 567.3254406508074


In [38]:
# Keep only Title_label_enc; discard the Title indicators and the mean encoding.
title_columns_to_drop = [
    "Title:Master.",
    "Title:Miss.",
    "Title:Mr.",
    "Title:Mrs.",
    "Title:Rare.",
    "Title_mean_enc",
]
train_features_df = train_features_df.drop(columns=title_columns_to_drop)

In [39]:
# Preview the feature frame after the Title selection.
train_features_df.head()

Unnamed: 0,Sex:female,Pclass:1,Pclass:3,Embarked:S,Embarked:C,Cabin_Names_label_enc,Title_label_enc
0,0,0,1,1,0,0,0
1,1,1,0,0,1,3,4
2,1,0,1,1,0,0,3
3,1,1,0,1,0,3,4
4,0,0,1,1,0,0,0


# Continuous Feature Encoding

## Family

**Building Labels for Family sizes**

In [40]:
# Distribution of the Family values, used to choose the size buckets in the next cell.
train_df["Family"].value_counts()

0     537
1     161
2     102
3      29
5      22
4      15
6      12
10      7
7       6
Name: Family, dtype: int64

In [41]:
# Bucket the Family value into three size labels, stored next to the target.
# .copy() gives family_df its own data, so adding a column below does not
# raise pandas' SettingWithCopyWarning.
family_size = train_df["Family"]
family_df = train_df[["Survived"]].copy()

# np.select assigns all labels in one pass; the conditions are mutually exclusive.
# A row matching no condition keeps its raw value as a string.
family_df["Family_labels"] = np.select(
    [family_size == 0, family_size.isin([1, 2, 3]), family_size >= 4],
    ["No", "Small", "Large"],
    default=family_size.astype(str),
)

In [42]:
# Number of rows in each family-size bucket.
family_df["Family_labels"].value_counts()

No       537
Small    292
Large     62
Name: Family_labels, dtype: int64

In [43]:
# Mean of Survived per family-size bucket.
family_df.groupby(by = "Family_labels")["Survived"].mean()

Family_labels
Large    0.161290
No       0.303538
Small    0.578767
Name: Survived, dtype: float64

**One Hot Encoding**

In [44]:
# One-hot encode the family-size bucket and append the indicators to the features.
family_dummies = pd.get_dummies(family_df["Family_labels"], prefix="Family", prefix_sep=":")
train_features_df = pd.concat([train_features_df, family_dummies], axis=1)

**Mean Encoding**

In [45]:
# Mean encoding: replace each family-size bucket with the mean of Survived over its rows.
family_survival_rate = family_df.groupby(by="Family_labels")["Survived"].mean()
train_features_df["Family_mean_enc"] = family_df["Family_labels"].map(family_survival_rate)

**Label Encoding**

In [46]:
# Label encoding for the family-size bucket. The codes follow the ascending order
# of the per-bucket survival rates displayed above (Large lowest, Small highest).
family_codes = {"Large": 0, "No": 1, "Small": 2}
train_features_df["Family_label_enc"] = family_df["Family_labels"].map(family_codes)

**Feature Selection**

In [47]:
# Chi-squared score of each current feature against the target, printed by column name.
fs = SelectKBest(score_func=chi2, k="all").fit(train_features_df, train_targets_df)
scores = fs.scores_

for column, score in zip(train_features_df.columns, scores):
    print(f"{column}: {score}")

Sex:female: 170.34812709177143
Pclass:1: 55.17515099223469
Pclass:3: 41.553070892984856
Embarked:S: 5.489204823203513
Embarked:C: 20.46440126956736
Cabin_Names_label_enc: 240.47140498835452
Title_label_enc: 567.3254406508074
Family:Large: 12.983634390047037
Family:No: 14.640792727307776
Family:Small: 46.91282192962758
Family_mean_enc: 3.8420556998152637
Family_label_enc: 18.86258071070836


In [48]:
# Keep Family:No and Family:Small; discard Family:Large and the other two encodings.
family_columns_to_drop = ["Family_mean_enc", "Family_label_enc", "Family:Large"]
train_features_df = train_features_df.drop(columns=family_columns_to_drop)

In [49]:
# Preview the feature frame after the Family selection.
train_features_df.head()

Unnamed: 0,Sex:female,Pclass:1,Pclass:3,Embarked:S,Embarked:C,Cabin_Names_label_enc,Title_label_enc,Family:No,Family:Small
0,0,0,1,1,0,0,0,0,1
1,1,1,0,0,1,3,4,0,1
2,1,0,1,1,0,0,3,1,0
3,1,1,0,1,0,3,4,0,1
4,0,0,1,1,0,0,0,1,0


## Age

**Building Age Labels**

In [50]:
# Bucket Age into six labelled ranges, stored next to the target.
# .copy() gives age_df its own data, so adding a column below does not
# raise pandas' SettingWithCopyWarning.
age = train_df["Age"]
age_df = train_df[["Survived"]].copy()

# Each range is open on the left and closed on the right: (10, 18], (18, 25], ...
# The conditions are mutually exclusive, so np.select can label everything in one pass.
# A row matching no condition (e.g. a missing Age) keeps its raw value as a string.
age_conditions = [
    age <= 10,
    (age > 10) & (age <= 18),
    (age > 18) & (age <= 25),
    (age > 25) & (age <= 40),
    (age > 40) & (age <= 60),
    age > 60,
]
age_names = ["Infant", "Kid", "Young_Adult", "Adult", "Old_Adult", "Old"]
age_df["Age_labels"] = np.select(age_conditions, age_names, default=age.astype(str))

In [51]:
# Number of rows in each age bucket.
age_df["Age_labels"].value_counts()

Adult          399
Young_Adult    198
Old_Adult      129
Kid             75
Infant          68
Old             22
Name: Age_labels, dtype: int64

In [52]:
# Mean of Survived per age bucket.
age_df.groupby(by = "Age_labels")["Survived"].mean()

Age_labels
Adult          0.348371
Infant         0.588235
Kid            0.426667
Old            0.227273
Old_Adult      0.387597
Young_Adult    0.383838
Name: Survived, dtype: float64

**One Hot Encoding**

In [53]:
# One-hot encode the age bucket and append the indicators to the features.
age_dummies = pd.get_dummies(age_df["Age_labels"], prefix="Age", prefix_sep=":")
train_features_df = pd.concat([train_features_df, age_dummies], axis=1)

**Mean Encoding**

In [54]:
# Mean encoding: replace each age bucket with the mean of Survived over its rows.
age_survival_rate = age_df.groupby(by="Age_labels")["Survived"].mean()
train_features_df["Age_mean_enc"] = age_df["Age_labels"].map(age_survival_rate)

**Label Encoding**

In [55]:
# Label encoding for the age bucket. The codes follow the ascending order of the
# per-bucket survival rates displayed above (Old lowest, Infant highest).
age_codes = {
    age_label: code
    for code, age_label in enumerate(
        ["Old", "Adult", "Young_Adult", "Old_Adult", "Kid", "Infant"]
    )
}
train_features_df["Age_label_enc"] = age_df["Age_labels"].map(age_codes)

**Adding Raw Age Feature**

In [56]:
# Also carry the unbinned Age values so they are scored alongside the encodings.
train_features_df["Age"] = train_df["Age"]

**Feature Selection**

In [57]:
# Chi-squared score of each current feature against the target, printed by column name.
# NOTE(review): raw Age is on a much larger scale than the 0/1 indicators;
# confirm that its chi2 score is comparable with theirs.
fs = SelectKBest(score_func=chi2, k="all").fit(train_features_df, train_targets_df)
scores = fs.scores_

for column, score in zip(train_features_df.columns, scores):
    print(f"{column}: {score}")

Sex:female: 170.34812709177143
Pclass:1: 55.17515099223469
Pclass:3: 41.553070892984856
Embarked:S: 5.489204823203513
Embarked:C: 20.46440126956736
Cabin_Names_label_enc: 240.47140498835452
Title_label_enc: 567.3254406508074
Family:No: 14.640792727307776
Family:Small: 46.91282192962758
Age:Adult: 2.1222177531836457
Age:Infant: 12.011977871390144
Age:Kid: 0.5816738567730807
Age:Old: 2.280198446937015
Age:Old_Adult: 0.0077051186869194835
Age:Young_Adult: 0.0
Age_mean_enc: 0.19994361309850825
Age_label_enc: 11.256587116150138
Age: 33.01939248021836


In [58]:
# Keep Age_label_enc and raw Age; discard the Age indicators and the mean encoding.
age_columns_to_drop = [
    "Age:Adult",
    "Age:Infant",
    "Age:Kid",
    "Age:Old",
    "Age:Old_Adult",
    "Age:Young_Adult",
    "Age_mean_enc",
]
train_features_df = train_features_df.drop(columns=age_columns_to_drop)

In [59]:
# Preview the feature frame after the Age selection.
train_features_df.head()

Unnamed: 0,Sex:female,Pclass:1,Pclass:3,Embarked:S,Embarked:C,Cabin_Names_label_enc,Title_label_enc,Family:No,Family:Small,Age_label_enc,Age
0,0,0,1,1,0,0,0,0,1,2,22.0
1,1,1,0,0,1,3,4,0,1,1,38.0
2,1,0,1,1,0,0,3,1,0,1,26.0
3,1,1,0,1,0,3,4,0,1,1,35.0
4,0,0,1,1,0,0,0,1,0,1,35.0


## Fare

**Building Fare Labels**

In [60]:
# Bucket Fare into three price labels, stored next to the target.
# .copy() gives fare_df its own data, so adding a column below does not
# raise pandas' SettingWithCopyWarning.
fare = train_df["Fare"]
fare_df = train_df[["Survived"]].copy()

# Ranges: below 50 is "Cheap", 50 up to (not including) 100 is "Mid", 100 and above is "Costly".
# The conditions are mutually exclusive, so np.select can label everything in one pass.
# A row matching no condition (e.g. a missing Fare) keeps its raw value as a string.
fare_df["Fare_labels"] = np.select(
    [fare < 50, (fare >= 50) & (fare < 100), fare >= 100],
    ["Cheap", "Mid", "Costly"],
    default=fare.astype(str),
)

In [61]:
# Number of rows in each fare bucket.
fare_df["Fare_labels"].value_counts()

Cheap     730
Mid       108
Costly     53
Name: Fare_labels, dtype: int64

In [62]:
# Mean of Survived per fare bucket.
fare_df.groupby(by = "Fare_labels")["Survived"].mean()

Fare_labels
Cheap     0.319178
Costly    0.735849
Mid       0.648148
Name: Survived, dtype: float64

**One Hot Encoding**

In [63]:
# One-hot encode the fare bucket and append the indicators to the features.
fare_dummies = pd.get_dummies(fare_df["Fare_labels"], prefix="Fare", prefix_sep=":")
train_features_df = pd.concat([train_features_df, fare_dummies], axis=1)

**Mean Encoding**

In [64]:
# Mean encoding: replace each fare bucket with the mean of Survived over its rows.
fare_survival_rate = fare_df.groupby(by="Fare_labels")["Survived"].mean()
train_features_df["Fare_mean_enc"] = fare_df["Fare_labels"].map(fare_survival_rate)

**Label Encoding**

In [65]:
# Label encoding for the fare bucket, ordered by price range (Cheap, Mid, Costly).
fare_codes = {"Cheap": 0, "Mid": 1, "Costly": 2}
train_features_df["Fare_label_enc"] = fare_df["Fare_labels"].map(fare_codes)

**Adding Raw Fare Feature**

In [66]:
# Also carry the unbinned Fare values so they are scored alongside the encodings.
train_features_df["Fare"] = train_df["Fare"]

**Feature Selection**

In [67]:
# Chi-squared score of each current feature against the target, printed by column name.
# NOTE(review): raw Fare is on a much larger scale than the 0/1 indicators;
# confirm that its chi2 score is comparable with theirs.
fs = SelectKBest(score_func=chi2, k="all").fit(train_features_df, train_targets_df)
scores = fs.scores_

for column, score in zip(train_features_df.columns, scores):
    print(f"{column}: {score}")

Sex:female: 170.34812709177143
Pclass:1: 55.17515099223469
Pclass:3: 41.553070892984856
Embarked:S: 5.489204823203513
Embarked:C: 20.46440126956736
Cabin_Names_label_enc: 240.47140498835452
Title_label_enc: 567.3254406508074
Family:No: 14.640792727307776
Family:Small: 46.91282192962758
Age_label_enc: 11.256587116150138
Age: 33.01939248021836
Fare:Cheap: 12.904918623754538
Fare:Costly: 27.76799290214401
Fare:Mid: 31.90120793787748
Fare_mean_enc: 3.6423400307507308
Fare_label_enc: 85.69746720101926
Fare: 4518.3190906386335


In [68]:
# Keep Fare_label_enc and raw Fare; discard the Fare indicators and the mean encoding.
fare_columns_to_drop = ["Fare:Cheap", "Fare:Costly", "Fare:Mid", "Fare_mean_enc"]
train_features_df = train_features_df.drop(columns=fare_columns_to_drop)

In [69]:
# Preview the feature frame after the Fare selection.
train_features_df.head()

Unnamed: 0,Sex:female,Pclass:1,Pclass:3,Embarked:S,Embarked:C,Cabin_Names_label_enc,Title_label_enc,Family:No,Family:Small,Age_label_enc,Age,Fare_label_enc,Fare
0,0,0,1,1,0,0,0,0,1,2,22.0,0,7.25
1,1,1,0,0,1,3,4,0,1,1,38.0,1,71.2833
2,1,0,1,1,0,0,3,1,0,1,26.0,0,7.925
3,1,1,0,1,0,3,4,0,1,1,35.0,1,53.1
4,0,0,1,1,0,0,0,1,0,1,35.0,0,8.05


In [70]:
# Final list of engineered feature columns that will be exported.
train_features_df.columns

Index(['Sex:female', 'Pclass:1', 'Pclass:3', 'Embarked:S', 'Embarked:C',
       'Cabin_Names_label_enc', 'Title_label_enc', 'Family:No', 'Family:Small',
       'Age_label_enc', 'Age', 'Fare_label_enc', 'Fare'],
      dtype='object')

# Exporting Train Set after Feature Engineering

In [71]:
# Write the selected features to CSV without the row index.
# Note: this cell does not write train_targets_df (the Survived column).
train_features_df.to_csv("../Data/train_features_df.csv", index = False)