# Feature Engineering 

In [2]:
import pandas as pd
df = pd.read_csv('titanicDataset.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
df = df.drop(['Age', 'Cabin', 'Name', 'PassengerId', 'Ticket'], axis = 1)
df.reset_index()
df.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,0,3,male,0,0,7.8292,Q
1,1,3,female,1,0,7.0,S
2,0,2,male,0,0,9.6875,Q
3,0,3,male,0,0,8.6625,S
4,1,3,female,1,1,12.2875,S


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    object 
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 23.0+ KB


In [6]:
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        1
Embarked    0
dtype: int64

In [7]:
df['Embarked'].value_counts()

Embarked
S    270
C    102
Q     46
Name: count, dtype: int64

In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [9]:
transformer = ColumnTransformer(transformers = [('OHE1', OneHotEncoder(sparse_output = False, drop = 'first'), ['Sex']),('impute', SimpleImputer(), ['Fare']), ('OHE2', OneHotEncoder(sparse_output=False, drop='first'), ['Embarked']),], remainder='passthrough')

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:],df.iloc[:,0], test_size  = 0.2, random_state = 11)

In [11]:
X_train.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked
214,3,female,4,2,7.775,S
186,2,female,0,1,21.0,S
315,3,female,0,0,7.7333,Q
356,1,female,2,0,51.4792,S
88,3,female,0,0,7.75,Q


In [12]:
y_train.sample(10)

208    1
176    1
33     1
184    1
115    0
324    1
37     1
416    0
34     0
217    0
Name: Survived, dtype: int64

In [13]:
X_train[X_train['Fare'].isnull()].index

Index([152], dtype='int64')

In [14]:
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [15]:
X_train #female - 0 , male - 1

array([[ 0.    ,  7.775 ,  0.    , ...,  3.    ,  4.    ,  2.    ],
       [ 0.    , 21.    ,  0.    , ...,  2.    ,  0.    ,  1.    ],
       [ 0.    ,  7.7333,  1.    , ...,  3.    ,  0.    ,  0.    ],
       ...,
       [ 1.    , 15.2458,  0.    , ...,  3.    ,  1.    ,  1.    ],
       [ 1.    , 26.    ,  0.    , ...,  1.    ,  0.    ,  0.    ],
       [ 0.    , 13.775 ,  0.    , ...,  3.    ,  1.    ,  1.    ]])

In [16]:
X_train = pd.DataFrame(X_train, columns = transformer.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = transformer.get_feature_names_out())

In [17]:
X_train

Unnamed: 0,OHE1__Sex_male,impute__Fare,OHE2__Embarked_Q,OHE2__Embarked_S,remainder__Pclass,remainder__SibSp,remainder__Parch
0,0.0,7.7750,0.0,1.0,3.0,4.0,2.0
1,0.0,21.0000,0.0,1.0,2.0,0.0,1.0
2,0.0,7.7333,1.0,0.0,3.0,0.0,0.0
3,0.0,51.4792,0.0,1.0,1.0,2.0,0.0
4,0.0,7.7500,1.0,0.0,3.0,0.0,0.0
...,...,...,...,...,...,...,...
329,1.0,7.2292,0.0,0.0,3.0,0.0,2.0
330,1.0,7.7750,0.0,1.0,3.0,0.0,0.0
331,1.0,15.2458,0.0,0.0,3.0,1.0,1.0
332,1.0,26.0000,0.0,1.0,1.0,0.0,0.0


In [18]:
print("The Fare value at 152 index : ",X_train.loc[152, 'impute__Fare'])

The Fare value at 152 index :  7.7958


In [19]:
scaling = StandardScaler()
X_train['impute__Fare'] = scaling.fit_transform(X_train[['impute__Fare']])
X_test['impute__Fare'] = scaling.transform(X_test[['impute__Fare']])

In [20]:
X_train

Unnamed: 0,OHE1__Sex_male,impute__Fare,OHE2__Embarked_Q,OHE2__Embarked_S,remainder__Pclass,remainder__SibSp,remainder__Parch
0,0.0,-0.490684,0.0,1.0,3.0,4.0,2.0
1,0.0,-0.268384,0.0,1.0,2.0,0.0,1.0
2,0.0,-0.491384,1.0,0.0,3.0,0.0,0.0
3,0.0,0.243941,0.0,1.0,1.0,2.0,0.0
4,0.0,-0.491104,1.0,0.0,3.0,0.0,0.0
...,...,...,...,...,...,...,...
329,1.0,-0.499858,0.0,0.0,3.0,0.0,2.0
330,1.0,-0.490684,0.0,1.0,3.0,0.0,0.0
331,1.0,-0.365107,0.0,0.0,3.0,1.0,1.0
332,1.0,-0.184339,0.0,1.0,1.0,0.0,0.0


In [21]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334 entries, 0 to 333
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OHE1__Sex_male     334 non-null    float64
 1   impute__Fare       334 non-null    float64
 2   OHE2__Embarked_Q   334 non-null    float64
 3   OHE2__Embarked_S   334 non-null    float64
 4   remainder__Pclass  334 non-null    float64
 5   remainder__SibSp   334 non-null    float64
 6   remainder__Parch   334 non-null    float64
dtypes: float64(7)
memory usage: 18.4 KB


In [22]:
X_train = X_train.astype({
  'OHE1__Sex_male' : 'int',
  'OHE2__Embarked_Q' : 'int',
  'OHE2__Embarked_S' : 'int',
  'remainder__Pclass' : 'int',
  'remainder__SibSp' : 'int',
  'remainder__Parch' : 'int'
})

X_test = X_test.astype({
  'OHE1__Sex_male' : 'int',
  'OHE2__Embarked_Q' : 'int',
  'OHE2__Embarked_S' : 'int',
  'remainder__Pclass' : 'int',
  'remainder__SibSp' : 'int',
  'remainder__Parch' : 'int'
})

In [23]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334 entries, 0 to 333
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OHE1__Sex_male     334 non-null    int64  
 1   impute__Fare       334 non-null    float64
 2   OHE2__Embarked_Q   334 non-null    int64  
 3   OHE2__Embarked_S   334 non-null    int64  
 4   remainder__Pclass  334 non-null    int64  
 5   remainder__SibSp   334 non-null    int64  
 6   remainder__Parch   334 non-null    int64  
dtypes: float64(1), int64(6)
memory usage: 18.4 KB


In [24]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OHE1__Sex_male     84 non-null     int64  
 1   impute__Fare       84 non-null     float64
 2   OHE2__Embarked_Q   84 non-null     int64  
 3   OHE2__Embarked_S   84 non-null     int64  
 4   remainder__Pclass  84 non-null     int64  
 5   remainder__SibSp   84 non-null     int64  
 6   remainder__Parch   84 non-null     int64  
dtypes: float64(1), int64(6)
memory usage: 4.7 KB


In [25]:
X_train

Unnamed: 0,OHE1__Sex_male,impute__Fare,OHE2__Embarked_Q,OHE2__Embarked_S,remainder__Pclass,remainder__SibSp,remainder__Parch
0,0,-0.490684,0,1,3,4,2
1,0,-0.268384,0,1,2,0,1
2,0,-0.491384,1,0,3,0,0
3,0,0.243941,0,1,1,2,0
4,0,-0.491104,1,0,3,0,0
...,...,...,...,...,...,...,...
329,1,-0.499858,0,0,3,0,2
330,1,-0.490684,0,1,3,0,0
331,1,-0.365107,0,0,3,1,1
332,1,-0.184339,0,1,1,0,0


In [26]:
X_test

Unnamed: 0,OHE1__Sex_male,impute__Fare,OHE2__Embarked_Q,OHE2__Embarked_S,remainder__Pclass,remainder__SibSp,remainder__Parch
0,0,-0.234766,0,1,2,0,2
1,0,-0.499858,0,0,3,0,0
2,1,-0.280991,0,1,3,0,2
3,0,2.933733,0,0,1,0,0
4,1,-0.499929,0,0,3,0,0
...,...,...,...,...,...,...,...
79,0,-0.251155,0,1,3,1,1
80,1,-0.175094,0,1,1,0,0
81,1,-0.413782,1,0,2,0,0
82,0,0.547693,0,1,3,8,2
