In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [2]:
# Load the toy Titanic dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('titanic_toy.csv')

In [3]:
# Spot-check one randomly drawn row. sample() is unseeded here, so the row
# shown changes from run to run.
df.sample()

Unnamed: 0,Age,Fare,Family,Survived
616,34.0,14.4,2,0


In [5]:
# Fraction of missing values per column (mean of the boolean null mask).
df.isnull().mean()

Age         0.198653
Fare        0.050505
Family      0.000000
Survived    0.000000
dtype: float64

In [6]:
# Separate the target column 'Survived' from the predictor columns.
y = df['Survived']
X = df.drop(columns=['Survived'])


In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Inspect the training features before imputation; Age and Fare still
# contain NaN at this point.
X_train

Unnamed: 0,Age,Fare,Family
331,45.5,28.5000,0
733,23.0,13.0000,0
382,32.0,7.9250,0
704,26.0,7.8542,1
813,6.0,31.2750,6
...,...,...,...
106,21.0,7.6500,0
270,,31.0000,0
860,41.0,,2
435,14.0,120.0000,3


In [13]:
# Arbitrary-value imputation done by hand: for each feature with missing
# values, add two filled copies that use different constant placeholders
# (99 / -1 for Age, 999 / -1 for Fare) so their effect can be compared.
#
# The columns are built with .assign() and the result is rebound to X_train
# rather than set in place: X_train comes out of train_test_split's row
# selection, and in-place column assignment on such a frame can raise
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
# The resulting columns and their order are the same as before.
X_train = X_train.assign(
    Age_99=X_train['Age'].fillna(99),
    Age_minus1=X_train['Age'].fillna(-1),
    Fare_999=X_train['Fare'].fillna(999),
    Fare_minus1=X_train['Fare'].fillna(-1),
)


In [14]:
# Compare each feature's variance before and after constant-value filling.
# Each entry pairs the printed label with the X_train column it describes.
variance_report = [
    ("Original Age variable variance: ", 'Age'),
    ("Age with 99 variance: ", 'Age_99'),
    ("Age with -1 variance: ", 'Age_minus1'),
    ("Original Fare variable variance: ", 'Fare'),
    ("Fare with 999 variance: ", 'Fare_999'),
    ("Fare with -1 variance: ", 'Fare_minus1'),
]

for label, column in variance_report:
    print(label, X_train[column].var())

Original Age variable variance:  210.2517072477435
Age with 99 variance:  932.9665366701432
Age with -1 variance:  315.9955036260055
Original Fare variable variance:  2761.0314349486343
Fare with 999 variance:  47525.470595360035
Fare with -1 variance:  2675.2394049177024


In [15]:
# Covariance between every pair of columns, original and filled side by side.
# NaNs are dropped pair by pair, so an entry such as cov(Age, Age_99) only
# uses rows where Age is present and therefore equals Age's own variance.
X_train.cov()

Unnamed: 0,Age,Fare,Family,Age_99,Age_minus1,Fare_999,Fare_minus1
Age,210.251707,75.481375,-6.993325,210.251707,210.251707,130.495777,69.137162
Fare,75.481375,2761.031435,18.599163,-111.965046,135.785804,2761.031435,2761.031435
Family,-6.993325,18.599163,2.830892,-6.941982,-5.034556,16.878492,17.684447
Age_99,210.251707,-111.965046,-6.941982,932.966537,-166.460682,-255.051165,-101.13707
Age_minus1,210.251707,135.785804,-5.034556,-166.460682,315.995504,262.712284,124.270305
Fare_999,130.495777,2761.031435,16.878492,-255.051165,262.712284,47525.470595,1063.944817
Fare_minus1,69.137162,2761.031435,17.684447,-101.13707,124.270305,1063.944817,2675.239405


In [17]:
# Correlation between every pair of columns. NaNs are dropped pair by pair,
# which is why an original column correlates exactly 1.0 with its own filled
# copies; compare the filled columns against the *other* features instead.
X_train.corr()

Unnamed: 0,Age,Fare,Family,Age_99,Age_minus1,Fare_999,Fare_minus1
Age,1.0,0.091482,-0.319651,1.0,1.0,0.039967,0.085422
Fare,0.091482,1.0,0.207193,-0.069365,0.145071,1.0,1.0
Family,-0.319651,0.207193,1.0,-0.135079,-0.168329,0.046016,0.203212
Age_99,1.0,-0.069365,-0.135079,1.0,-0.306576,-0.038303,-0.064017
Age_minus1,1.0,0.145071,-0.168329,-0.306576,1.0,0.067792,0.135159
Fare_999,0.039967,1.0,0.046016,-0.038303,0.067792,1.0,0.094357
Fare_minus1,0.085422,1.0,0.203212,-0.064017,0.135159,0.094357,1.0


# Using scikit-learn (`SimpleImputer` + `ColumnTransformer`)

In [18]:
# Redo the split from the untouched X so the scikit-learn section starts from
# frames without the manually filled columns. Same test_size and seed as the
# earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Two constant-fill imputers, one per placeholder value.
imputer1 = SimpleImputer(fill_value=99, strategy='constant')
imputer2 = SimpleImputer(fill_value=999, strategy='constant')

In [21]:
# Route each imputer to its own column; columns not listed here are passed
# through unchanged (remainder='passthrough').
trf = ColumnTransformer(
    transformers=[
        ('imputer1', imputer1, ['Age']),
        ('imputer2', imputer2, ['Fare']),
    ],
    remainder='passthrough',
)

In [22]:
# Fit the column-wise imputers on the training split only.
trf.fit(X_train)

0,1,2
,transformers,"[('imputer1', ...), ('imputer2', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,99
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,999
,copy,True
,add_indicator,False
,keep_empty_features,False


In [23]:
# Value the fitted Age imputer fills with; for strategy='constant' this is
# simply the configured fill_value.
trf.named_transformers_['imputer1'].statistics_

array([99.])

In [24]:
# Value the fitted Fare imputer fills with; for strategy='constant' this is
# simply the configured fill_value.
trf.named_transformers_['imputer2'].statistics_

array([999.])

In [25]:
# Apply the already-fitted transformer to both splits (train first, then test).
X_train_trf, X_test_trf = (trf.transform(split) for split in (X_train, X_test))

In [26]:
# The source frame is unchanged by transform(): the NaNs are still here,
# and the filled values live in X_train_trf.
X_train

Unnamed: 0,Age,Fare,Family
331,45.5,28.5000,0
733,23.0,13.0000,0
382,32.0,7.9250,0
704,26.0,7.8542,1
813,6.0,31.2750,6
...,...,...,...
106,21.0,7.6500,0
270,,31.0000,0
860,41.0,,2
435,14.0,120.0000,3
