In [1]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

# plt.style.use('seaborn-colorblind')
# %matplotlib inline
#from feature_cleaning import rare_values as ra

## Load Dataset

In [2]:
# Only the columns used in this walkthrough are read from disk:
# four numeric/ordinal features, one categorical feature, and the target.
use_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Survived']

# usecols restricts parsing to the columns listed above.
data = pd.read_csv(
    'titanic.csv',
    usecols=use_cols,
)


In [3]:
# Peek at the first three rows to confirm the selected columns were loaded.
data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.25
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.925


In [4]:
# Keep a handle on the target column before it is removed from `data`.
survived = data['Survived']
# Display it to confirm the expected 0/1 labels are present.
survived

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [5]:
# Remove the target column from `data` in place; pop() returns the removed
# Series, which is what gets displayed below.
# NOTE: not idempotent -- re-running this cell without reloading `data`
# raises KeyError because the column is already gone.
data.pop('Survived')

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [6]:
# Confirm the saved target Series is still intact after the pop above.
survived

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [7]:
# Split features and target into train (70%) and test (30%) sets.
# The target column was popped out of `data` above, so X_train / X_test
# contain only the five feature columns; the labels live in y_train / y_test.
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(data, survived, test_size=0.3,
                                                    random_state=124)
X_train.shape, X_test.shape

((623, 5), (268, 5))

In [8]:
# Display the training features (pandas truncates the middle rows in the
# rendered output). Note the shuffled index and the NaN values in Age.
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare
240,3,female,,1,14.4542
129,3,male,45.0,0,6.9750
386,3,male,1.0,5,46.9000
171,3,male,4.0,4,29.1250
659,1,male,58.0,0,113.2750
...,...,...,...,...,...
681,1,male,27.0,0,76.7292
135,2,male,23.0,0,15.0458
17,2,male,,0,13.0000
668,3,male,43.0,0,8.0500


In [9]:
# First training labels; the index should line up with X_train's rows.
y_train.head()

240    0
129    0
386    0
171    0
659    0
Name: Survived, dtype: int64

In [10]:
# First rows of the held-out features (same columns as X_train).
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare
781,1,female,17.0,1,57.0
725,3,male,20.0,0,8.6625
138,3,male,16.0,0,9.2167
47,3,female,,0,7.75
871,1,female,47.0,1,52.5542


In [11]:
# First held-out labels; the index should line up with X_test's rows.
y_test.head()

781    1
725    0
138    0
47     1
871    1
Name: Survived, dtype: int64

## Normalization - Standardization (Z-score scaling)

Removes the mean and scales the data to unit variance.<br />z = (X - X.mean) / X.std

In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the Fare mean and standard deviation from the training rows only.
ss = StandardScaler()
ss.fit(X_train[['Fare']])

# Work on a deep copy so X_train itself keeps its original columns.
X_train_copy = X_train.copy(deep=True)
X_train_copy['Fare_zscore'] = ss.transform(X_train_copy[["Fare"]])

# Show the frame with the new standardized column appended.
X_train_copy

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Fare_zscore
240,3,female,,1,14.4542,-0.365857
129,3,male,45.0,0,6.9750,-0.512691
386,3,male,1.0,5,46.9000,0.271128
171,3,male,4.0,4,29.1250,-0.077836
659,1,male,58.0,0,113.2750,1.574220
...,...,...,...,...,...,...
681,1,male,27.0,0,76.7292,0.856743
135,2,male,23.0,0,15.0458,-0.354243
17,2,male,,0,13.0000,-0.394406
668,3,male,43.0,0,8.0500,-0.491586


In [13]:
from sklearn.preprocessing import StandardScaler

# Start from a fresh deep copy so the z-score column is the only addition.
X_train_copy = X_train.copy(deep=True)

# Fit on the training Fare values, then store the standardized values
# next to the original column.
ss = StandardScaler()
ss.fit(X_train[['Fare']])
X_train_copy['Fare_zscore'] = ss.transform(X_train_copy[['Fare']])

print(X_train_copy.head(6))

     Pclass     Sex   Age  SibSp      Fare  Fare_zscore
240       3  female   NaN      1   14.4542    -0.365857
129       3    male  45.0      0    6.9750    -0.512691
386       3    male   1.0      5   46.9000     0.271128
171       3    male   4.0      4   29.1250    -0.077836
659       1    male  58.0      0  113.2750     1.574220
534       3  female  30.0      0    8.6625    -0.479561


In [14]:
# Check that the standardized training column has mean ~0 and std ~1.
# The mean is zero up to floating-point error.
# The printed std is slightly above 1 because pandas' Series.std() defaults
# to the sample estimate (ddof=1), whereas StandardScaler divides by the
# population std (ddof=0). With 623 training rows the ratio is
# sqrt(623 / 622) ~= 1.0008, which is exactly the value shown below.
# Calling .std(ddof=0) instead would give ~1.0.
print(X_train_copy['Fare_zscore'].mean())
print(X_train_copy['Fare_zscore'].std())


-2.7287785015365504e-17
1.0008035356861011


## Min-Max scaling
Transforms features by scaling each feature to a given range. Defaults to [0, 1].<br />X_scaled = (X - X.min) / (X.max - X.min)

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Fresh deep copy of the training features to hold this scaler's output.
X_train_copy = X_train.copy(deep=True)

# The Fare minimum and maximum are learned from the training rows, then
# the rescaled values are stored as a new column.
mms = MinMaxScaler()
mms.fit(X_train[['Fare']])
X_train_copy['Fare_minmax'] = mms.transform(X_train_copy[['Fare']])

print(X_train_copy.head(6))

     Pclass     Sex   Age  SibSp      Fare  Fare_minmax
240       3  female   NaN      1   14.4542     0.028213
129       3    male  45.0      0    6.9750     0.013614
386       3    male   1.0      5   46.9000     0.091543
171       3    male   4.0      4   29.1250     0.056848
659       1    male  58.0      0  113.2750     0.221098
534       3  female  30.0      0    8.6625     0.016908


In [16]:
# Sanity check on the training column: the scaler was fit on these same
# values, so the largest should print as 1.0 and the smallest as 0.0.
print(
    X_train_copy['Fare_minmax'].max(),
    X_train_copy['Fare_minmax'].min(),
    sep='\n',
)

1.0
0.0


## Robust scaling
removes the median and scales the data according to the quantile range (defaults to IQR)<br />X_scaled = (X - X.median) / IQR

In [17]:
from sklearn.preprocessing import RobustScaler

# Fresh deep copy of the training features to hold this scaler's output.
X_train_copy = X_train.copy(deep=True)

# The scaler is fit on the training Fare values only; the transformed
# values are stored as a new column next to the original.
rs = RobustScaler()
rs.fit(X_train[['Fare']])
X_train_copy['Fare_robust'] = rs.transform(X_train_copy[['Fare']])

print(X_train_copy.head(6))

     Pclass     Sex   Age  SibSp      Fare  Fare_robust
240       3  female   NaN      1   14.4542    -0.001959
129       3    male  45.0      0    6.9750    -0.321867
386       3    male   1.0      5   46.9000     1.385847
171       3    male   4.0      4   29.1250     0.625556
659       1    male  58.0      0  113.2750     4.224909
534       3  female  30.0      0    8.6625    -0.249688
