In [1]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

In [2]:
file_path = 'data/titanic3.xls'

In [3]:
all_df = pd.read_excel(file_path)

In [4]:
all_df[:2]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [5]:
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]

In [6]:
all_df[:2]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S


In [7]:
df = all_df.drop(['name'], axis=1)

In [8]:
all_df.isnull().sum()

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [9]:
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)

In [10]:
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)

In [11]:
df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)

In [12]:
x_one_hot_df = pd.get_dummies(data=df, columns=['embarked'])

In [13]:
x_one_hot_df[:2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1


In [14]:
ndarray = x_one_hot_df.values

In [15]:
ndarray.shape

(1309, 10)

In [16]:
ndarray[:2]

array([[  1.    ,   1.    ,   0.    ,  29.    ,   0.    ,   0.    ,
        211.3375,   0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   1.    ,   0.9167,   1.    ,   2.    ,
        151.55  ,   0.    ,   0.    ,   1.    ]])

In [17]:
label = ndarray[:,0]
features = ndarray[:,1:]

In [18]:
label[:2]

array([1., 1.])

In [19]:
features[:2]

array([[  1.    ,   0.    ,  29.    ,   0.    ,   0.    , 211.3375,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   0.9167,   1.    ,   2.    , 151.55  ,
          0.    ,   0.    ,   1.    ]])

In [20]:
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [21]:
scaled_features = minmax_scale.fit_transform(features)

In [22]:
scaled_features[:2]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

In [23]:
msk = np.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
test_df = all_df[~msk]

In [24]:
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))

total: 1309 train: 1054 test: 255


In [25]:
def preprocess_data(raw_df):
    df = raw_df.drop(['name'], axis=1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)    
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    x_one_hot_df = pd.get_dummies(data=df, columns=['embarked'])
    ndarray = x_one_hot_df.values
    label = ndarray[:,0]
    features = ndarray[:,1:]
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaled_features = minmax_scale.fit_transform(features)
    
    return scaled_features, label

In [26]:
train_feature, train_label = preprocess_data(train_df)
test_feature, test_label = preprocess_data(test_df)

In [27]:
train_feature[:2]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

In [28]:
train_label[:2]

array([1., 1.])