In [2]:
import urllib.request
import os

In [8]:
url = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath = "data/titanic3.xls"
if not os.path.isfile(filepath):
    result = urllib.request.urlretrieve(url,filepath)
    print("downloaded:", result)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import cifar10
from keras.utils import np_utils
np.set_printoptions(threshold=np.inf) #show all array
np.random.seed(10)

Using TensorFlow backend.


In [9]:
all_df = pd.read_excel(filepath)

In [10]:
all_df[:2]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [11]:
cols = ["survived", "name", "pclass", "sex", "age", "sibsp",
       "parch", "fare", "embarked"]
all_df = all_df[cols]

In [12]:
all_df[:2]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S


In [13]:
df = all_df.drop(["name"], axis = 1)

In [14]:
all_df.isnull().sum()

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [15]:
age_mean = df["age"].mean()
df["age"] = df["age"].fillna(age_mean)

In [16]:
fare_mean = df["fare"].mean()
df["fare"] = df["fare"].fillna(age_mean)

In [17]:
df["sex"] = df["sex"].map({"female":0, "male":0}).astype(int)

In [18]:
x_OneHot_df = pd.get_dummies(data = df, columns = ["embarked"])

In [19]:
x_OneHot_df[:2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,0,0.9167,1,2,151.55,0,0,1


In [20]:
ndarray = x_OneHot_df.valuesr

In [21]:
ndarray.shape

(1309, 10)

In [23]:
ndarray[:2]

array([[  1.    ,   1.    ,   0.    ,  29.    ,   0.    ,   0.    ,
        211.3375,   0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   0.    ,   0.9167,   1.    ,   2.    ,
        151.55  ,   0.    ,   0.    ,   1.    ]])

In [24]:
Label = ndarray[:,0]
Features = ndarray[:,1:]

In [25]:
Label[:2]

array([1., 1.])

In [26]:
Features[:2]

array([[  1.    ,   0.    ,  29.    ,   0.    ,   0.    , 211.3375,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   0.    ,   0.9167,   1.    ,   2.    , 151.55  ,
          0.    ,   0.    ,   1.    ]])

In [27]:
from sklearn import preprocessing

In [28]:
minmax_scale = preprocessing.MinMaxScaler(feature_range = (0, 1))

In [29]:
scaledFeatures = minmax_scale.fit_transform(Features)

In [30]:
scaledFeatures[:2]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

In [35]:
msk = np.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
test_df = all_df[~msk]

In [36]:
print("total:", len(all_df),
     "train:", len(train_df),
     "test:", len(test_df))

total: 1309 train: 1063 test: 246


In [39]:
def PreprocessData(raw_df):
    df = raw_df.drop(["name"], axis = 1)
    age_mean = df["age"].mean()
    df["age"] = df["age"].fillna(age_mean)
    fare_mean = df["fare"].mean()
    df["fare"] = df["fare"].fillna(age_mean)
    df["sex"] = df["sex"].map({"female":0, "male":0}).astype(int)
    x_OneHot_df = pd.get_dummies(data = df, columns = ["embarked"])
    
    ndarray = x_OneHot_df.values
    Features = ndarray[:,1:]
    Label = ndarray[:,0]
    
    minmax_scale = preprocessing.MinMaxScaler(feature_range = (0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)
    
    return scaledFeatures, Label


In [41]:
train_Features, train_Label = PreprocessData(train_df)
test_Features, test_Label = PreprocessData(test_df)

In [42]:
train_Features[:2]

array([[0.        , 0.        , 0.35916204, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.00628273, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

In [43]:
train_Label[:2]

array([1., 1.])