In [1]:
import math
import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment', None)

In [2]:
# Training and test data files
train_file = '../input/train.csv'
test_file = '../input/test.csv'
model_file_male = '../output/titanic.model.male.json'
model_weights_file_male = '../output/titanic.model.male.best.hdf5'
model_file_female = '../output/titanic.model.female.json'
model_weights_file_female = '../output/titanic.model.female.best.hdf5'
pred_file = '../output/gender_submission.csv'

In [3]:
# Prepare the data for training and testing
from sklearn.preprocessing import MinMaxScaler

title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev', 'Dr', 'Ms', 'Mlle',
            'Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer']

import string
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if substring in big_string:
            return substring
    return np.nan

def prep_data(frame, augmentation=0):
    # Fill missing Age data with median 
    frame['Age'] = frame['Age'].fillna(frame['Age'].mean())
    
    # Generate data about whether adult or minor
    frame['Adult_Or_Minor'] = frame.apply(lambda row: 0 if row['Age'] < 18 else 1, axis=1)

    # Generate data about whether senior citizen
    frame['Senior_Citizen'] = frame.apply(lambda row: 0 if row['Age'] > 65 else 1, axis=1)

    # Fill missing Fare data with median
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].median())
    
    # Creating new family_size and fare per person columns 
    frame['Family_Size'] = frame['SibSp'] + frame['Parch'] + 1
    frame['Alone'] = frame.apply(lambda row: 1 if row['Family_Size'] == 1 else 0, axis=1)
    frame['Fare_Per_Person'] = frame['Fare']/frame['Family_Size']

    # Convert Sex to number
    #frame['Sex'] = pd.Categorical(frame['Sex']).codes
    frame.pop('Sex')

    # Generate data for missing Embarked and convert to number
    frame['Embarked'] = frame['Embarked'].fillna('X')
    frame['Embarked'] = pd.Categorical(frame['Embarked']).codes
    
    # Extract title from name
    frame['Title'] = frame['Name'].map(lambda x: substrings_in_string(x, title_list))
    frame['Title'] = pd.Categorical(frame['Title']).codes

    # Convert Name into characters
    frame['Name_Length'] = frame.apply(lambda row: len(row['Name']), axis=1)
    frame['Words_In_Name'] = frame.apply(lambda row: len(row['Name'].split()), axis=1)    
    frame.pop('Name')    
    
    # Convert Ticket into characters
    frame['Ticket_Length'] = frame.apply(lambda row: len(row['Ticket']), axis=1)
    frame.pop('Ticket')    
    
    # Convert Cabin column to whether in cabin
    frame['Cabin'] = frame['Cabin'].fillna('')
    frame['In_Cabin'] = frame.apply(lambda row: 1 if row['Cabin'] != '' else 0, axis=1)
    frame['Number_Of_Cabins'] = frame.apply(lambda row: len(row['Cabin'].split()), axis=1)    
    frame.pop('Cabin')
    
    frame.fillna(0, axis=1)
    
    # Introduce rows with some noise
    if augmentation > 0:
        print('Adding more rows to training data')
        row_count = frame.shape[0]
        print('Row count before: ', row_count)
        col_std = np.std(frame) 
        for i in range(0, row_count):
            rand = np.random.random_sample()
            if rand < augmentation:
                row1 = pd.Series(frame.iloc[i])
                row2 = pd.Series(frame.iloc[i])
                col_list = frame.columns.tolist()
                col_list.remove('PassengerId')
                col_list.remove('Survived')
                for col in col_list:
                    row1[col] = row1[col] + rand * col_std[col]
                    row2[col] = row2[col] - rand * col_std[col]
                frame = frame.append(row1)
                frame = frame.append(row2)
        row_count = frame.shape[0]
        print('Row count after: ', row_count)
    
    
    print("Before scaling: ")
    print(frame.head())
    
    # Scale everything except PassengerId
    min_max_scaler = MinMaxScaler()
    col_list = frame.columns.tolist()
    col_list.remove('PassengerId')
    frame = frame[col_list]
    np_scaled = min_max_scaler.fit_transform(frame)
    frame = pd.DataFrame(np_scaled)
    
    print("After scaling: ")
    print(frame.head())

    return frame



In [4]:
# Load training data
df_train_raw = pd.read_csv(train_file)
print(df_train_raw.shape)
df_train_raw.info()
df_train_raw.head()

(891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Prep training data
df_train_male_raw = df_train_raw[df_train_raw.Sex == 'male']
df_train_female_raw = df_train_raw[df_train_raw.Sex == 'female']
df_train_male = prep_data(df_train_male_raw, augmentation=0.05)
df_train_female = prep_data(df_train_female_raw, augmentation=0.015)

Adding more rows to training data
Row count before:  577
Row count after:  651
Before scaling: 
   PassengerId  Survived  Pclass        Age  SibSp  Parch     Fare  Embarked  \
0          1.0       0.0     3.0  22.000000    1.0    0.0   7.2500       2.0   
4          5.0       0.0     3.0  35.000000    0.0    0.0   8.0500       2.0   
5          6.0       0.0     3.0  30.726645    0.0    0.0   8.4583       1.0   
6          7.0       0.0     1.0  54.000000    0.0    0.0  51.8625       2.0   
7          8.0       0.0     3.0   2.000000    3.0    1.0  21.0750       2.0   

   Adult_Or_Minor  Senior_Citizen  Family_Size  Alone  Fare_Per_Person  Title  \
0             1.0             1.0          2.0    0.0           3.6250    7.0   
4             1.0             1.0          1.0    1.0           8.0500    7.0   
5             1.0             1.0          1.0    1.0           8.4583    7.0   
6             1.0             1.0          1.0    1.0          51.8625    7.0   
7             0.0 

In [6]:
# Construct the X array for males
X_train_male = np.array(df_train_male)[:,1:]
X_train_male = X_train_male.astype('float32')
print(X_train_male.shape)
print(X_train_male[0])

# Construct the X array for females
X_train_female = np.array(df_train_female)[:,1:]
X_train_female = X_train_female.astype('float32')
print(X_train_female.shape)
print(X_train_female[0])

(651, 17)
[0.9951812  0.27117366 0.1263802  0.00145553 0.01415106 1.
 0.99643046 0.9986089  0.10158069 0.         0.00707553 0.875
 0.2972973  0.2516391  0.4        0.00460892 0.00194779]
(318, 17)
[0.         0.59839356 0.12589778 0.         0.12764232 0.
 1.         0.         0.1        0.00347583 0.06750725 0.8333333
 0.530303   0.36363637 0.30769232 0.9967229  0.25      ]


In [7]:
# Extract survived data as predictions
from keras.utils.np_utils import to_categorical

y_train_male = np.array(df_train_male)[:,0]
y_train_male = y_train_male.astype('int')
y_train_male = to_categorical(y_train_male, 2)
print(y_train_male.shape)
print(y_train_male[0:5])

y_train_female = np.array(df_train_female)[:,0]
y_train_female = y_train_female.astype('int')
y_train_female = to_categorical(y_train_female, 2)
print(y_train_female.shape)
print(y_train_female[0:5])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


(651, 2)
[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]
(318, 2)
[[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [8]:
# Load test data
df_test_raw = pd.read_csv(test_file)
print(df_test_raw.shape)
df_test_raw.head()
df_test_raw.info()

(418, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [9]:
# Prepare the data for testing
df_test_male_raw = df_test_raw[df_test_raw.Sex == 'male']
df_test_female_raw = df_test_raw[df_test_raw.Sex == 'female']
df_test_male = prep_data(df_test_male_raw)
df_test_female = prep_data(df_test_female_raw)

Before scaling: 
   PassengerId  Pclass   Age  SibSp  Parch     Fare  Embarked  Adult_Or_Minor  \
0          892       3  34.5      0      0   7.8292         1               1   
2          894       2  62.0      0      0   9.6875         1               1   
3          895       3  27.0      0      0   8.6625         2               1   
5          897       3  14.0      0      0   9.2250         2               0   
7          899       2  26.0      1      1  29.0000         2               1   

   Senior_Citizen  Family_Size  Alone  Fare_Per_Person  Title  Name_Length  \
0               1            1      1         7.829200      3           16   
2               1            1      1         9.687500      3           25   
3               1            1      1         8.662500      3           16   
5               1            1      1         9.225000      3           26   
7               1            3      0         9.666667      3           28   

   Words_In_Name  Ticket_Le

In [10]:
# Construct the X array for males
X_test_male = np.array(df_test_male)[:,:]
X_test_male = X_test_male.astype('float32')
print(X_test_male.shape)
print(X_test_male[0])

# Construct the X array for females
X_test_female = np.array(df_test_female)[:,:]
X_test_female = X_test_female.astype('float32')
print(X_test_female.shape)
print(X_test_female[0])

(266, 17)
[1.         0.51252437 0.         0.         0.02983973 0.5
 1.         1.         0.         1.         0.03701749 0.75
 0.075      0.         0.2        0.         0.        ]
(152, 17)
[1.0000000e+00 6.1756563e-01 1.2500000e-01 0.0000000e+00 9.8935612e-05
 1.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e-01 0.0000000e+00
 9.1450913e-03 6.6666669e-01 3.5416666e-01 4.0000001e-01 1.4285715e-01
 0.0000000e+00 0.0000000e+00]


In [11]:
# Build a training network

from keras.models import Sequential
from keras.layers import Dense, Dropout, RepeatVector, Flatten, Activation
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import SGD
from keras.layers.advanced_activations import LeakyReLU

def build_model(input_shape):
    model = Sequential()
    model.add(Dense(891, activation='relu', input_shape=input_shape))
    model.add(Dropout(0.25))
    model.add(Dense(445, activation='relu'))
    model.add(Dropout(0.5))
#    model.add(Dense(222, activation='relu'))
#    model.add(Dropout(0.75))
    model.add(Dense(2, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()
    
    return model

In [12]:
# Save the model
def save_model(model, model_file):
    model_json = model.to_json()
    with open(model_file, 'w') as json_file:
        json_file.write(model_json)

In [13]:
# Train the model
def train_model(model, model_weights_file, X_train, y_train):
    checkpointer = ModelCheckpoint(filepath=model_weights_file, verbose=1, save_best_only=True)
    stopper = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=20, verbose=1, mode='auto')
    hist = model.fit(X_train, y_train, epochs=200, batch_size=20, validation_split=0.3,
                     callbacks=[checkpointer, stopper], 
                     verbose=1, shuffle=True)

In [14]:
# Build and train model for males
model_male = build_model(input_shape=(X_train_male.shape[1],))
save_model(model_male, model_file_male)
train_model(model_male, model_weights_file_male, X_train_male, y_train_male)

# Load the weights that yielded the best validation accuracy
model_male.load_weights(model_weights_file_male)

# Evaluate the model on the training set
score_male = model_male.evaluate(X_train_male, y_train_male)
print("\nTraining Accuracy:", score_male[1])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 891)               16038     
_________________________________________________________________
dropout_1 (Dropout)          (None, 891)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 445)               396940    
_________________________________________________________________
dropout_2 (Dropout)          (None, 445)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 892       
Total params: 413,870
Trainable params: 413,870
Non-trainable params: 0
_________________________________________________________________
Train on 455 samples, validate on 196 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.37271, saving model to ../output


Epoch 00037: val_loss did not improve
Epoch 38/200

Epoch 00038: val_loss did not improve
Epoch 39/200

Epoch 00039: val_loss did not improve
Epoch 40/200

Epoch 00040: val_loss did not improve
Epoch 41/200

Epoch 00041: val_loss improved from 0.29250 to 0.28179, saving model to ../output/titanic.model.male.best.hdf5
Epoch 42/200

Epoch 00042: val_loss did not improve
Epoch 43/200

Epoch 00043: val_loss did not improve
Epoch 44/200

Epoch 00044: val_loss did not improve
Epoch 45/200

Epoch 00045: val_loss did not improve
Epoch 46/200

Epoch 00046: val_loss did not improve
Epoch 47/200

Epoch 00047: val_loss did not improve
Epoch 48/200

Epoch 00048: val_loss did not improve
Epoch 49/200

Epoch 00049: val_loss did not improve
Epoch 50/200

Epoch 00050: val_loss did not improve
Epoch 51/200

Epoch 00051: val_loss did not improve
Epoch 52/200

Epoch 00052: val_loss did not improve
Epoch 53/200

Epoch 00053: val_loss did not improve
Epoch 54/200

Epoch 00054: val_loss improved from 0.2817

In [15]:
# Build and train model for males
model_female = build_model(input_shape=(X_train_female.shape[1],))
save_model(model_female, model_file_female)
train_model(model_female, model_weights_file_female, X_train_female, y_train_female)

# Load the weights that yielded the best validation accuracy
model_female.load_weights(model_weights_file_female)

# Evaluate the model on the training set
score_female = model_female.evaluate(X_train_female, y_train_female)
print("\nTraining Accuracy:", score_female[1])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 891)               16038     
_________________________________________________________________
dropout_3 (Dropout)          (None, 891)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 445)               396940    
_________________________________________________________________
dropout_4 (Dropout)          (None, 445)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 892       
Total params: 413,870
Trainable params: 413,870
Non-trainable params: 0
_________________________________________________________________
Train on 222 samples, validate on 96 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.57888, saving model to ../output/


Epoch 00035: val_loss did not improve
Epoch 36/200

Epoch 00036: val_loss did not improve
Epoch 37/200

Epoch 00037: val_loss did not improve
Epoch 38/200

Epoch 00038: val_loss did not improve
Epoch 39/200

Epoch 00039: val_loss did not improve
Epoch 40/200

Epoch 00040: val_loss did not improve
Epoch 41/200

Epoch 00041: val_loss did not improve
Epoch 42/200

Epoch 00042: val_loss improved from 0.37564 to 0.37206, saving model to ../output/titanic.model.female.best.hdf5
Epoch 43/200

Epoch 00043: val_loss did not improve
Epoch 44/200

Epoch 00044: val_loss did not improve
Epoch 45/200

Epoch 00045: val_loss did not improve
Epoch 46/200

Epoch 00046: val_loss did not improve
Epoch 47/200

Epoch 00047: val_loss did not improve
Epoch 48/200

Epoch 00048: val_loss did not improve
Epoch 49/200

Epoch 00049: val_loss did not improve
Epoch 50/200

Epoch 00050: val_loss did not improve
Epoch 51/200

Epoch 00051: val_loss did not improve
Epoch 52/200

Epoch 00052: val_loss did not improve
Ep

In [16]:
# Predict for test data
y_test_male = model_male.predict(X_test_male)
print(y_test_male[0])

y_test_female = model_female.predict(X_test_female)
print(y_test_female[0])

[0.9525753 0.0463152]
[0.5603798  0.43398172]


In [17]:
# Save predictions
with open(pred_file, 'w') as f:
    f.write('PassengerId,Survived\n')
    for index, y_hat in enumerate(y_test_male):
        prediction = np.argmax(y_hat)
        f.write(str(int(df_test_male_raw.iloc[index]['PassengerId'])) + ',' + str(prediction)+'\n')
    for index, y_hat in enumerate(y_test_female):
        prediction = np.argmax(y_hat)
        f.write(str(int(df_test_female_raw.iloc[index]['PassengerId'])) + ',' + str(prediction)+'\n')
    f.close()