In [47]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int, help='batch size')
parser.add_argument('--train_steps', default=10000, type=int,
                    help='number of training steps')


_StoreAction(option_strings=['--train_steps'], dest='train_steps', nargs=None, const=None, default=10000, type=<class 'int'>, choices=None, help='number of training steps', metavar=None)

In [48]:
train_data = pd.read_csv("./train.csv")
validate = pd.read_csv('./test.csv')

In [49]:
validate_data = pd.read_csv('./test.csv')

In [50]:
def remove_outliers(df=None,col_name=None):
    std =  df[col_name].std()
    m = df[col_name].mean()
    df['is_outlier'] = df[col_name].apply(lambda x : True if x < m - 2*std or x > m + 2*std else False)
    tmp_df = df[df['is_outlier']==False]
    return tmp_df.drop('is_outlier',axis=1)

In [51]:
def marriage_status(x):
    return str(str(x.split(',')[1]).split('.')[0]).strip().replace(" ",'_')

In [52]:
def normalize_series(series,max_x,min_x):
    return series.apply(lambda x: (x-min_x)/(max_x-min_x))

In [53]:
def data_cleaning(df):
    embarked_dummy = pd.get_dummies(df['Embarked'])
    sex_dummy = pd.get_dummies(df['Sex'])
    df['p_class']=df.Pclass.apply(lambda x : map_pclass(x))
    pclass_dummy = pd.get_dummies(df.p_class)
    df = remove_outliers(df,'Fare')
    df['n_fare'] = normalize_series(df.Fare,df.Fare.max(),df.Fare.min())
    df['m_s'] = df.Name.apply(lambda x: marriage_status(x))
    m_s_dummy = pd.get_dummies(df.m_s)
    df  = pd.concat([df,embarked_dummy,sex_dummy,pclass_dummy,m_s_dummy], axis=1)
    df = df.drop(['PassengerId','Name','Ticket','Cabin','Pclass','p_class','Sex','Embarked','Fare','m_s'],axis=1)
    df = df.fillna(0)

    return df

In [54]:
def map_pclass(x):
    if x == 1:
        return 'first_class'
    elif x == 2:
        return 'Second_class'
    elif x == 3:
        return 'third_class'
    else:
        return 'Unknown'


In [55]:
train_data = data_cleaning(train_data)

In [56]:
train_data['Dona'] = 0

In [57]:
train, test = train_test_split(train_data, test_size=0.2)

In [58]:
validate_data = data_cleaning(validate_data)

In [59]:
validate_data['Capt'] = 0
validate_data['Don'] = 0
validate_data['Jonkheer'] = 0
validate_data['Lady'] = 0
validate_data['Major'] = 0
validate_data['Mlle'] = 0
validate_data['Mme'] = 0
validate_data['Sir'] = 0
validate_data['the_Countess'] = 0 

In [60]:
test_x, test_y = test, test.pop('Survived')

In [61]:
train_x, train_y = train, train.pop('Survived')

In [62]:
def train_input_fn(features, labels, batch_size):
    """An input function for training"""
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    return dataset.shuffle(1000).repeat().batch(batch_size)

In [63]:
def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction"""
    features=dict(features)
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset

In [64]:
my_feature_columns = []
for key in train_x.keys():
    my_feature_columns.append(tf.feature_column.numeric_column(key=key))

In [65]:
my_feature_columns

[_NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='SibSp', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Parch', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='n_fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='C', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Q', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='S', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='female', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='male', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Second_class', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _Nume

In [66]:
HIDDEN_LAYER = [120,60,26]
DROPOUT=0.11

In [67]:
grid_search = pd.DataFrame(columns=['hidden_layer','dropout','accuracy'])

In [None]:
for i in range(20):
    classifier = tf.estimator.DNNClassifier(
        feature_columns=my_feature_columns,
        # Two hidden layers of 10 nodes each.
        hidden_units=HIDDEN_LAYER,
        # The model must choose between 3 classes.
        n_classes=2,
        dropout=DROPOUT)
    
    classifier.train(
        input_fn=lambda:train_input_fn(train_x,train_y,batch_size=1000), steps=1000)
    
    eval_result = classifier.evaluate(
        input_fn=lambda:eval_input_fn(test_x, test_y, 1000))

    print('\nTest set accuracy: {}\n'.format(eval_result['accuracy']))

    grid_search = grid_search.append({'hidden_layer':str(HIDDEN_LAYER).replace(',','_').replace(' ','').replace(']','').replace('[','')
                    ,'dropout':DROPOUT
                    ,'accuracy': eval_result['accuracy']},ignore_index=True)
    DROPOUT += 0.01
    HIDDEN_LAYER[0] = HIDDEN_LAYER[0]+20
    HIDDEN_LAYER[1] = HIDDEN_LAYER[1]+10
    HIDDEN_LAYER[2] = HIDDEN_LAYER[2]+1

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_log_step_count_steps': 100, '_service': None, '_model_dir': '/tmp/tmp3r_uv5o5', '_master': '', '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff12e938048>, '_task_type': 'worker', '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_num_ps_replicas': 0, '_is_chief': True, '_save_checkpoints_secs': 600, '_num_worker_replicas': 1, '_tf_random_seed': None, '_session_config': None, '_save_checkpoints_steps': None}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp3r_uv5o5/model.ckpt.
INFO:tensorflow:loss = 1173.6335, step = 1
INFO:tensorflow:global_step/sec: 46.2781
INFO:tensorflow:loss = 380.66083, step = 101 (2.162 sec)
INFO:tensorflow:global_step/sec: 48.7399
INFO:tensorflow:loss = 352.86304, step = 201 (2.052 sec)
INFO:tensorflow:global_step/sec: 47.4445
INFO:tensorflow:loss =

INFO:tensorflow:global_step/sec: 42.5999
INFO:tensorflow:loss = 312.9291, step = 501 (2.348 sec)
INFO:tensorflow:global_step/sec: 42.9222
INFO:tensorflow:loss = 306.7074, step = 601 (2.330 sec)
INFO:tensorflow:global_step/sec: 43.4423
INFO:tensorflow:loss = 299.82742, step = 701 (2.302 sec)
INFO:tensorflow:global_step/sec: 43.0232
INFO:tensorflow:loss = 283.94116, step = 801 (2.324 sec)
INFO:tensorflow:global_step/sec: 42.4438
INFO:tensorflow:loss = 302.76227, step = 901 (2.356 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmppbybgwe8/model.ckpt.
INFO:tensorflow:Loss for final step: 282.86792.
INFO:tensorflow:Starting evaluation at 2018-04-11-08:08:18
INFO:tensorflow:Restoring parameters from /tmp/tmppbybgwe8/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2018-04-11-08:08:19
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.849162, accuracy_baseline = 0.61452514, auc = 0.8760212, auc_precision_recall = 0.8722719, average_loss = 0.6054771, global_step =

INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_log_step_count_steps': 100, '_service': None, '_model_dir': '/tmp/tmpgoxmx3g1', '_master': '', '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff12e702588>, '_task_type': 'worker', '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_num_ps_replicas': 0, '_is_chief': True, '_save_checkpoints_secs': 600, '_num_worker_replicas': 1, '_tf_random_seed': None, '_session_config': None, '_save_checkpoints_steps': None}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpgoxmx3g1/model.ckpt.
INFO:tensorflow:loss = 957.8871, step = 1
INFO:tensorflow:global_step/sec: 36.5771
INFO:tensorflow:loss = 377.11414, step = 101 (2.735 sec)
INFO:tensorflow:global_step/sec: 37.4796
INFO:tensorflow:loss = 331.8958, step = 201 (2.668 sec)
INFO:tensorflow:global_step/sec: 36.3222
INFO:tensorflow:loss = 321.7287, step = 301 (2.753 sec)
INFO:t

INFO:tensorflow:loss = 312.43896, step = 601 (2.952 sec)
INFO:tensorflow:global_step/sec: 33.8768
INFO:tensorflow:loss = 311.06055, step = 701 (2.952 sec)
INFO:tensorflow:global_step/sec: 33.9402
INFO:tensorflow:loss = 276.69537, step = 801 (2.946 sec)
INFO:tensorflow:global_step/sec: 33.8992
INFO:tensorflow:loss = 286.58258, step = 901 (2.950 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmp1t7as1ja/model.ckpt.
INFO:tensorflow:Loss for final step: 268.91302.
INFO:tensorflow:Starting evaluation at 2018-04-11-08:11:49
INFO:tensorflow:Restoring parameters from /tmp/tmp1t7as1ja/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2018-04-11-08:11:50
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.8435754, accuracy_baseline = 0.61452514, auc = 0.8602108, auc_precision_recall = 0.85351956, average_loss = 0.68651885, global_step = 1000, label/mean = 0.38547486, loss = 122.88688, prediction/mean = 0.36478668

Test set accuracy: 0.8435754179954529

INFO:tensorflow

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp4gnpllcs/model.ckpt.
INFO:tensorflow:loss = 1047.2786, step = 1
INFO:tensorflow:global_step/sec: 29.228
INFO:tensorflow:loss = 393.83075, step = 101 (3.423 sec)
INFO:tensorflow:global_step/sec: 29.9094
INFO:tensorflow:loss = 353.1153, step = 201 (3.343 sec)
INFO:tensorflow:global_step/sec: 29.8163
INFO:tensorflow:loss = 336.8794, step = 301 (3.354 sec)
INFO:tensorflow:global_step/sec: 29.9762
INFO:tensorflow:loss = 334.34116, step = 401 (3.336 sec)
INFO:tensorflow:global_step/sec: 29.7644
INFO:tensorflow:loss = 349.9062, step = 501 (3.360 sec)
INFO:tensorflow:global_step/sec: 29.7042
INFO:tensorflow:loss = 303.6389, step = 601 (3.367 sec)
INFO:tensorflow:global_step/sec: 29.8795
INFO:tensorflow:loss = 345.15875, step = 701 (3.347 sec)
INFO:tensorflow:global_step/sec: 28.8724
INFO:tensorflow:loss = 318.88153, step = 801 (3.464 sec)
INFO:tensorflow:global_step/sec: 29.4263
INFO:tensorflow:lo

INFO:tensorflow:Restoring parameters from /tmp/tmpfn0twx_k/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2018-04-11-08:16:09
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.849162, accuracy_baseline = 0.61452514, auc = 0.8334651, auc_precision_recall = 0.83180875, average_loss = 0.80463105, global_step = 1000, label/mean = 0.38547486, loss = 144.02896, prediction/mean = 0.3568298

Test set accuracy: 0.8491619825363159

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_log_step_count_steps': 100, '_service': None, '_model_dir': '/tmp/tmpbaxynbw1', '_master': '', '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff1244bdbe0>, '_task_type': 'worker', '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_num_ps_replicas': 0, '_is_chief': True, '_save_checkpoints_secs': 600, '_num_worker_replicas': 1, '_tf_random_seed': None, '_session_config': None, '_save_

In [147]:
grid_search

Unnamed: 0,hidden_layer,dropout,accuracy
0,100_50_25,0.1,0.804469
1,120_60_26,0.11,0.821229
2,140_70_27,0.12,0.815642
3,160_80_28,0.13,0.804469
4,180_90_29,0.14,0.798883
5,200_100_30,0.15,0.815642
6,220_110_31,0.16,0.787709
7,240_120_32,0.17,0.787709
8,260_130_33,0.18,0.804469
9,280_140_34,0.19,0.810056


In [149]:
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 10 nodes each.
    hidden_units=HIDDEN_LAYER,
    # The model must choose between 3 classes.
    n_classes=2,
    dropout=DROPOUT)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_num_worker_replicas': 1, '_master': '', '_save_checkpoints_secs': 600, '_task_type': 'worker', '_num_ps_replicas': 0, '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': None, '_service': None, '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_is_chief': True, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpls5mkcod', '_task_id': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9c80fb1518>}


In [155]:
classifier.train(
    input_fn=lambda:train_input_fn(train_x,train_y,batch_size=10000), steps=10000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tmpls5mkcod/model.ckpt-10000
INFO:tensorflow:Saving checkpoints for 10001 into /tmp/tmpls5mkcod/model.ckpt.
INFO:tensorflow:step = 10001, loss = 2265.2368
INFO:tensorflow:global_step/sec: 8.89445
INFO:tensorflow:step = 10101, loss = 2628.4646 (11.244 sec)
INFO:tensorflow:global_step/sec: 8.89235
INFO:tensorflow:step = 10201, loss = 2422.9053 (11.246 sec)
INFO:tensorflow:global_step/sec: 8.99531
INFO:tensorflow:step = 10301, loss = 2492.442 (11.117 sec)


KeyboardInterrupt: 

In [151]:
eval_result = classifier.evaluate(
    input_fn=lambda:eval_input_fn(test_x, test_y, 1000))

print('\nTest set accuracy: {}\n'.format(eval_result['accuracy']))

grid_search = grid_search.append({'hidden_layer':str(HIDDEN_LAYER).replace(',','_').replace(' ','').replace(']','').replace('[','')
                    ,'dropout':DROPOUT
                    ,'accuracy': eval_result['accuracy']},ignore_index=True)

INFO:tensorflow:Starting evaluation at 2018-04-10-09:56:33
INFO:tensorflow:Restoring parameters from /tmp/tmpls5mkcod/model.ckpt-10000
INFO:tensorflow:Finished evaluation at 2018-04-10-09:56:34
INFO:tensorflow:Saving dict for global step 10000: accuracy = 0.7932961, accuracy_baseline = 0.61452514, auc = 0.8262187, auc_precision_recall = 0.81394875, average_loss = 1.8543111, global_step = 10000, label/mean = 0.38547486, loss = 331.9217, prediction/mean = 0.47577202

Test set accuracy: 0.7932960987091064



In [152]:
predictions = classifier.predict(
    input_fn=lambda:eval_input_fn(validate_data,None,batch_size=1000))
result_df = pd.DataFrame(columns=['Survived'])
for predic in predictions:
    row = {'Survived': predic['class_ids'][0]}
    result_df = result_df.append(row,ignore_index=True)

INFO:tensorflow:Restoring parameters from /tmp/tmpls5mkcod/model.ckpt-10000


In [153]:
df = pd.concat([validate.PassengerId,result_df], axis=1)

In [154]:
df.to_csv('./{}_{}_restul.csv'.format(str(HIDDEN_LAYER).replace(',','_').replace(' ','').replace(']','').replace('[',''),
                                      int(DROPOUT*10),index=False))

In [474]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier


clf = RandomForestClassifier(n_estimators=26, max_depth=4, min_samples_split=4, random_state=0,)
clf.fit(train_x,train_y)
scores = cross_val_score(clf, test_x, test_y)
scores.mean()                             

0.8383810317680838

In [475]:
train_x.columns

Index(['Age', 'SibSp', 'Parch', 'n_fare', 'C', 'Q', 'S', 'female', 'male',
       'Second_class', 'first_class', 'third_class', 'Capt', 'Col', 'Don',
       'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme',
       'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the_Countess', 'Dona'],
      dtype='object')

In [476]:
validate_data.columns

Index(['Age', 'SibSp', 'Parch', 'n_fare', 'C', 'Q', 'S', 'female', 'male',
       'Second_class', 'first_class', 'third_class', 'Col', 'Dona', 'Dr',
       'Master', 'Miss', 'Mr', 'Mrs', 'Ms', 'Rev', 'Capt', 'Don', 'Jonkheer',
       'Lady', 'Major', 'Mlle', 'Mme', 'Sir', 'the_Countess'],
      dtype='object')

In [477]:
result_df = pd.DataFrame(columns=['Survived'])
predictions = clf.predict(validate_data)
for predic in predictions:
    row = {'Survived': predic}
    result_df = result_df.append(row,ignore_index=True)
df = pd.concat([validate.PassengerId,result_df], axis=1)

In [478]:
df.to_csv('rf.csv',index=False)

In [276]:
train_x.columns

Index(['Age', 'SibSp', 'Parch', 'n_fare', 'C', 'Q', 'S', 'female', 'male',
       'Second_class', 'first_class', 'third_class'],
      dtype='object')

In [281]:
validate_data.columns

Index(['Age', 'SibSp', 'Parch', 'n_fare', 'C', 'Q', 'S', 'female', 'male',
       'Second_class', 'first_class', 'third_class'],
      dtype='object')