In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [2]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [3]:
train_df = train_df.drop(["PassengerId","Name","Ticket"], axis=1)
test_df = test_df.drop(["PassengerId","Name","Ticket"], axis=1)

In [4]:
print ("Train")
print (train_df.isnull().sum() )
print ("-------")
print ("Test")
print (test_df.isnull().sum() )

Train
Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64
-------
Test
Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Fare          1
Cabin       327
Embarked      0
dtype: int64


In [5]:
combined_df = pd.concat([train_df, test_df])

# get mean values per gender
male_mean_age = combined_df[combined_df["Sex"]=="male"]["Age"].mean()
female_mean_age = combined_df[combined_df["Sex"]=="female"]["Age"].mean()
print ("female mean age: %1.0f" %female_mean_age )
print ("male mean age: %1.0f" %male_mean_age )

# fill the nan values 
train_df.loc[ (train_df["Sex"]=="male") & (train_df["Age"].isnull()), "Age"] = male_mean_age
train_df.loc[ (train_df["Sex"]=="female") & (train_df["Age"].isnull()), "Age"] = female_mean_age

test_df.loc[ (test_df["Sex"]=="male") & (test_df["Age"].isnull()), "Age"] = male_mean_age
test_df.loc[ (test_df["Sex"]=="female") & (test_df["Age"].isnull()), "Age"] = female_mean_age

female mean age: 29
male mean age: 31


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [6]:
train_df["Cabin"] = train_df["Cabin"].fillna("X")
test_df["Cabin"] = test_df["Cabin"].fillna("X")

In [7]:
mean_fare = combined_df["Fare"].mean()
test_df["Fare"] = test_df["Fare"].fillna(mean_fare)

In [8]:
train_df["Embarked"] = train_df["Embarked"].fillna("S")
test_df["Embarked"] = test_df["Embarked"].fillna("S")

In [9]:
# sampling 80% for train data
train_set = train_df.sample(frac=0.8, replace=False, random_state=777)
# the other 20% is reserverd for cross validation
cv_set = train_df.loc[ set(train_df.index) - set(train_set.index)]

print ("train set shape (%i,%i)"  %train_set.shape)
print ("cv set shape (%i,%i)"   %cv_set.shape)
print ("Check if they have common indexes. The folowing line should be an empty set:")
print (set(train_set.index) & set(cv_set.index))

train set shape (713,9)
cv set shape (178,9)
Check if they have common indexes. The folowing line should be an empty set:
set()


In [17]:
# defining numeric columns
pclass_feature = tf.feature_column.numeric_column('Pclass')
parch_feature = tf.feature_column.numeric_column('Parch')
fare_feature = tf.feature_column.numeric_column('Fare')
age_feature = tf.feature_column.numeric_column('Age')

#defining buckets for children, teens, adults and elders.
age_bucket_feature = tf.feature_column.bucketized_column(age_feature,[5,12,18,25,35,60])

#defining a categorical column with predefined values
sex_feature = tf.feature_column.categorical_column_with_vocabulary_list(
    'Sex',['female','male']
)
#defining a categorical columns with dynamic values
embarked_feature =  tf.feature_column.categorical_column_with_hash_bucket(
    'Embarked', 3 
)
cabin_feature =  tf.feature_column.categorical_column_with_hash_bucket(
    'Cabin', 100 
)

feature_columns = [ pclass_feature,age_feature, age_bucket_feature, parch_feature, 
                   fare_feature, embarked_feature, cabin_feature ]

In [18]:
feature_columns

[_NumericColumn(key='Pclass', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _BucketizedColumn(source_column=_NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(5, 12, 18, 25, 35, 60)),
 _NumericColumn(key='Parch', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _HashedCategoricalColumn(key='Embarked', hash_bucket_size=3, dtype=tf.string),
 _HashedCategoricalColumn(key='Cabin', hash_bucket_size=100, dtype=tf.string)]

In [19]:
estimator = tf.estimator.LinearClassifier(feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_service': None, '_keep_checkpoint_max': 5, '_session_config': None, '_model_dir': 'C:\\Users\\Simon\\AppData\\Local\\Temp\\tmpq7dpq4ur', '_global_id_in_cluster': 0, '_is_chief': True, '_evaluation_master': '', '_log_step_count_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001B6062EFA58>, '_train_distribute': None, '_save_checkpoints_steps': None, '_master': '', '_task_id': 0, '_tf_random_seed': None, '_task_type': 'worker', '_keep_checkpoint_every_n_hours': 10000, '_save_summary_steps': 100, '_num_ps_replicas': 0, '_save_checkpoints_secs': 600, '_num_worker_replicas': 1}


In [20]:
# train input function
train_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=train_set.drop('Survived', axis=1),
      y=train_set.Survived,
      num_epochs=None, #For training it can use how many epochs is necessary
      shuffle=True,
      target_column='target',
)

cv_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=cv_set.drop('Survived', axis=1),
      y=cv_set.Survived,
      num_epochs=1, #We just want to use one epoch since this is only to score.
      shuffle=False  #It isn't necessary to shuffle the cross validation 
)

In [21]:
estimator.train(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Simon\AppData\Local\Temp\tmpq7dpq4ur\model.ckpt.
INFO:tensorflow:step = 1, loss = 88.722855
INFO:tensorflow:global_step/sec: 138.484
INFO:tensorflow:step = 101, loss = 80.53661 (0.727 sec)
INFO:tensorflow:global_step/sec: 253.826
INFO:tensorflow:step = 201, loss = 69.61542 (0.392 sec)
INFO:tensorflow:global_step/sec: 284.835
INFO:tensorflow:step = 301, loss = 62.76243 (0.355 sec)
INFO:tensorflow:global_step/sec: 263.835
INFO:tensorflow:step = 401, loss = 68.088684 (0.373 sec)
INFO:tensorflow:global_step/sec: 240.128
INFO:tensorflow:step = 501, loss = 60.106525 (0.420 sec)
INFO:tensorflow:global_step/sec: 91.6847
INFO:tensorflow:step = 601, loss = 70.26138 (1.088 sec)
INFO:tensorflow:global_step/sec:

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1b6062ef710>

In [22]:
scores = estimator.evaluate(input_fn=cv_input_fn)
print("\nTest Accuracy: {0:f}\n".format(scores['accuracy']))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-27-01:56:39
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Simon\AppData\Local\Temp\tmpq7dpq4ur\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-27-01:56:41
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.6966292, accuracy_baseline = 0.6460674, auc = 0.71987575, auc_precision_recall = 0.6013047, average_loss = 0.6092997, global_step = 1000, label/mean = 0.3539326, loss = 54.227673, precision = 0.60465115, prediction/mean = 0.37136105, recall = 0.41269842

Test Accuracy: 0.696629



In [23]:
# DNN doesn't support categorical with hash bucket
embarked_embedding =  tf.feature_column.embedding_column(
    categorical_column = embarked_feature,
    dimension = 3,
)
cabin_embedding =  tf.feature_column.embedding_column(
    categorical_column = cabin_feature,
    dimension = 300,
)

# define the feature columns
feature_columns = [ pclass_feature,age_feature, age_bucket_feature, parch_feature, 
                   fare_feature, embarked_embedding, cabin_embedding ]

# instantiate the estimator
NNestimator = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[10, 30 , 10])

# call the train function using the train input function
NNestimator.train(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_service': None, '_keep_checkpoint_max': 5, '_session_config': None, '_model_dir': 'C:\\Users\\Simon\\AppData\\Local\\Temp\\tmp3pn_ly6o', '_global_id_in_cluster': 0, '_is_chief': True, '_evaluation_master': '', '_log_step_count_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001B6091F8AC8>, '_train_distribute': None, '_save_checkpoints_steps': None, '_master': '', '_task_id': 0, '_tf_random_seed': None, '_task_type': 'worker', '_keep_checkpoint_every_n_hours': 10000, '_save_summary_steps': 100, '_num_ps_replicas': 0, '_save_checkpoints_secs': 600, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Simon\App

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1b6091f8470>

In [24]:
accuracy_score = NNestimator.evaluate(input_fn=cv_input_fn)["accuracy"]
print("\nTest Accuracy: {0:f}\n".format(accuracy_score))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-27-01:59:47
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Simon\AppData\Local\Temp\tmp3pn_ly6o\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-27-01:59:48
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.71348315, accuracy_baseline = 0.6460674, auc = 0.7446515, auc_precision_recall = 0.6449059, average_loss = 0.73853123, global_step = 1000, label/mean = 0.3539326, loss = 65.72928, precision = 0.64285713, prediction/mean = 0.39738476, recall = 0.42857143

Test Accuracy: 0.713483



In [25]:
def prepare_datasets(df):
    df_copy = df[['Pclass', 'Parch',  'Sex', 'Embarked', "Age"]].copy()
    df_copy.loc[:,"Sex"] = df_copy.Sex.apply(lambda x: 0 if x =="male" else 1)

    e_map = {"C": 0,"Q":1, "S":2}
    df_copy.loc[:,"Embarked"] = df_copy.Embarked.apply(lambda x: e_map[x])

    df_copy.loc[:,"Age"]= df_copy.Age.astype(np.float32)

    x = df_copy[['Pclass', 'Parch', 'Age']].astype(np.float32)
#     y = train_set.Survived.astype(np.int32)
    y = df.Survived.astype(np.bool)
    return x, y

x_train, y_train = prepare_datasets(train_set)
x_cv, y_cv = prepare_datasets(cv_set)

In [26]:
def generate_tf_input_fn(x_input,y_input,num_epochs=None):
    #this is the function we are generating
    def _input_fn_():
        # generate a standard input function
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            x= x_input,  
            y= y_input,
            num_epochs=num_epochs,
            shuffle=True,
            target_column='target',
        )
        #execute the standard input function 
        x, y = train_input_fn()
        # expand the shape of the results (necessary for Tensor Forest)
        for name in x:
            x[name] = tf.expand_dims(x[name], 1, name= name) 
        return x, y
    
    return _input_fn_