In [1]:
import tensorflow as tf
import pandas as pd
import keras
import numpy as np


In [2]:
# Load the training CSV and report its (rows, columns) shape.
df = pd.read_csv('data/train.csv')
print(df.shape)

(891, 12)


In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:

def fix_type(df):
    """Return a copy of ``df`` with the free-text columns cast to ``str``.

    The columns 'Name', 'Sex', 'Ticket', 'Cabin' and 'Embarked' are converted;
    every other column is passed through unchanged. The input frame is NOT
    modified: a new DataFrame is returned, so the caller's raw data (and any
    earlier display of it) stays valid.

    Note: the cast also stringifies missing entries. In the run recorded in
    this notebook, empty 'Cabin' cells show up as the literal text 'nan' and
    are no longer counted by ``isnull()``.

    Args:
        df: DataFrame containing all five text columns.

    Returns:
        A new DataFrame with the same columns in the same order.

    Raises:
        KeyError: if any of the five text columns is absent.
    """
    text_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
    # astype with a per-column mapping builds a new frame instead of
    # assigning back into the caller's object.
    return df.astype({column: str for column in text_columns})




In [5]:
# Re-load the training CSV with the text columns cast to str by fix_type.
df = fix_type(pd.read_csv('data/train.csv'))
# val_df = fix_type(pd.read_csv('data/test.csv'))

# Missing-value counts before imputation.
print(df.isnull().sum())

# Replace missing ages with the column mean. The result is assigned back to
# the column instead of calling fillna(inplace=True) on the `df['Age']`
# selection: that chained in-place form emits a FutureWarning on recent
# pandas 2.x and does not update `df` when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Missing-value counts after imputation.
print(df.isnull().sum())

# Original author's note (meaning unclear, TODO confirm): not sex, sibsp, Pclass
df['SibSp'] = df['SibSp'].astype('float32')
df['Parch'] = df['Parch'].astype('float32')
# df = df.drop(columns=['Parch'])
# df = df.drop(columns=['Age'])
# df = df.drop(columns=['Name'])
df = df.drop(columns=['PassengerId'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split);
# the remaining rows form the training set.
val_dataframe = df.sample(frac=0.2, random_state=1337)
train_dataframe = df.drop(val_dataframe.index)

print(
    "Using %d samples for training and %d for validation"
    % (len(train_dataframe), len(val_dataframe))
)


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
Using 713 samples for training and 178 for validation


In [6]:
# Preview the frame after imputation, the float32 casts and the PassengerId drop.
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S


In [7]:
def dataframe_to_dataset(dataframe):
    """Turn a DataFrame into a shuffled dataset of (feature dict, label) pairs.

    The "Survived" column is split off as the label; the remaining columns are
    passed as a dict keyed by column name. The work is done on a copy, so the
    caller's DataFrame keeps its "Survived" column.

    Args:
        dataframe: DataFrame that includes a "Survived" column.

    Returns:
        A tf.data.Dataset yielding one (features, label) element per row,
        shuffled with a buffer as large as the number of rows.
    """
    features = dataframe.copy()
    target = features.pop("Survived")
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    return dataset.shuffle(buffer_size=len(features))


# Build one shuffled dataset per split (training first, then validation).
train_ds, val_ds = (
    dataframe_to_dataset(split_frame)
    for split_frame in (train_dataframe, val_dataframe)
)

In [8]:
# Print a single (features, label) element to check the dataset structure.
for sample_features, sample_label in train_ds.take(1):
    print("Input:", sample_features)
    print("Target:", sample_label)

Input: {'Pclass': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'Name': <tf.Tensor: shape=(), dtype=string, numpy=b'Collyer, Mrs. Harvey (Charlotte Annie Tate)'>, 'Sex': <tf.Tensor: shape=(), dtype=string, numpy=b'female'>, 'Age': <tf.Tensor: shape=(), dtype=float64, numpy=31.0>, 'SibSp': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'Parch': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'Ticket': <tf.Tensor: shape=(), dtype=string, numpy=b'C.A. 31921'>, 'Fare': <tf.Tensor: shape=(), dtype=float64, numpy=26.25>, 'Cabin': <tf.Tensor: shape=(), dtype=string, numpy=b'nan'>, 'Embarked': <tf.Tensor: shape=(), dtype=string, numpy=b'S'>}
Target: tf.Tensor(1, shape=(), dtype=int64)


In [9]:
# Number of rows per batch fed to the model.
BATCH_SIZE = 32

# Build the batched datasets from the split DataFrames rather than re-batching
# `train_ds` / `val_ds` in place. The previous `train_ds = train_ds.batch(32)`
# form was self-referential: running the cell a second time would batch
# already-batched data. This version gives the same result on every run.
train_ds = dataframe_to_dataset(train_dataframe).batch(BATCH_SIZE)
val_ds = dataframe_to_dataset(val_dataframe).batch(BATCH_SIZE)

In [10]:
# Check the column dtypes that the feature specification must match.
df.dtypes

Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp       float32
Parch       float32
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [11]:
from keras.utils import FeatureSpace

# Per-column preprocessing spec. PassengerId is not listed because the column
# was dropped from the frame earlier. Key order is kept as originally written.
titanic_feature_spec = {
    # Categorical feature encoded as integers
    'Pclass': 'integer_categorical',
    # Free-text columns are hashed
    'Name': 'string_hashed',
    'Sex': 'string_categorical',
    # Numerical features passed through as plain floats
    'Age': 'float',
    'SibSp': 'float',
    'Parch': 'float',
    'Ticket': 'string_hashed',
    # Numerical feature to normalize
    'Fare': 'float_normalized',
    'Cabin': 'string_hashed',
    'Embarked': 'string_categorical',
}

# No feature crosses are configured. With output_mode="concat" all encoded
# features are concatenated into a single vector (one vector per sample).
feature_space = FeatureSpace(
    features=titanic_feature_spec,
    output_mode="concat",
)

In [12]:
def _features_only(features, _label):
    """Drop the label from a (features, label) dataset element."""
    return features


# Adapt the feature space on the training features only (labels removed).
train_ds_with_no_labels = train_ds.map(_features_only)
feature_space.adapt(train_ds_with_no_labels)

In [13]:
# Sanity check: run one training batch through the adapted feature space and
# report the shape and dtype of the encoded result.
for batch_features, _ in train_ds.take(1):
    print(batch_features)

    preprocessed_x = feature_space(batch_features)
    print("preprocessed_x.shape:", preprocessed_x.shape)
    print("preprocessed_x.dtype:", preprocessed_x.dtype)

{'Pclass': <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([2, 3, 2, 1, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 3, 3, 1, 3, 3, 3, 2,
       3, 1, 3, 3, 3, 2, 3, 2, 2, 3])>, 'Name': <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'Quick, Mrs. Frederick Charles (Jane Richards)',
       b'Daly, Mr. Eugene Patrick', b'Richard, Mr. Emile',
       b'Graham, Miss. Margaret Edith',
       b'Pickard, Mr. Berk (Berk Trembisky)', b'Perreault, Miss. Anne',
       b'Smith, Miss. Marion Elsie', b'Peters, Miss. Katie',
       b'Hassab, Mr. Hammad', b'Davidson, Mr. Thornton',
       b'Fortune, Mr. Charles Alexander', b'Connolly, Miss. Kate',
       b'Barkworth, Mr. Algernon Henry Wilson', b'Sdycoff, Mr. Todor',
       b'Abelson, Mrs. Samuel (Hannah Wizosky)',
       b'Goodwin, Master. Sidney Leonard', b'Wiklund, Mr. Jakob Alfred',
       b'Blackwell, Mr. Stephen Weart', b'Balkic, Mr. Cerin',
       b'Lester, Mr. James', b'Landergren, Miss. Aurora Adelia',
       b'Faunthorpe, Mrs. Lizzie (Elizabeth 

In [14]:
def _encode_with_feature_space(dataset):
    """Apply `feature_space` to the features of every element, then prefetch.

    Labels are passed through unchanged.
    """
    encoded = dataset.map(
        lambda features, label: (feature_space(features), label),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    return encoded.prefetch(tf.data.AUTOTUNE)


# Same preprocessing pipeline for both splits.
preprocessed_train_ds = _encode_with_feature_space(train_ds)
preprocessed_val_ds = _encode_with_feature_space(val_ds)


In [25]:
# Raw inputs and encoded features exposed by the feature space.
dict_inputs = feature_space.get_inputs()
encoded_features = feature_space.get_encoded_features()

# Classifier head: one hidden ReLU layer with dropout, then a sigmoid unit
# producing the survival probability.
hidden = keras.layers.Dense(64, activation="relu")(encoded_features)
hidden = keras.layers.Dropout(0.5)(hidden)
predictions = keras.layers.Dense(1, activation="sigmoid")(hidden)

# Model used for fitting: it starts from the encoded features, matching the
# already-preprocessed datasets.
training_model = keras.Model(inputs=encoded_features, outputs=predictions)
training_model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Model sharing the same head but starting from the feature space's raw inputs.
inference_model = keras.Model(inputs=dict_inputs, outputs=predictions)

In [26]:
# Train for 50 epochs on the preprocessed batches, validating after each epoch;
# verbose=2 prints one summary line per epoch.
training_model.fit(
    preprocessed_train_ds,
    validation_data=preprocessed_val_ds,
    epochs=50,
    verbose=2,
)

Epoch 1/50
23/23 - 0s - loss: 1.5693 - accuracy: 0.4586 - val_loss: 0.7563 - val_accuracy: 0.6236 - 391ms/epoch - 17ms/step
Epoch 2/50
23/23 - 0s - loss: 1.1867 - accuracy: 0.5708 - val_loss: 0.6458 - val_accuracy: 0.6404 - 160ms/epoch - 7ms/step
Epoch 3/50
23/23 - 0s - loss: 1.0372 - accuracy: 0.5680 - val_loss: 0.6056 - val_accuracy: 0.6798 - 108ms/epoch - 5ms/step
Epoch 4/50
23/23 - 0s - loss: 0.9312 - accuracy: 0.5512 - val_loss: 0.6068 - val_accuracy: 0.6742 - 96ms/epoch - 4ms/step
Epoch 5/50
23/23 - 0s - loss: 0.8971 - accuracy: 0.5961 - val_loss: 0.5939 - val_accuracy: 0.6798 - 118ms/epoch - 5ms/step
Epoch 6/50
23/23 - 0s - loss: 0.8138 - accuracy: 0.6185 - val_loss: 0.5695 - val_accuracy: 0.7022 - 105ms/epoch - 5ms/step
Epoch 7/50
23/23 - 0s - loss: 0.7812 - accuracy: 0.6297 - val_loss: 0.5509 - val_accuracy: 0.7303 - 89ms/epoch - 4ms/step
Epoch 8/50
23/23 - 0s - loss: 0.7016 - accuracy: 0.6564 - val_loss: 0.5339 - val_accuracy: 0.7472 - 89ms/epoch - 4ms/step
Epoch 9/50
23/23 -

<keras.callbacks.History at 0xffff2c317f40>