In [1]:
import numpy as np
import tensorflow as tf

In [2]:
#Use titanic data
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

In [3]:
#Load data into datasets, before then, we need the info on titanic datasets
import pandas as pd

df = pd.read_csv(train_file_path)

In [4]:
#get column info
columns = list(df.columns)

In [5]:
df.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [6]:
#Now we can load data directly into tensorflow datasets
ds_train = tf.data.experimental.make_csv_dataset(train_file_path, batch_size = 5, 
                                                 #We would set defaults for NULLs in columns
                                                 select_columns = columns,
                                                 column_defaults = [np.nan, 'NA', np.nan, np.nan, np.nan, np.nan, 'NA', 'NA', 'NA', 'NA'],
                                                label_name= 'survived',
                                                 na_value = 'unknown',
                                                 num_epochs = 1,
                                                 shuffle = False
                                                )
#Now we can load data directly into tensorflow datasets
ds_test = tf.data.experimental.make_csv_dataset(test_file_path, batch_size = 5, 
                                                 #We would set defaults for NULLs in columns
                                                 select_columns = columns,
                                                 column_defaults = [np.nan, 'NA', np.nan, np.nan, np.nan, np.nan, 'NA', 'NA', 'NA', 'NA'],
                                                label_name= 'survived',
                                                 na_value = 'unknown',
                                                 num_epochs = 1,
                                                shuffle = False
                                                )

In [11]:
#We next would like to count how many NULL values are there for each column in the dataset

In [12]:
numeric_cols = list(df.describe().columns)
categorical_cols = [x for x in df.columns if x not in numeric_cols ]

#numeric_cols should excluding target label
numeric_cols.remove('survived')

In [13]:
#for counting NULL values, let's start from one batch and try the concept
for i in ds_train.take(1):
    for col in numeric_cols:
        print(i[0][col].numpy().sum())

149.0
3.0
0.0
148.0166


In [14]:
#So we would define two functions of calculating # of NULL values for numerical and categorical columns
def numeric_nulls(track, batch):
    for col in numeric_cols:
        track[col] += tf.cast(tf.math.count_nonzero(tf.math.is_nan(batch[0][col])), tf.int32)
    return track

In [15]:
ds_train.reduce(dict.fromkeys(numeric_cols, tf.constant(0)), numeric_nulls)

{'age': <tf.Tensor: shape=(), dtype=int32, numpy=0>,
 'n_siblings_spouses': <tf.Tensor: shape=(), dtype=int32, numpy=0>,
 'parch': <tf.Tensor: shape=(), dtype=int32, numpy=0>,
 'fare': <tf.Tensor: shape=(), dtype=int32, numpy=0>}

In [16]:
#Let's double-check if our calculation is correct
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 627 entries, 0 to 626
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   survived            627 non-null    int64  
 1   sex                 627 non-null    object 
 2   age                 627 non-null    float64
 3   n_siblings_spouses  627 non-null    int64  
 4   parch               627 non-null    int64  
 5   fare                627 non-null    float64
 6   class               627 non-null    object 
 7   deck                627 non-null    object 
 8   embark_town         627 non-null    object 
 9   alone               627 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 49.1+ KB


In [17]:
df_test = pd.read_csv(test_file_path)

In [18]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   survived            264 non-null    int64  
 1   sex                 264 non-null    object 
 2   age                 264 non-null    float64
 3   n_siblings_spouses  264 non-null    int64  
 4   parch               264 non-null    int64  
 5   fare                264 non-null    float64
 6   class               264 non-null    object 
 7   deck                264 non-null    object 
 8   embark_town         264 non-null    object 
 9   alone               264 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 20.8+ KB


In [19]:
#It does not seem that we need to check for NULLs anymore...
#Next we would create feature columns out of numeric and categorical columns

In [20]:
#First let's combine numeric columns into one called "numeric"
class PackNumericFeatures():
    def __init__(self, names):
        self.names = names
        
    def __call__(self, features, labels):
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feature, tf.float32) for feature in numeric_features]
        numeric_features = tf.stack(numeric_features, -1)
        features['numeric'] = numeric_features
        
        return features, labels

In [21]:
ds_train_packed = ds_train.map(PackNumericFeatures(numeric_cols))
ds_test_packed = ds_test.map(PackNumericFeatures(numeric_cols))

In [22]:
for i in ds_train_packed.take(1):
    features = i[0]
    for key, value in features.items():
        print(f'{key:20s}: {value}')

sex                 : [b'male' b'female' b'female' b'female' b'male']
class               : [b'Third' b'First' b'Third' b'First' b'Third']
deck                : [b'NA' b'C' b'NA' b'C' b'NA']
embark_town         : [b'Southampton' b'Cherbourg' b'Southampton' b'Southampton' b'Queenstown']
alone               : [b'n' b'n' b'y' b'n' b'y']
numeric             : [[22.      1.      0.      7.25  ]
 [38.      1.      0.     71.2833]
 [26.      0.      0.      7.925 ]
 [35.      1.      0.     53.1   ]
 [28.      0.      0.      8.4583]]


In [23]:
#So let's rewrite PackNumericFeatures twice to make sure we understand what is going on under the hood
class PackNumericFeatures(): 
    #First define a class, the reason that we would like to define a class is because we would like to pass in the list of numeric column names first
    #So we need have __init__ method to pass in the list of numeric column names
    def __init__(self, names):
        self.names = names
        
    #Now we need to specifically define a __call__ method so that it could be called like a function in dataset mapping
    def __call__(self,features, label):
        #Since for dataset, the map function could pass in multiple values (instead of one) based on number of elements in a batch,
        #in this case, each batch/tuple would contain two elements - features and label
        #Here we only need to pack the features together
        #First we would like to get the list of numeric feature tensors, we would use list comprehension to get a feature column values iteratively
        #At the same time, we would use .pop to remove those numeric features from the original dict
        numeric_features = [features.pop(name) for name in self.names]
        #Now next, because tensorflow usually works with float32, we transform every tensor in the numeric feature list into tf.float32
        numeric_features = [tf.cast(feature, tf.float32) for feature in numeric_features]
        #Finally, we would like to transform/stack the list of numeric features into one along the last axis - # of examples in a batch
        numeric_features = tf.stack(numeric_features, -1)
        #Then we would like to make sure we assign the newly created tensor back to the original features
        features['numeric'] = numeric_features
        
        #return values
        return features, label

In [24]:
#Now write PackNumericFeatures again to memorize it
class PackNumericFeatures():
    def __init__(self, numeric_cols):
        self.numeric_cols = numeric_cols
        
    def __call__(self, features, label):
        numeric_features = [features.pop(numeric_col) for numeric_col in numeric_cols]
        numeric_features = [tf.cast(feature, tf.float32) for feature in numeric_features]
        numeric_features = tf.stack(numeric_features, -1)
        features['numeric'] = numeric_features
        
        return features, label

In [25]:
#Next we would like to create a standardization function for a numeric tensor
def Standardize(tensor, mean, std):
    return (tensor - mean)/std

In [26]:
#To use the function above, first we actually need to get the mean and std, we have to get them from the original df(train)
info = df.describe()[numeric_cols].T
mean = info['mean']
std = info['std']

In [27]:
#Because we already know mean and std, we would like to crete a partial function to use these two metrics
import functools
normalizer = functools.partial(Standardize, mean = mean, std = std)

In [28]:
#write again
import functools
normalizer = functools.partial(Standardize, mean = mean, std = std)

In [29]:
#Also note the above could also be achieved by creating a class like above
#So finally, we can create our numeric feature columns
numeric_column = tf.feature_column.numeric_column('numeric', shape = [len(numeric_cols)], normalizer_fn = normalizer)
numeric_columns = [numeric_column]

In [30]:
#Now let's try to explain what is going on in the code above
#1. We would like to assign feature column, specifically, numeric column, we would use tf.feature_column.numeric_column()
#   each call here is actually for one column
#2. The parameters passed in are 'numeric' - which is the combined numeric column we have; 'shape = 4' because we have 4 numeric features;
#   normalizer_fn = normalizer is the standardization normalizer we have, actually it does not have to be a normalizer but any function that can
#   transform the raw numeric input
#3. We assign all the above to numeric_column variable, note again that this variable only represents one column, since all feature columns need to be a list,
#   we temporarily put up a list called numeric_columns contain only numeric_column so that later we can append numeric_columns to other feature column lists
numeric_column = tf.feature_column.numeric_column('numeric', shape = [len(numeric_cols)], normalizer_fn =  normalizer)
numeric_columns = [numeric_column]

In [31]:
#Now to create feature columns for categorical columns, we need to get the vocabulary list for each column
categories = {
    'sex': ['male', 'female'],
    'class' : ['First', 'Second', 'Third'],
    'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'embark_town' : ['Cherbourg', 'Southhampton', 'Queenstown'],
    'alone' : ['y', 'n']
}

In [32]:
#Even so, we need to create categorical columns one by one
categorical_columns = [tf.feature_column.categorical_column_with_vocabulary_list(key, value) for key, value in categories.items()]

In [33]:
#However, we still need to transform the categorical_columns using one-hot encoding or other encodings, default one-encoding here
categorical_columns = [tf.feature_column.indicator_column(categorical_column) for categorical_column in categorical_columns]

In [34]:
#Now we have all feature columns, we can create the DenseFeatures layer
feature_layer = tf.keras.layers.DenseFeatures(numeric_columns + categorical_columns)

In [35]:
#Now we can create a NN model to test
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(tf.keras.layers.Dense(32, activation = 'relu')) #Note no input_shape needs to be specified
model.add(tf.keras.layers.Dense(32, activation = 'relu'))
model.add(tf.keras.layers.Dense(1))

model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits = True), metrics = ['accuracy'])

In [36]:
#Only shuffle when necessary and need to create separate copy of the dataset because we are not able to "unshuffle"
training_set = ds_train_packed.unbatch().shuffle(1000).batch(5)
testing_set = ds_test_packed.unbatch().shuffle(1000).batch(5)

In [37]:
#Now we can fit and evaluate the model
model.fit(training_set, epochs = 20, validation_data = testing_set)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x23c26dd1448>

In [38]:
model.evaluate(testing_set)



[0.42213958501815796, 0.8446969985961914]

In [39]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [57]:
#Finally let's compare the predicted outcomes with actual outcomes
#Let's only take two batches, note here we do not use the shuffle dataset here because it would produce different results each time
test = ds_test_packed.take(2)

In [58]:
pred_prob = sigmoid(model.predict(test)).ravel()

In [59]:
import numpy as np

actual_outcome = []
for batch in test:
    actual_outcome = np.append(actual_outcome, batch[1].numpy())

In [60]:
for ac, pr in zip(actual_outcome, pred_prob):
    print(f'The actual outcome is: {"Survived" if ac == 1 else "Not Survived":20} and the predicted',
             f'survival probability is {pr:.2%}.')

The actual outcome is: Not Survived         and the predicted survival probability is 12.24%.
The actual outcome is: Not Survived         and the predicted survival probability is 28.60%.
The actual outcome is: Survived             and the predicted survival probability is 83.66%.
The actual outcome is: Survived             and the predicted survival probability is 85.80%.
The actual outcome is: Survived             and the predicted survival probability is 7.77%.
The actual outcome is: Survived             and the predicted survival probability is 89.13%.
The actual outcome is: Not Survived         and the predicted survival probability is 25.81%.
The actual outcome is: Not Survived         and the predicted survival probability is 7.87%.
The actual outcome is: Not Survived         and the predicted survival probability is 45.12%.
The actual outcome is: Survived             and the predicted survival probability is 87.69%.
