# Basic imports

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import functools

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
tf.__version__

'2.2.0-dev20200319'

In [3]:
np.__version__

'1.16.2'

# Data URLs

In [5]:
# Remote locations of the Titanic CSV files: one training split, one eval split.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Download each file under the given name and keep the local path that
# get_file returns (the path cell below shows them under ~/.keras/datasets).
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [7]:
train_file_path, test_file_path

('/Users/z003z7n/.keras/datasets/train.csv',
 '/Users/z003z7n/.keras/datasets/eval.csv')

In [8]:
# Make numpy values easier to read: print 3 digits after the decimal point and
# suppress scientific notation for small floating-point values.
np.set_printoptions(precision=3, suppress=True)

# Loading Data

In [13]:
# Using pandas: read the whole training CSV into an in-memory DataFrame.
# (pandas is imported together with the other libraries in the first cell.)
dataset = pd.read_csv(train_file_path)

In [20]:
dataset.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


# Handling huge CSV data

In [21]:
# To process a huge CSV file, read it in batches instead of all at once.

tf.data.experimental.make_csv_dataset(
    file_pattern=train_file_path,
    batch_size=100)

In [35]:
# 'survived' is the target variable; get_dataset passes it as `label_name`,
# so it is split off from the feature columns of every batch.
LABEL_COLUMN = 'survived'
# Values the label takes (the dataset.head() preview above shows 0 and 1).
LABELS = [0, 1]

In [37]:
# defining the function to get the data from csv passing the location
def get_dataset(file_path, **kwargs):
  """Build a batched dataset of (features, label) pairs from a CSV file.

  Thin wrapper around `tf.data.experimental.make_csv_dataset` that supplies
  this notebook's defaults. Keywords given by the caller take precedence over
  those defaults, so `get_dataset(path, batch_size=32)` overrides the batch
  size instead of failing with "got multiple values for keyword argument".

  Args:
    file_path: CSV file path (or file pattern) forwarded to
      `make_csv_dataset`.
    **kwargs: Additional or overriding keyword arguments for
      `make_csv_dataset`, e.g. `column_names`, `select_columns`,
      `column_defaults`.

  Returns:
    The dataset object returned by `make_csv_dataset`.
  """
  options = dict(
      batch_size=5,  # Artificially small to make examples easier to show.
      label_name=LABEL_COLUMN,  # Column split off as the label.
      na_value="?",  # String to be treated as a missing value.
      num_epochs=1,  # Go through the file once rather than repeating it.
      ignore_errors=True)  # Skip records that fail to parse.
  options.update(kwargs)  # Caller-supplied values win over the defaults.
  return tf.data.experimental.make_csv_dataset(file_path, **options)

# Build the batched datasets from the downloaded training and eval CSV files.
raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

In [38]:
raw_train_data

<PrefetchDataset shapes: (OrderedDict([(sex, (None,)), (age, (None,)), (n_siblings_spouses, (None,)), (parch, (None,)), (fare, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,))]), (None,)), types: (OrderedDict([(sex, tf.string), (age, tf.float32), (n_siblings_spouses, tf.int32), (parch, tf.int32), (fare, tf.float32), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string)]), tf.int32)>

In [39]:
raw_test_data

<PrefetchDataset shapes: (OrderedDict([(sex, (None,)), (age, (None,)), (n_siblings_spouses, (None,)), (parch, (None,)), (fare, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,))]), (None,)), types: (OrderedDict([(sex, tf.string), (age, tf.float32), (n_siblings_spouses, tf.int32), (parch, tf.int32), (fare, tf.float32), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string)]), tf.int32)>

In [40]:
def show_batch(dataset):
  """Print each feature column of the first batch in `dataset`.

  Takes a single (features, label) element from the dataset and prints one
  line per feature: the column name padded to 20 characters, then the
  column's values converted with `.numpy()`. The label is not printed.
  """
  row_template = "{:20s}: {}"
  first_batch_only = dataset.take(1)
  for features, _label in first_batch_only:
    for column_name, column_values in features.items():
      rendered = row_template.format(column_name, column_values.numpy())
      print(rendered)

In [41]:
# getting to see the data
show_batch(raw_train_data)

sex                 : [b'female' b'male' b'male' b'male' b'male']
age                 : [21. 28. 23. 19. 16.]
n_siblings_spouses  : [2 0 0 0 4]
parch               : [2 0 0 0 1]
fare                : [262.375   7.896   7.854   7.775  39.688]
class               : [b'First' b'Third' b'Third' b'Third' b'Third']
deck                : [b'B' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Cherbourg' b'Cherbourg' b'Southampton' b'Southampton' b'Southampton']
alone               : [b'n' b'y' b'y' b'y' b'n']


In [44]:
show_batch(raw_test_data)

sex                 : [b'female' b'female' b'male' b'male' b'female']
age                 : [28.  28.  23.5 28.  10. ]
n_siblings_spouses  : [0 0 0 0 0]
parch               : [0 0 0 0 2]
fare                : [ 8.137  7.75   7.229 27.721 24.15 ]
class               : [b'Third' b'Third' b'Third' b'First' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Queenstown' b'Queenstown' b'Cherbourg' b'Cherbourg' b'Southampton']
alone               : [b'y' b'y' b'y' b'y' b'n']


In [45]:
# Another way: pass the column names explicitly via `column_names`, e.g. when
# the CSV file being read does not carry them in its first line.
# NOTE(review): this particular file does have a header row (see the
# dataset.head() preview above); a truly headerless file would presumably
# also need `header=False` - confirm against the make_csv_dataset docs.
CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']

temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)

show_batch(temp_dataset)

sex                 : [b'male' b'female' b'female' b'female' b'male']
age                 : [23. 51. 28. 40. 29.]
n_siblings_spouses  : [0 1 0 1 0]
parch               : [0 0 0 1 0]
fare                : [10.5   77.958  7.787 39.     9.483]
class               : [b'Second' b'First' b'Third' b'Second' b'Third']
deck                : [b'unknown' b'D' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Queenstown' b'Southampton' b'Southampton']
alone               : [b'y' b'n' b'y' b'n' b'y']


# Data Preprocessing

## Continuous data

In [46]:
# Keep only the label column plus the numeric feature columns.
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
# One default per selected column, in the same order. The 0.0 defaults make
# n_siblings_spouses and parch load as floats (compare the output below with
# the integer values printed by the earlier show_batch calls).
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(train_file_path, 
                           select_columns=SELECT_COLUMNS,
                           column_defaults = DEFAULTS)

show_batch(temp_dataset)

age                 : [31. 26. 39. 40.  2.]
n_siblings_spouses  : [1. 0. 0. 0. 4.]
parch               : [0. 0. 0. 0. 1.]
fare                : [52.    78.85  24.15  27.721 39.688]
