# <font color="black"> Titanic Dataset </font>

By: Tony Zheng

## Set Constants to be used
<table>
    <colgroup>
        <col span="1" style="background-color: #D6EEEE; ">
    </colgroup>
    <tr> <td>poly</td> <td>The order of the polynomial basis to be used</td> </tr>
    <tr> <td>standardize_cols</td> <td>The columns whose values will be standardized</td> </tr>
    <tr> <td>onehot_cols</td> <td>The columns who will have their values converted to one-hot format</td> </tr>
    <tr> <td>replace_na</td> <td>The columns that will replace NA values with 0, and non-NA values with 1</td> </tr>
</table>

In [111]:
poly = 2
standardize_cols = ['Age', 'SibSp', 'Fare']
onehot_cols = ['Sex', 'Embarked']
replace_na = ['Cabin']

## Importing the data

In [112]:
import pandas as pd
import numpy as np

test = pd.read_csv("test.csv")
test_survived = pd.read_csv("gender_submission.csv")

train = pd.read_csv("train.csv")

print("Data loaded")

Data loaded


# Format Data

#### Standardize some numeric columns (Age, SibSp, Fare)

In [113]:

for column in standardize_cols:
    mean = train[column].mean()
    std = train[column].std()
    
    train[column] = train[column].fillna(mean)
    train[column] = (train[column] - mean) / std

print(train[standardize_cols])

          Age     SibSp      Fare
0   -0.530005  0.432550 -0.502163
1    0.571430  0.432550  0.786404
2   -0.254646 -0.474279 -0.488580
3    0.364911  0.432550  0.420494
4    0.364911 -0.474279 -0.486064
..        ...       ...       ...
886 -0.185807 -0.474279 -0.386454
887 -0.736524 -0.474279 -0.044356
888  0.000000  0.432550 -0.176164
889 -0.254646 -0.474279 -0.044356
890  0.158392 -0.474279 -0.492101

[891 rows x 3 columns]


#### One-hot gender, embarked

In [114]:
for column in onehot_cols:
    to_hot = pd.get_dummies(train[column])
    train = train.drop(column, axis=1)
    train = train.join(to_hot)

    print(train[to_hot.columns])

     female   male
0     False   True
1      True  False
2      True  False
3      True  False
4     False   True
..      ...    ...
886   False   True
887    True  False
888    True  False
889   False   True
890   False   True

[891 rows x 2 columns]
         C      Q      S
0    False  False   True
1     True  False  False
2    False  False   True
3    False  False   True
4    False  False   True
..     ...    ...    ...
886  False  False   True
887  False  False   True
888  False  False   True
889   True  False  False
890  False   True  False

[891 rows x 3 columns]


#### Replace columns with significant NA data with 1s indicating it is present, and 0s indicating otherwise

In [115]:
for column in replace_na:
    train[column][train[column].isna()] = 1
    train[column][~train[column].isna()] = 0

print(train[replace_na])

    Cabin
0       0
1       0
2       0
3       0
4       0
..    ...
886     0
887     0
888     0
889     0
890     0

[891 rows x 1 columns]


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  train[column][train[column].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[column][train[col

Do the same for the test data

In [116]:
for column in ['Age', 'SibSp', 'Fare']:
    mean = test[column].mean()
    std = test[column].std()
    
    test[column] = test[column].fillna(mean)
    test[column] = (test[column] - mean) / std

for column in ['Sex', 'Embarked']:
    to_hot = pd.get_dummies(test[column])
    test = test.drop(column, axis=1)
    test = test.join(to_hot)

for column in ['Cabin']:
    test[column][train[column].isna()] = 1
    test[column][~train[column].isna()] = 0

print(test)

     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

          Age     SibSp  Parch              Ticket      Fare Cabin  female  \
0    0.298099 -0.498872      0   

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  test[column][train[column].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[column][train[colum

Select features

In [117]:
train_on = ['Age', 
            'male', 'female', 
            'Pclass', 
            'Fare', 
            'Cabin', 
            'C', 'Q', 'S']

X_train = train[train_on].astype('float').to_numpy()
Y_train = np.array([train['Survived'].astype('int').to_numpy()]).T

print(X_train)
print(Y_train)

X_test = test[train_on].astype('float').to_numpy()
Y_test = np.array([test_survived['Survived'].astype('int').to_numpy()]).T

print(X_test)
print(Y_test)

[[-0.5300051   1.          0.         ...  0.          0.
   1.        ]
 [ 0.57143041  0.          1.         ...  1.          0.
   0.        ]
 [-0.25464622  0.          1.         ...  0.          0.
   1.        ]
 ...
 [ 0.          0.          1.         ...  0.          0.
   1.        ]
 [-0.25464622  1.          0.         ...  1.          0.
   0.        ]
 [ 0.1583921   1.          0.         ...  0.          1.
   0.        ]]
[[0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 

Transform into polynomial basis

In [118]:
from sklearn.preprocessing import PolynomialFeatures
poly_basis = PolynomialFeatures(degree=poly)
X_train = poly_basis.fit_transform(X_train)
X_test = poly_basis.fit_transform(X_test)

# Testing various models

In [121]:
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import VarianceScaling

m = ()
for scalingFactor in [0.01, 1, 10, 25, 50, 75, 100]:
    initializers = [
        VarianceScaling(scale=scalingFactor, mode='fan_in',distribution='untruncated_normal'),
        VarianceScaling(scale=scalingFactor, mode='fan_in',distribution='untruncated_normal'),
        VarianceScaling(scale=scalingFactor, mode='fan_in',distribution='untruncated_normal'),
        VarianceScaling(scale=scalingFactor, mode='fan_in',distribution='untruncated_normal')
    ]
    
    layers = [Dense(200, activation='relu', kernel_initializer=initializers[0]), 
              Dense(50, activation='sigmoid', kernel_initializer=initializers[1]), 
              Dense(20, activation='relu', kernel_initializer=initializers[2]), 
              Dense(1, activation='sigmoid', kernel_initializer=initializers[3])
              ]

    model = Sequential()

    for layer in layers: model.add(layer)
    model.compile(loss=BinaryCrossentropy())
    model.fit(X_train, Y_train, validation_split=.5, epochs=100, shuffle=True, callbacks=[EarlyStopping(min_delta=0.01, patience=20)], verbose=True)

    Y_test_hat = np.round(\
        model.predict(X_train, verbose=0)
    )
    acc = np.sum(Y_test_hat == Y_train) / Y_train.shape[0]
    
    if len(m) == 0: m = (model, acc)
    elif acc > m[1]: m = (model, acc)

Y_test_hat = np.round(\
        m[0].predict(X_test, verbose=0)
)
    

Epoch 1/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.6913 - val_loss: 0.6849
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.6859 - val_loss: 0.6777
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6839 - val_loss: 0.6716
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.6736 - val_loss: 0.6663
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6691 - val_loss: 0.6598
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6622 - val_loss: 0.6514
Epoch 7/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6631 - val_loss: 0.6420
Epoch 8/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6392 - val_loss: 0.6332
Epoch 9/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━

Put out to CSV

In [None]:
pred = pd.DataFrame(Y_test_hat, columns=["Survived"]).astype('int')
new_indices = pd.Index([i for i in range(train.shape[0] + 1, train.shape[0] + test.shape[0] + 1)])
pred = pred.set_index(new_indices)
pred.to_csv("submission.csv", index_label="PassengerID")