## Load normalized data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from keras import models
from keras import layers
import numpy as np

# Read the normalized dataset from CSV; the file's first column becomes the row index.
normalized_data_path = "Output/normalized_data.csv"
normalized_data = pd.read_csv(normalized_data_path, index_col=0)

# Leave the frame as the cell's last expression so the notebook renders it.
normalized_data

Using TensorFlow backend.


FileNotFoundError: [Errno 2] File b'Output/normalized_data.csv' does not exist: b'Output/normalized_data.csv'

## Counting empty values

In [None]:
# Pair every column label with the number of missing entries in that column.
empty_values = []
for label in normalized_data.columns.values:
    missing_count = normalized_data[label].isna().sum()
    empty_values.append((label, missing_count))

# Display the pairs ordered from the most to the fewest missing entries.
sorted(empty_values, key=lambda pair: pair[1], reverse=True)

## Separate samples with and without age

In [None]:
# Split the rows by whether Age is present. Rows with a known Age are used to
# train the age regressor; rows without it get their Age filled in later.
age_is_known = normalized_data.Age.notna()

# Take explicit copies: the subset without Age is modified further down, and
# writing into a boolean-indexed slice of `normalized_data` raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
samples_with_age = normalized_data[age_is_known].copy()
samples_without_age = normalized_data[~age_is_known].copy()

## Creating model for age prediction

In [None]:
def build_age_model(number_of_features):
    """Return a newly initialised, compiled regression network for Age.

    The network stacks a ReLU layer as wide as the input, a 12-unit ReLU
    layer and a single linear output unit. It is compiled with the RMSprop
    optimizer, MSE loss and MAE as the reported metric.
    """
    network = models.Sequential()
    network.add(layers.Dense(number_of_features, activation="relu", input_shape=(number_of_features,)))
    network.add(layers.Dense(12, activation="relu"))
    network.add(layers.Dense(1))
    network.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
    return network


# Features are every column except the target (Age) and Survived.
x_train_age = samples_with_age.drop(columns=["Age", "Survived"]).values
y_train_age = samples_with_age["Age"].values

number_of_epochs = 80
number_of_folds = 5
# Size of one validation fold. Rows left over by the integer division are
# never validated on; they stay in the training part of every fold.
number_of_samples = len(x_train_age) // number_of_folds

# One list of per-epoch MAE values per fold, for training and validation.
all_mae_histories = []
all_val_mae_histories = []
for fold in range(number_of_folds):
    print("processing fold #", fold)
    fold_start = fold * number_of_samples
    fold_stop = fold_start + number_of_samples

    # Validation data: the contiguous slice belonging to this fold.
    fold_x_validation = x_train_age[fold_start:fold_stop]
    fold_y_validation = y_train_age[fold_start:fold_stop]

    # Training data: everything before and after the validation slice.
    fold_x_train = np.concatenate([x_train_age[:fold_start], x_train_age[fold_stop:]])
    fold_y_train = np.concatenate([y_train_age[:fold_start], y_train_age[fold_stop:]])

    # A new model per fold, so no weights carry over between folds.
    model = build_age_model(x_train_age.shape[1])

    history = model.fit(
        fold_x_train,
        fold_y_train,
        epochs=number_of_epochs,
        batch_size=16,
        validation_data=(fold_x_validation, fold_y_validation),
    )
    all_mae_histories.append(history.history['mae'])
    all_val_mae_histories.append(history.history['val_mae'])



In [None]:
# Average the per-epoch MAE across folds. Each history holds one value per
# epoch, so reducing over axis 0 gives one mean per epoch.
average_mae_history = np.mean(all_mae_histories, axis=0)
average_val_mae_history = np.mean(all_val_mae_histories, axis=0)

epochs = range(1, number_of_epochs + 1)

plt.plot(epochs, average_mae_history, "b", label="Training MAE")
# The validation curve used to pass both the "b" format string and c="red".
# Matplotlib lets the keyword win, so the line was already red; the colour is
# now specified once, which also avoids the redundant-colour warning.
plt.plot(epochs, average_val_mae_history, color="red", label="Validation MAE")
plt.title("Training and validation MAE")
plt.xlabel("Epochs")
plt.ylabel("MAE")
plt.legend()

plt.show()

## Predict missing ages

In [None]:
# NOTE(review): `model` is whichever network the cross-validation loop built
# last, i.e. it was fitted without that fold's validation slice. Consider
# training a final model on all rows with a known Age before predicting.
features_without_age = samples_without_age.drop(["Survived", "Age"], axis=1).values
results_age = model.predict(features_without_age)

# The prediction is a 2-D array (the network ends in a single-unit Dense
# layer), so flatten it to one value per row. `assign` builds a new frame
# rather than writing into a filtered slice, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
samples_without_age = samples_without_age.assign(Age=results_age.ravel())
samples_without_age

## Save data

In [None]:
# Stack the rows that already had an Age on top of the rows whose Age was
# predicted. The result is ordered by group, not by original row position.
predicted_age_output_path = "Output/normalized_data_with_predicted_age.csv"
normalized_data_with_predicted_age = pd.concat((samples_with_age, samples_without_age), axis=0)

# Write the combined frame to disk, including the row index.
normalized_data_with_predicted_age.to_csv(predicted_age_output_path, index=True)
normalized_data_with_predicted_age

## Compare skewness of original and predicted Age

In [None]:
import matplotlib.pyplot as plt

# Factor used to bring Age back to years.
# NOTE(review): presumably Age was normalized by dividing by 80 — confirm
# against the normalization step.
scalar = 80

# One-year-wide bins covering 0..scalar inclusive. The previous edges,
# range(scalar), stopped at 79, so scaled values in (79, 80] fell outside
# every bin and were silently left out of the histograms.
age_bins = range(scalar + 1)

# Draw both distributions with identical bins and labels for easy comparison.
for ages, title in (
    (samples_with_age.Age * scalar, "Original Age"),
    (results_age * scalar, "Predicted Age"),
):
    plt.hist(ages, age_bins)
    plt.title(title)
    plt.xlabel("Age")
    plt.ylabel("Number of samples")
    plt.show()