In [None]:
# Environment check: print the metadata of the installed tensorflow package.
# The upgrade command on the next line is commented out and does not run.
#!pip install --upgrade tensorflow
!pip show tensorflow

Name: tensorflow
Version: 2.1.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /usr/local/lib/python3.6/dist-packages
Requires: astor, wheel, keras-applications, absl-py, numpy, six, termcolor, gast, protobuf, tensorboard, wrapt, google-pasta, scipy, keras-preprocessing, grpcio, tensorflow-estimator, opt-einsum
Required-by: stable-baselines, magenta, fancyimpute


# Setup: imports and data upload

In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

In [None]:
# Colab upload helper. The returned `uploaded` object is indexed by file name
# further down ('train.csv', 'test.csv') and each entry is wrapped in a
# BytesIO, so it presumably maps file names to raw bytes — TODO confirm.
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


### Load data ###

In [None]:
## Load data ##
import io
# Parse the uploaded CSV payloads from in-memory buffers.
# train_data is expected to carry the "Survived" column used as the target below.
train_data = pd.read_csv(io.BytesIO(uploaded['train.csv']))
test_data = pd.read_csv(io.BytesIO(uploaded['test.csv']))

In [None]:
### Data preparation ###
# Target vector, plus independent feature frames so the loaded frames
# (train_data / test_data) are never modified by the feature engineering.
train_y = train_data["Survived"]
train_X = train_data.drop(columns=["Survived"])
test_X = test_data.copy()
train_X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
print('*** train_X is null ***')
# Per-column count of missing values; note this inspects the loaded
# train_data frame (which still contains "Survived"), not train_X.
train_data.isna().sum()

*** train_X is null ***


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:

print('*** test_X is null ***')
# Per-column count of missing values in the loaded test_data frame.
test_data.isna().sum()

*** test_X is null ***


PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

### Features preparation ###

In [None]:
from sklearn import preprocessing
# Single LabelEncoder instance reused (and re-fitted) by the feature cells below.
le = preprocessing.LabelEncoder()

In [None]:
## PassengerId ##
# Remove the identifier column from both feature frames.
# test_data keeps its own PassengerId column, which the submission cell reads.
train_X.drop(columns=["PassengerId"], inplace=True)
test_X.drop(columns=["PassengerId"], inplace=True)

In [None]:
## Cabin ##
def _get_cabin_prefix(cabin):
    return cabin[0]

def create_cabin_features(df):
    """Replace the "Cabin" column of ``df`` with two derived features (in place).

    Adds:
      * "CabinNotNull" - 1 when a cabin value was present, 0 when it was missing.
      * "Cabin_Prefix" - first character of the cabin value, '-' for missing.

    The "Cabin" column is dropped afterwards. Returns None.
    """
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the df['Cabin'] selection: the in-place form acts on an intermediate
    # object and does not reliably update df (pandas copy-on-write), which
    # would let NaN reach the prefix extraction.
    cabin = df["Cabin"].fillna('-')
    df["CabinNotNull"] = (cabin != '-').astype(int)
    df["Cabin_Prefix"] = cabin.str[0]
    df.drop(["Cabin"], axis=1, inplace=True)
    
# Cabin-derived features are currently disabled (create_cabin_features is not
# called and no Cabin_Prefix encoding is done); the raw column is simply
# removed from both feature frames.
train_X.drop(columns=['Cabin'], inplace=True)
test_X.drop(columns=['Cabin'], inplace=True)

In [None]:
## Parch ##
#train_X.drop(['Parch'],axis=1, inplace=True)
#test_X.drop(['Parch'],axis=1, inplace=True)

In [None]:
## Names ##

def _get_title(name):
    return name.split(',')[1].split('.')[0].strip()

def _get_first_name(name):
    return name.split(',')[0].strip()

def _get_name_len(name):
    return len(name)

def create_name_features(df):
    """Replace the "Name" column of ``df`` with derived features (in place).

    Adds "Title", "Name_1" (text before the first comma), "Name_3_is"
    (True when the name contains an opening parenthesis) and "Name_len"
    (character count), then drops "Name". Returns None.
    """
    def _has_parenthesis(name):
        return '(' in name

    names = df["Name"]
    df["Title"] = names.map(_get_title)
    df["Name_1"] = names.map(_get_first_name)
    df["Name_3_is"] = names.map(_has_parenthesis)
    df["Name_len"] = names.map(_get_name_len)
    df.drop(columns=["Name"], inplace=True)
    
create_name_features(train_X)
create_name_features(test_X)

# Encode every categorical name feature with ONE mapping shared by train and
# test. Fitting the encoder on each frame separately would assign different
# integers to the same title/surname in the two sets (the sets of distinct
# values differ), so the model would see inconsistent codes at predict time.
for col in ['Title', 'Name_1', 'Name_3_is']:
    le.fit(pd.concat([train_X[col], test_X[col]]))
    train_X[col] = le.transform(train_X[col])
    test_X[col] = le.transform(test_X[col])

In [None]:
## Sex ##
# Report missing values, then label-encode the column in each frame.
print("train is_null=", train_X['Sex'].isna().sum())
print("test  is_null=", test_X['Sex'].isna().sum())

train_X['Sex'] = le.fit_transform(train_X['Sex'])
test_X['Sex'] = le.fit_transform(test_X['Sex'])

train is_null= 0
test  is_null= 0


In [None]:
## Age ##
print("train is_null=", train_X.Age.isnull().sum())
print("test  is_null=", test_X.Age.isnull().sum())

# 1 when the age was recorded, 0 when it was missing. A direct cast keeps
# that meaning fixed in both frames; label-encoding the boolean would map a
# column containing only True to 0, silently flipping the flag.
train_X["AgeNotNull"] = train_X.Age.notnull().astype(int)
test_X["AgeNotNull"] = test_X.Age.notnull().astype(int)

def _fill_age(df):
    avg_age_class_title = df.groupby(["Pclass", "Title"]).Age.apply(lambda x: x.mode())
    avg_age_class = df.groupby(["Pclass"]).Age.apply(lambda x: x.mode())
    counter = 0
    for idx in range(len(df)):
        row = df.loc[idx, :]
        if pd.isna(row['Age']):
            counter += 1
            if row['Title'] in avg_age_class_title[row['Pclass']]:
                df.set_value(idx, 'Age', avg_age_class_title[row['Pclass']][row['Title']][0])
            else:
                df.set_value(idx, 'Age', avg_age_class[row['Pclass']][0])
    #print('counter:', counter)
    
# Impute missing ages in place; each frame is filled from its own values.
_fill_age(train_X)
_fill_age(test_X)

train is_null= 177
test  is_null= 86


  app.launch_new_instance()


In [None]:
## Ticket ##
def _get_ticket_prefix(ticket):
    # Returns ticket Prefix if found or str 'No'
    ticket.strip()
    if ' ' in ticket:
        return ticket.split(' ')[0].strip()
    return 'No'
    
def _get_ticket_number(ticket):
    # Returns ticket number
    ticket.strip()
    if ' ' in ticket:
        return ticket.split(' ')[1].strip()
    return ticket
    
def _get_ticket_flag(ticket):
    # Returns flag 1 if prefix found or 0 if not
    ticket.strip()
    return ' ' in ticket

def create_ticket_features(df):
    """Replace the "Ticket" column of ``df`` with derived features (in place).

    Adds "Ticket_Number" and "Ticket_Flag" (see _get_ticket_number and
    _get_ticket_flag), then drops "Ticket". The "Ticket_Prefix" feature is
    not generated. Returns None.
    """
    tickets = df["Ticket"]
    df["Ticket_Number"] = tickets.map(_get_ticket_number)
    df["Ticket_Flag"] = tickets.map(_get_ticket_flag)
    df.drop(columns=["Ticket"], inplace=True)

create_ticket_features(train_X)
create_ticket_features(test_X)

# Encode ticket numbers with one mapping fitted on train and test together.
# Fitting per frame would give the same ticket number different codes in the
# two sets, making the feature meaningless at prediction time.
le.fit(pd.concat([train_X['Ticket_Number'], test_X['Ticket_Number']]))
train_X['Ticket_Number'] = le.transform(train_X['Ticket_Number'])
test_X['Ticket_Number'] = le.transform(test_X['Ticket_Number'])

In [None]:
## Embarked ##
# Missing ports are filled with 'S' (presumably the most frequent port in the
# training data - verify with train_X.Embarked.mode()).
# The filled column is assigned back: fillna(inplace=True) on a df[...]
# selection works on an intermediate object and does not reliably update the
# frame under pandas copy-on-write.
train_X['Embarked'] = train_X['Embarked'].fillna('S')
test_X['Embarked'] = test_X['Embarked'].fillna('S')

# One encoder fit on both frames so a port gets the same code everywhere.
le.fit(pd.concat([train_X['Embarked'], test_X['Embarked']]))
train_X['Embarked'] = le.transform(train_X['Embarked'])
test_X['Embarked'] = le.transform(test_X['Embarked'])

In [None]:
## Fare ##
print("train is_null=", train_X.Fare.isnull().sum())
print("test  is_null=", test_X.Fare.isnull().sum())

# Fill the missing test fare with the most frequent test fare.
# Series.mode() returns a Series that may hold several values, so take the
# first (smallest) one explicitly instead of calling float() on the Series,
# and assign the result back rather than using a chained in-place fillna.
test_X['Fare'] = test_X['Fare'].fillna(test_X['Fare'].mode().iloc[0])

train is_null= 0
test  is_null= 1


In [None]:

# Preview the engineered training features before scaling.
train_X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Name_1,Name_3_is,Name_len,AgeNotNull,Ticket_Number,Ticket_Flag
0,3,1,22.0,1,0,7.25,2,11,73,0,23,1,162,True
1,1,0,38.0,1,0,71.2833,0,12,136,1,51,1,126,True
2,3,0,26.0,0,0,7.925,2,8,251,0,22,1,343,True
3,1,0,35.0,1,0,53.1,2,12,198,1,44,1,51,False
4,3,1,35.0,0,0,8.05,2,11,11,0,24,1,595,False


### Feature scaling ###

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance applied to the feature frames later in this cell.
standardScalerX = StandardScaler()

#train_X.head()
from sklearn.utils.class_weight import compute_class_weight
def _compute_class_weight_dictionary(y):
    """Return balanced class weights for ``y`` as a ``{class: weight}`` dict.

    Wraps sklearn's ``compute_class_weight("balanced", ...)``, which returns
    an array ordered like ``np.unique(y)``, and zips it with those classes.
    """
    classes = np.unique(y)
    # `classes` and `y` are keyword-only in current scikit-learn releases;
    # passing them by name also works on older versions.
    class_weight = compute_class_weight("balanced", classes=classes, y=y)
    return dict(zip(classes, class_weight))
# NOTE(review): cw is computed and displayed but not passed to any model.fit
# call in the visible cells.
cw = _compute_class_weight_dictionary(train_y)

# Fit the scaler on the training features only and reuse those statistics for
# the test features. Re-fitting on the test set would standardise it with a
# different mean/variance, so identical raw values would reach the model as
# different inputs.
train_X = pd.DataFrame(standardScalerX.fit_transform(train_X), columns=train_X.columns)
test_X = pd.DataFrame(standardScalerX.transform(test_X), columns=test_X.columns)

cw

{0: 0.8114754098360656, 1: 1.3026315789473684}

### Model ###

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import optimizers
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
## Convert dataframe to numpy array
# Plain arrays so the fold indices from StratifiedKFold can index them directly.
X, Y = train_X.to_numpy(), train_y.to_numpy()
print('X shape:', X.shape)
print('Y shape:', Y.shape)

X shape: (891, 14)
Y shape: (891,)


In [None]:

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# Keras draws weight initialisation and batch shuffling from TensorFlow's
# generator, so NumPy's seed alone does not make the runs repeatable.
tf.random.set_seed(seed)

# Cross validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cvscores = []

# Widths of the hidden ReLU layers, input side first.
hidden_units = [10, 9, 8, 7, 7, 6, 5, 4]

for idx_train, idx_test in kfold.split(X, Y):
  # create keras model (a fresh one per fold so no weights carry over);
  # the input width follows the data instead of a hard-coded column count
  model = Sequential()
  model.add(Dense(hidden_units[0], input_dim=X.shape[1], activation='relu'))
  for units in hidden_units[1:]:
    model.add(Dense(units, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))

  # compile the keras model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  # fit the keras model on the training part of the fold
  model.fit(X[idx_train], Y[idx_train], epochs=50, batch_size=10, verbose=0)

  # evaluate the model on the held-out part of the fold
  _, accuracy = model.evaluate(X[idx_test], Y[idx_test], verbose=0)
  print('Accuracy: %.2f' % (accuracy*100))
  cvscores.append(accuracy * 100)
print("Cross val accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

# NOTE: `model` is the network trained in the LAST fold only, and X contains
# the rows it was trained on, so the figure below is not a held-out estimate.
# Sequential.predict_classes is unavailable in newer TensorFlow releases;
# thresholding the sigmoid output at 0.5 gives the same class labels.
preds_val = (model.predict(X) > 0.5).astype("int32")
mae = mean_absolute_error(Y, preds_val)
print('mae:', mae)


Accuracy: 74.86
Accuracy: 82.02
Accuracy: 76.97
Accuracy: 80.34
Accuracy: 82.58
Cross val accuracy: 79.35% (+/- 2.98%)
mae: 0.11447811447811448


In [None]:
def get_mae(layers, nodes, epochs, train_X, val_X, train_y, val_y):
  """Train a dense binary classifier and return its validation error.

  Args:
    layers: total number of Dense layers including the output layer. Values
      below 2 build the same network as 2 (one hidden layer plus output).
    nodes: units in every hidden layer.
    epochs: number of training epochs.
    train_X, train_y: training features / 0-1 labels.
    val_X, val_y: validation features / 0-1 labels.

  Returns:
    Mean absolute error between ``val_y`` and the predicted 0/1 classes
    (for 0/1 labels this is the misclassification rate).
  """
  # Derive the input width from the data rather than hard-coding 14 columns.
  n_features = np.shape(train_X)[1]

  # create keras model
  model = Sequential()
  model.add(Dense(nodes, input_dim=n_features, activation='relu'))
  for _ in range(1, layers - 1):
    model.add(Dense(nodes, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))

  # compile the keras model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  # fit the keras model on the dataset
  model.fit(train_X, train_y, epochs=epochs, batch_size=10, verbose=0)

  # predict: threshold the sigmoid output at 0.5 (Sequential.predict_classes
  # is unavailable in newer TensorFlow releases and did the same thing)
  preds_val = (model.predict(val_X) > 0.5).astype("int32")
  return mean_absolute_error(val_y, preds_val)

def get_best_layers_count(nlayers, nodes, epochs, train_X, val_X, train_y, val_y):
  """Try every layer count in ``nlayers`` and return the best one.

  Each candidate is scored with get_mae using the given ``nodes`` and
  ``epochs``. Returns ``(best_layers_count, min_mae)``; on ties the candidate
  appearing first in ``nlayers`` wins.
  """
  scores = []
  for n_layers in nlayers:
    print('Trying', n_layers, 'layer,', nodes, 'nodes')
    scores.append(get_mae(n_layers, nodes, epochs, train_X, val_X, train_y, val_y))

  # Position of the first smallest score.
  best_idx = min(range(len(scores)), key=scores.__getitem__)
  return nlayers[best_idx], scores[best_idx]

### split Data to train and validation sets ###
tr_X, val_X, tr_y, val_y = train_test_split(train_X, train_y, random_state=1)

# get_mae adds extra hidden layers with range(1, layers - 1), so layer counts
# 0, 1 and 2 all build the identical network. Start at 2 to avoid training
# the same architecture three times for every (epochs, nodes) pair.
nlayers = list(range(2, 15))
nnodes = list(range(1, 15))
epochs = [10, 50, 100, 200, 300]
result_str_all = []
for epoch in epochs:
  for nodes in nnodes:
    best_layers_count, min_mae = get_best_layers_count(nlayers, nodes, epoch, tr_X, val_X, tr_y, val_y)
    result_str = "{} epoch \t {} layers \t {} nodes \t mae={}".format(epoch, best_layers_count, nodes, round(min_mae, 5))
    print(result_str)
    result_str_all.append(result_str)

# Summary of all runs, repeated in one place after the per-run progress output.
for x in result_str_all:
  print(x)


In [None]:
# evaluate the keras model
# NOTE(review): `model` here is the global left bound by the cross-validation
# cell (the last fold's network), and X includes rows it was trained on, so
# this accuracy is not a held-out estimate.
_, accuracy = model.evaluate(X, Y)
print('Model accuracy: %.2f' % (accuracy*100))

In [None]:
# make class predictions with the model
# Threshold the sigmoid output at 0.5 and flatten to a 1-D label vector;
# Sequential.predict_classes is unavailable in newer TensorFlow releases.
predictions = (model.predict(X) > 0.5).astype("int32").ravel()
# Creating the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y, predictions)
cm

array([[505,  44],
       [ 58, 284]])

In [None]:
# Predict survival for the prepared test features: threshold the sigmoid
# output at 0.5 and flatten to one label per passenger. The frame is converted
# to a plain array, matching what the model was trained on, and
# Sequential.predict_classes (unavailable in newer TensorFlow) is avoided.
predictions = (model.predict(test_X.to_numpy()) > 0.5).astype("int32").ravel()
# PassengerId comes from the loaded test_data frame, since it was dropped
# from test_X during feature preparation.
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})
submission.to_csv("submission.csv", index=False)
submission.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,1
