In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
from tensorflow import keras

In [2]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_df['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [4]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
nan_columns = train_df.columns[train_df.isnull().any()].tolist()
nan_columns

['Age', 'Cabin', 'Embarked']

In [6]:
for column in nan_columns:    
    print(f"% of null values in {column}: "
          f"{round(train_df[column].isnull().sum()/len(train_df[column])*100, 3)}")

% of null values in Age: 19.865
% of null values in Cabin: 77.104
% of null values in Embarked: 0.224


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_df.drop(columns='Survived'), 
                                                    train_df['Survived'],
                                                    test_size=0.3,
                                                    random_state=1234)

In [8]:
from sklearn.model_selection import StratifiedKFold

seed=1234
kfold = StratifiedKFold(n_splits=5)

In [9]:
sex_dict = {'female': 1, 'male': 0}

In [10]:
X_train_filtered = X_train[['Pclass', 'Fare', 'Age', 'SibSp', 'Parch', 'Sex']]
X_train_filtered = X_train_filtered.replace(sex_dict)

In [11]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_filtered = imp.fit_transform(X_train_filtered)

In [12]:
X_test_filtered = X_test[['Pclass', 'Fare', 'Age', 'SibSp', 'Parch', 'Sex']]
X_test_filtered = X_test_filtered.replace(sex_dict)
X_test_filtered = imp.fit_transform(X_test_filtered)

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn import  metrics


models = []

pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC())])

param_grid = {
            'preprocessing': [StandardScaler(), None],
            'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_1 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)

grid_1.fit(X_train_filtered, y_train)
grid_1.best_params_

y_test_pred = cross_val_predict(grid_1.best_estimator_, X_test_filtered, y_test, cv=3)
print(f"R^2: {metrics.precision_score(y_test, y_test_pred)}") 
print(f"Recall_score: {metrics.recall_score(y_test, y_test_pred)}") 

R^2: 0.7976190476190477
Recall_score: 0.6568627450980392


In [17]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_filtered, y_train)
models.append(log_reg)

In [19]:
y_test_pred = cross_val_predict(log_reg, X_test_filtered, y_test, cv=3)
print(f"R^2: {metrics.precision_score(y_test, y_test_pred)}") 
print(f"Recall_score: {metrics.recall_score(y_test, y_test_pred)}") 

R^2: 0.813953488372093
Recall_score: 0.6862745098039216


In [20]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=1000)
random_forest.fit(X_train_filtered, y_train)

Y_prediction = random_forest.predict(X_test_filtered)

print("R^2: {}".format(metrics.precision_score(y_test, random_forest.predict(X_test_filtered)) ))
print("recall_score: {}".format( metrics.recall_score(y_test, random_forest.predict(X_test_filtered)) ))

R^2: 0.7701149425287356
recall_score: 0.6568627450980392


In [21]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=10), 
    n_estimators=1, learning_rate=0.5, 
    algorithm="SAMME.R", random_state=42)
ada_clf.fit(X_train_filtered, y_train)
print("R^2: {}".format(metrics.precision_score(y_test, ada_clf.predict(X_test_filtered)) ))
print("recall_score: {}".format( metrics.recall_score(y_test, ada_clf.predict(X_test_filtered)) ))

R^2: 0.7948717948717948
recall_score: 0.6078431372549019


In [28]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import History

history = History()
model = keras.models.Sequential([
    keras.layers.Dense(300, activation="relu", input_shape=(X_train_filtered.shape[1],)),
    keras.layers.Dense(200, activation="swish"),
    keras.layers.Dense(100, activation="elu"),
    keras.layers.Dense(50, activation="elu"),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="Adam", metrics=["accuracy"])
history = model.fit(X_train_filtered, y_train, validation_data= (X_test_filtered, y_test), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Ostatecznie wybrałam model na sieci z kerasa.

In [29]:
test_df = pd.read_csv("test.csv")
test = test_df[['Pclass', 'Fare', 'Age', 'SibSp', 'Parch', 'Sex']]
test = test.replace(sex_dict)
test = imp.fit_transform(test)

In [30]:
y_prob = model.predict(test) 



In [31]:
y_classes = abs(y_prob.round()).astype('int').squeeze()

In [32]:
output = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': y_classes})

In [33]:
output.to_csv('results.csv', index=False)