### Titanic data

- 파일 업로드

In [273]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [274]:
df = sns.load_dataset('titanic')
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


### 1. 데이터 전처리

- feature selection

In [275]:
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked', 'deck']]
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,deck
0,0,3,male,22.0,1,0,S,
1,1,1,female,38.0,1,0,C,C
2,1,3,female,26.0,0,0,S,


- 결측치 처리

In [276]:
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
embarked      2
deck        688
dtype: int64

In [277]:
# age는 age의 평균으로 대체
df.age.fillna(df.age.mean(), inplace=True)

In [278]:
# embarked 컬럼은 최빈값으로 대체
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [279]:
df.embarked.fillna('S', inplace=True)

In [280]:
# deck 컬럼은 삭제
df.drop(columns=['deck'], inplace=True)

In [281]:
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S


- 카테고리 값을 숫자로 변환하기

In [282]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.sex = le.fit_transform(df.sex)               # 0: female, 1: male
df.embarked = le.fit_transform(df.embarked)     # 0: C, 1: Q, 2: S
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked
0,0,3,1,22.0,1,0,2
1,1,1,0,38.0,1,0,0
2,1,3,0,26.0,0,0,2


In [283]:
df.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
embarked    0
dtype: int64

### 2. Train/Test dataset 분리

In [284]:
X = df.iloc[:,1:].values
y = df.survived.values

In [285]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
   X, y, stratify=y, test_size=0.2, random_state=2022
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 6), (179, 6), (712,), (179,))

In [286]:
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(X)
X_scaled.shape

(891, 6)

### 3. 모델 정의/설정

In [287]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [215]:
# 모델저장
import os
if not os.path.exists('model'):
    os.mkdir('model')

In [216]:
model_path = 'model/best_titanic.h5'

In [291]:
# 모델 학습 / 저장
from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(model_path, monitor='val_loss', verbose=1, save_best_only=True)

- model1: 은닉층 2개

In [289]:
model1 = Sequential([ 
    Dense(100, input_dim=6, activation='relu'),
    Dense(80, activation='relu'),
    Dense(1, activation='sigmoid')
])
model1.summary()

Model: "sequential_48"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_240 (Dense)           (None, 100)               700       
                                                                 
 dense_241 (Dense)           (None, 80)                8080      
                                                                 
 dense_242 (Dense)           (None, 1)                 81        
                                                                 
Total params: 8,861
Trainable params: 8,861
Non-trainable params: 0
_________________________________________________________________


In [290]:
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [292]:
history = model1.fit(X_train, y_train, validation_split=0.2, verbose=0,
                     epochs=100, batch_size=100,
                     callbacks=[checkpoint])
model1.evaluate(X_test, y_test)


Epoch 00001: val_loss improved from inf to 0.67782, saving model to model/titanic_two.h5

Epoch 00002: val_loss did not improve from 0.67782

Epoch 00003: val_loss improved from 0.67782 to 0.63400, saving model to model/titanic_two.h5

Epoch 00004: val_loss improved from 0.63400 to 0.62302, saving model to model/titanic_two.h5

Epoch 00005: val_loss did not improve from 0.62302

Epoch 00006: val_loss improved from 0.62302 to 0.60610, saving model to model/titanic_two.h5

Epoch 00007: val_loss did not improve from 0.60610

Epoch 00008: val_loss improved from 0.60610 to 0.58731, saving model to model/titanic_two.h5

Epoch 00009: val_loss did not improve from 0.58731

Epoch 00010: val_loss improved from 0.58731 to 0.58236, saving model to model/titanic_two.h5

Epoch 00011: val_loss improved from 0.58236 to 0.56395, saving model to model/titanic_two.h5

Epoch 00012: val_loss improved from 0.56395 to 0.56295, saving model to model/titanic_two.h5

Epoch 00013: val_loss improved from 0.56295

[0.4295872151851654, 0.832402229309082]

- model2: 은닉층 4개

In [297]:
model2 = Sequential([ 
    Dense(120, input_dim=6, activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(80, activation='relu'),
    Dense(1, activation='sigmoid')
])
model2.summary()

Model: "sequential_49"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_243 (Dense)           (None, 120)               840       
                                                                 
 dense_244 (Dense)           (None, 100)               12100     
                                                                 
 dense_245 (Dense)           (None, 100)               10100     
                                                                 
 dense_246 (Dense)           (None, 80)                8080      
                                                                 
 dense_247 (Dense)           (None, 1)                 81        
                                                                 
Total params: 31,201
Trainable params: 31,201
Non-trainable params: 0
_________________________________________________________________


In [298]:
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history2 = model2.fit(X_train, y_train, validation_split=0.2, verbose=0,
                    epochs=100, batch_size=100)

In [299]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(patience=20)

In [300]:
history2 = model2.fit(X_train, y_train, validation_split=0.2, verbose=2,
                 epochs = 200, batch_size=200, 
                 callbacks = [checkpoint, early_stopping])

Epoch 1/200

Epoch 00001: val_loss did not improve from 0.48740
3/3 - 0s - loss: 0.3589 - accuracy: 0.8506 - val_loss: 0.8590 - val_accuracy: 0.8042 - 67ms/epoch - 22ms/step
Epoch 2/200

Epoch 00002: val_loss did not improve from 0.48740
3/3 - 0s - loss: 0.3564 - accuracy: 0.8506 - val_loss: 0.8641 - val_accuracy: 0.7972 - 31ms/epoch - 10ms/step
Epoch 3/200

Epoch 00003: val_loss did not improve from 0.48740
3/3 - 0s - loss: 0.3590 - accuracy: 0.8541 - val_loss: 0.8522 - val_accuracy: 0.7902 - 33ms/epoch - 11ms/step
Epoch 4/200

Epoch 00004: val_loss did not improve from 0.48740
3/3 - 0s - loss: 0.3568 - accuracy: 0.8559 - val_loss: 0.8647 - val_accuracy: 0.7972 - 39ms/epoch - 13ms/step
Epoch 5/200

Epoch 00005: val_loss did not improve from 0.48740
3/3 - 0s - loss: 0.3545 - accuracy: 0.8506 - val_loss: 0.8674 - val_accuracy: 0.7972 - 39ms/epoch - 13ms/step
Epoch 6/200

Epoch 00006: val_loss did not improve from 0.48740
3/3 - 0s - loss: 0.3601 - accuracy: 0.8506 - val_loss: 0.8796 - va

In [301]:
model2.evaluate(X_test, y_test)



[0.4088822305202484, 0.826815664768219]

- 은닉층 6개

In [306]:
model3 = Sequential([ 
    Dense(120, input_dim=6, activation='relu'),
    Dense(120, activation='relu'),
    Dense(100, activation='relu'),    
    Dense(100, activation='relu'),
    Dense(60, activation='relu'),
    Dense(48, activation='relu'),
    Dense(1, activation='sigmoid')
])
model3.summary()

Model: "sequential_51"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_255 (Dense)           (None, 120)               840       
                                                                 
 dense_256 (Dense)           (None, 120)               14520     
                                                                 
 dense_257 (Dense)           (None, 100)               12100     
                                                                 
 dense_258 (Dense)           (None, 100)               10100     
                                                                 
 dense_259 (Dense)           (None, 60)                6060      
                                                                 
 dense_260 (Dense)           (None, 48)                2928      
                                                                 
 dense_261 (Dense)           (None, 1)               

In [307]:
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history3 = model3.fit(X_train, y_train, validation_split=0.2, verbose=0,
                    epochs=100, batch_size=100)
model3.evaluate(X_test, y_test)



[0.41927123069763184, 0.832402229309082]

In [308]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(patience=20)

In [309]:
model_path = 'model/titanic_two.h5'
checkpoint = ModelCheckpoint(
    model_path, monitor='val_loss', verbose=0, save_best_only=True
)

In [310]:
history3 = model3.fit(X_train, y_train, validation_split=0.2, verbose=2,
                 epochs = 200, batch_size=200, 
                 callbacks = [checkpoint, early_stopping])

Epoch 1/200
3/3 - 0s - loss: 0.3577 - accuracy: 0.8524 - val_loss: 0.8201 - val_accuracy: 0.7902 - 108ms/epoch - 36ms/step
Epoch 2/200
3/3 - 0s - loss: 0.3573 - accuracy: 0.8524 - val_loss: 0.8288 - val_accuracy: 0.7902 - 37ms/epoch - 12ms/step
Epoch 3/200
3/3 - 0s - loss: 0.3559 - accuracy: 0.8489 - val_loss: 0.8453 - val_accuracy: 0.7972 - 39ms/epoch - 13ms/step
Epoch 4/200
3/3 - 0s - loss: 0.3553 - accuracy: 0.8471 - val_loss: 0.8408 - val_accuracy: 0.7902 - 39ms/epoch - 13ms/step
Epoch 5/200
3/3 - 0s - loss: 0.3548 - accuracy: 0.8489 - val_loss: 0.8449 - val_accuracy: 0.7972 - 43ms/epoch - 14ms/step
Epoch 6/200
3/3 - 0s - loss: 0.3535 - accuracy: 0.8506 - val_loss: 0.8460 - val_accuracy: 0.8042 - 47ms/epoch - 16ms/step
Epoch 7/200
3/3 - 0s - loss: 0.3569 - accuracy: 0.8506 - val_loss: 0.8484 - val_accuracy: 0.7832 - 41ms/epoch - 14ms/step
Epoch 8/200
3/3 - 0s - loss: 0.3541 - accuracy: 0.8541 - val_loss: 0.8571 - val_accuracy: 0.7902 - 39ms/epoch - 13ms/step
Epoch 9/200
3/3 - 0s - 

In [311]:
model3.evaluate(X_test, y_test)



[0.4224105179309845, 0.832402229309082]