In [1]:
from tensorflow.keras import models, layers
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
df=pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df.drop(['PassengerId','Name', 'Ticket', 'Cabin', 'Embarked', 'Fare'], axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,22.0,1,0
1,1,1,female,38.0,1,0
2,1,3,female,26.0,0,0
3,1,1,female,35.0,1,0
4,0,3,male,35.0,0,0


In [6]:
df.shape

(891, 6)

In [7]:
df.isnull().sum().sort_values(ascending=False).head(20)

Age         177
Survived      0
Pclass        0
Sex           0
SibSp         0
Parch         0
dtype: int64

In [8]:
df=pd.get_dummies(df) #one-hot encoding 방식으로 펼쳐주니까 col개수 증가
df.shape

(891, 7)

In [9]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
imputed = imputer.fit_transform(df)
df_imputed=pd.DataFrame(imputed, columns = df.columns)
df_imputed

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male
0,0.0,3.0,22.0,1.0,0.0,0.0,1.0
1,1.0,1.0,38.0,1.0,0.0,1.0,0.0
2,1.0,3.0,26.0,0.0,0.0,1.0,0.0
3,1.0,1.0,35.0,1.0,0.0,1.0,0.0
4,0.0,3.0,35.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
886,0.0,2.0,27.0,0.0,0.0,0.0,1.0
887,1.0,1.0,19.0,0.0,0.0,1.0,0.0
888,0.0,3.0,29.6,1.0,2.0,1.0,0.0
889,1.0,1.0,26.0,0.0,0.0,0.0,1.0


In [10]:
X=df_imputed.iloc[:,1:]
y=df_imputed.iloc[:, 0]

In [11]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Sex_male
0,3.0,22.0,1.0,0.0,0.0,1.0
1,1.0,38.0,1.0,0.0,1.0,0.0
2,3.0,26.0,0.0,0.0,1.0,0.0
3,1.0,35.0,1.0,0.0,1.0,0.0
4,3.0,35.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...
886,2.0,27.0,0.0,0.0,0.0,1.0
887,1.0,19.0,0.0,0.0,1.0,0.0
888,3.0,29.6,1.0,2.0,1.0,0.0
889,1.0,26.0,0.0,0.0,0.0,1.0


In [12]:
y

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

In [13]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, shuffle=True)

In [14]:
model=models.Sequential()
model.add(layers.Dense(30, input_dim=6, activation='relu'))
model.add(layers.Dense(12, activation='relu'))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 30)                210       
                                                                 
 dense_1 (Dense)             (None, 12)                372       
                                                                 
 dense_2 (Dense)             (None, 8)                 104       
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 695
Trainable params: 695
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history=model.fit(X, y, epochs=200, batch_size=5, validation_split=0.25, verbose=False) #각 epoch 마다 검증

In [16]:
score=model.evaluate(X_test, y_test) #test
print('Test Accuracy: %.4f'%(score[1])) #accuracy

Test Accuracy: 0.8547
