In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

#特徵工程
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [110]:
#去除無用數據欄
train_data = train_data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [111]:
#去除部分資料缺失的數據
train_data = train_data.dropna(subset=["Embarked", "Age"])
train_data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [112]:
pd.set_option('future.no_silent_downcasting', True) #消除錯誤資訊用

#將非數字數據轉成數字
train_data["Sex"] = train_data["Sex"].replace({'male': 0, 'female': 1}).astype(int)
train_data["Embarked"] = train_data["Embarked"].replace({'S': 1, 'C': 2, 'Q': 3}).astype(int)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,1
1,1,1,1,38.0,1,0,71.2833,2
2,1,3,1,26.0,0,0,7.925,1
3,1,1,1,35.0,1,0,53.1,1
4,0,3,0,35.0,0,0,8.05,1


In [113]:
X = train_data.drop('Survived', axis=1).to_numpy()
y = train_data['Survived'].to_numpy()
X.shape, y.shape

((712, 7), (712,))

In [114]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

#將部分數據當成驗證集
from sklearn.model_selection import train_test_split
X_train, X_ans, y_train, y_ans = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.shape, X_ans.shape, y_train.shape, y_ans.shape

((569, 7), (143, 7), (569,), (143,))

In [115]:
tf.random.set_seed(42)

#定義神經網路
model_1 = tf.keras.Sequential([
           tf.keras.layers.Dense(7, activation='relu'),
           tf.keras.layers.Dense(10, activation='relu'),
           tf.keras.layers.Dense(30, activation='relu'),
           tf.keras.layers.Dense(2, activation='softmax')
])

model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                 optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                 metrics=['accuracy'])

history = model_1.fit(X_train, 
                      tf.one_hot(y_train, depth=2), 
                      epochs=250,
                      verbose = 1,
                      validation_data=(X_ans, tf.one_hot(y_ans, depth=2)))

Epoch 1/250
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.4416 - loss: 0.7008 - val_accuracy: 0.4336 - val_loss: 0.7017
Epoch 2/250
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4789 - loss: 0.6969 - val_accuracy: 0.4545 - val_loss: 0.6990
Epoch 3/250
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4990 - loss: 0.6932 - val_accuracy: 0.4825 - val_loss: 0.6963
Epoch 4/250
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5608 - loss: 0.6897 - val_accuracy: 0.4755 - val_loss: 0.6939
Epoch 5/250
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5832 - loss: 0.6864 - val_accuracy: 0.5175 - val_loss: 0.6914
Epoch 6/250
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6024 - loss: 0.6831 - val_accuracy: 0.5385 - val_loss: 0.6892
Epoch 7/250
[1m18/18[0m [32m━━

In [116]:
#處理測試集
test_data = pd.read_csv("test.csv")
ID = test_data["PassengerId"].values
test_data = test_data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
test_data["Sex"] = test_data["Sex"].replace({'male': 0, 'female': 1}).astype(int)
test_data["Embarked"] = test_data["Embarked"].replace({'S': 1, 'C': 2, 'Q': 3}).astype(int)
#將 Age 部分缺失的資料用平均值填補
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

sc = StandardScaler()
test_data = sc.fit_transform(test_data)
ans = model_1.predict(test_data).argmax(axis = 1)
output = pd.DataFrame({'PassengerId':ID, 'Survived': ans})
output.to_csv('submission.csv', index=False)
output

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [117]:
#查看測試集正確率
TRUE_ANS = pd.read_csv("gender_submission.csv")["Survived"].to_numpy()
acc = np.mean(ans == TRUE_ANS) * 100
print(f"Accuracy: {acc:.2f}%")

Accuracy: 90.43%
