<a href="https://colab.research.google.com/github/victorgau/Python_ML_DL/blob/master/4-02%20titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic 生存分析

In [0]:
!wget https://github.com/victorgau/Python_ML_DL/raw/master/kaggle/titanic/train.csv -o /dev/null
!wget https://github.com/victorgau/Python_ML_DL/raw/master/kaggle/titanic/test.csv -o /dev/null
!wget https://github.com/victorgau/Python_ML_DL/raw/master/kaggle/titanic/gender_submission.csv -o /dev/null

## 載入需要的模組

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

import pandas as pd

## 讀入資料

In [0]:
# Read the three downloaded CSVs into DataFrames, in the order
# train / test / sample submission.
train, test, submit = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

## 先看一下資料內容

In [0]:
# Preview the first few rows of the training data.
train.head()

## 看看缺失值的狀況

In [0]:
# Column dtypes and non-null counts for the training set; columns whose
# non-null count is below the row count contain missing values.
train.info()

In [0]:
# Same missing-value overview for the test set.
test.info()

## 看看資料的分布狀況

In [0]:
# Summary statistics of the training set.
train.describe()

In [0]:
# Summary statistics of the test set.
test.describe()

## 從上面觀察的結果，選取自己想要的特徵

In [0]:
# Columns used as model inputs. Non-numeric ones among them are one-hot
# encoded later with pd.get_dummies.
selected_features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Sex',
    'Embarked',
]

In [0]:
# Take an explicit copy of the selected feature columns so the cleaning steps
# that follow work on an independent frame rather than on a slice of `train`
# (avoids pandas' SettingWithCopyWarning and keeps the raw data untouched).
X_train = train[selected_features].copy()

In [0]:
# Target vector: the Survived column of the training set.
y_train = train['Survived']

In [0]:
# Same feature columns for the test set, copied so that later cleaning does
# not operate on a slice of `test`.
X_test = test[selected_features].copy()

## 處理缺失值

In [0]:
# Non-null counts of the selected training features before imputation.
X_train.info()

In [0]:
# Non-null counts of the selected test features before imputation.
X_test.info()

In [0]:
# Impute missing values by assigning new frames instead of calling
# fillna(inplace=True) on a selected column: that chained in-place form
# raises SettingWithCopyWarning and leaves the frame unchanged under pandas
# copy-on-write.
#   train: Age -> mean of the training Age column, Embarked -> 'S'
#   test:  Age -> mean of the test Age column, Fare -> mean of the test Fare column
X_train = X_train.fillna({'Age': X_train['Age'].mean(), 'Embarked': 'S'})
X_test = X_test.fillna({'Age': X_test['Age'].mean(), 'Fare': X_test['Fare'].mean()})

In [0]:
# Re-check non-null counts after imputation.
X_train.info()

## 將字串類型的類別資料做轉換

In [0]:
# One-hot encode the non-numeric columns of the training features;
# numeric columns pass through unchanged.
X_train = pd.get_dummies(X_train)

In [0]:
# Inspect the encoded training features.
X_train.head()

In [0]:
# One-hot encode the test features, then align them to the (already encoded)
# training columns. Encoding the two frames independently can produce
# different column sets or orders when a category is missing from one of
# them, which would break predict(). Columns absent from the test set are
# filled with 0; columns unknown to the training set are dropped.
# Run this cell after X_train has been encoded.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [0]:
# Inspect the encoded test features.
X_test.head()

## 使用 Random Forest 來做分類

In [0]:
# Random forests are stochastic; fixing random_state makes the
# cross-validation score and the resulting submission reproducible.
rfc = RandomForestClassifier(random_state=42)

## 使用 K-Fold Cross Validation 看一下分類的準確度

In [0]:
# Mean score over 10-fold cross-validation of the random forest on the
# training set.
cross_val_score(estimator=rfc, X=X_train, y=y_train, cv=10).mean()

## 訓練模型

In [0]:
# Fit the random forest on the full training set.
rfc.fit(X=X_train, y=y_train)

## 用訓練好的模型來做預測

In [0]:
# Predicted Survived labels for the test passengers.
survived_predict = rfc.predict(X=X_test)

## 將結果輸出成規定的格式

In [0]:
# Two-column frame pairing each test PassengerId with its predicted label.
submission = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': survived_predict,
    }
)

In [0]:
# Write the predictions to disk; index=False keeps the row index out of the file.
submission.to_csv('submission.csv', index=False)

## 試試看 XGBoost

In [0]:
from xgboost import XGBClassifier

In [0]:
# XGBoost classifier with the library's default hyperparameters.
xgbc = XGBClassifier()

In [0]:
# Mean score over 10-fold cross-validation of the XGBoost classifier on the
# training set.
cross_val_score(estimator=xgbc, X=X_train, y=y_train, cv=10).mean()

In [0]:
# Fit the XGBoost classifier on the full training set.
xgbc.fit(X_train, y_train)

In [0]:
# Predicted labels from XGBoost; rebinds survived_predict, replacing the
# random forest predictions.
survived_predict = xgbc.predict(X_test)

In [0]:
# Two-column frame pairing each test PassengerId with the XGBoost prediction.
submission = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': survived_predict,
    }
)

In [0]:
# NOTE: same file name as the random forest export earlier in the notebook,
# so this overwrites that file with the XGBoost predictions.
submission.to_csv('submission.csv', index=False)

## 試試看使用 Keras

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [0]:
# Start from an empty sequential model; layers are appended in the next cell.
model = Sequential()

In [0]:
# Two hidden ReLU layers (40 and 30 units) followed by one sigmoid output
# unit for binary classification.
# The input width is read from the encoded training frame instead of being
# hard-coded to 10, so the network keeps matching the data if the selected
# features or their categories change.
# NOTE: re-running this cell without recreating `model` appends the layers
# a second time.
model.add(Dense(units=40, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=30, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

In [0]:
# Binary cross-entropy loss, Adam optimizer, accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [0]:
# Keras needs a homogeneous numeric array. Recent pandas versions make
# get_dummies emit bool columns, and a frame mixing bool with numeric columns
# converts to an object array that cannot be turned into a tensor, so cast
# the features to float32 explicitly before fitting.
# validation_split=0.1 holds out 10% of the training rows for validation.
train_history = model.fit(
    x=X_train.astype('float32').values,
    y=y_train,
    validation_split=0.1,
    epochs=30,
    batch_size=30,
)

In [0]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output unit the equivalent is thresholding the predicted
# probability at 0.5. Features are cast to float32 for the same reason as in
# the fit step (Keras needs a homogeneous numeric array).
(model.predict(X_test.astype('float32').values) > 0.5).astype('int32')