## 將 Age 的遺漏值以中位數填補

In [1]:
import pandas as pd

# Download the training CSV and, in the same chain, replace missing Age values
# with the median of the Age column.
url = "https://storage.googleapis.com/py_ml_datasets/train.csv"
train = pd.read_csv(url).assign(
    Age=lambda df: df["Age"].fillna(df["Age"].median())
)

## 將 Embarked 的遺漏值以 S 填補

In [2]:
# Replace missing Embarked values with "S", then print the number of nulls
# remaining in every column as a check on the imputation so far.
train["Embarked"] = train["Embarked"].fillna("S")
# print is called as a function: the Python 2 statement form is a SyntaxError
# on Python 3, while this single-argument call prints the same text on both.
print(train.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


## Sex 與 Embarked 由類別型變數轉換為 One-hot encoding 的 dummy variables

In [3]:
# Target vector: the Survived column.
y = train["Survived"]
# Feature matrix: pick the predictor columns and let get_dummies expand the
# string-valued ones (Sex, Embarked) into one indicator column per category.
X = pd.get_dummies(
    train[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
)
# Preview the first rows of the encoded features.
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,0,1
4,3,35.0,0,0,8.05,0,1,0,0,1


## 將資料以 7:3 比例分割成訓練與測試資料

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set. random_state pins the shuffle so
# the same rows land in each split on every run; without it the split changes
# each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## 任意選三個分類器利用訓練資料建立 h

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Build three hypotheses (h1, h2, h3) from the training split. fit() hands back
# the estimator it was called on, so construction and training are chained.
h1 = LogisticRegression().fit(X_train, y_train)
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ from run to run; fixing random_state makes it repeatable.
h2 = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
h3 = KNeighborsClassifier().fit(X_train, y_train)

## 比較這三個分類器在測試資料的準確率

In [6]:
from sklearn.metrics import accuracy_score

# Predict the test split with each fitted classifier.
y_hat_h1 = h1.predict(X_test)
y_hat_h2 = h2.predict(X_test)
y_hat_h3 = h3.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
acc_h1 = accuracy_score(y_test, y_hat_h1)
acc_h2 = accuracy_score(y_test, y_hat_h2)
acc_h3 = accuracy_score(y_test, y_hat_h3)

labels = ["Logistic Regression", "Decision Tree", "KNN"]
accs = [acc_h1, acc_h2, acc_h3]

# Report each accuracy as a percentage with two decimals. print is called as a
# function: the Python 2 statement form is a SyntaxError on Python 3, while
# this single-argument call prints the same text on both.
for label, acc in zip(labels, accs):
    print("[%s] Accuracy: %.2f%%" % (label, acc * 100))

[Logistic Regression] Accuracy: 77.24%
[Decision Tree] Accuracy: 76.12%
[KNN] Accuracy: 71.64%
