# K-Nearest Neighbors（KNN）

## step1: 準備訓練資料

In [1]:
import pandas as pd

# Load the Titanic training and test splits with identical read settings.
train_df, test_df = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in ("titanic/train.csv", "titanic/test.csv")
)

### 觀察特徵類型：
- 無用：PassengerId(刪)
- 答案：Survived
- 數值：Age, SibSp, Parch, Fare
- 類別：Pclass(有序類別，下方未做 One-Hot Encoding，直接當數值使用), Name(取出稱謂 salutation), Sex, Ticket(刪，不確定如何使用), Cabin(刪，缺失值太多), Embarked

In [2]:
# Count missing values (NA) per column: isna() yields booleans and the
# column-wise sum treats False as 0 and True as 1.
train_df.isna().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Impute missing numeric values in the training data with the per-column median.
# numeric_only=True restricts the median to numeric columns; without it,
# pandas >= 2.0 raises TypeError because the frame still contains string
# columns (Name, Sex, Ticket, ...).
med = train_df.median(numeric_only=True)
train_df = train_df.fillna(med)

# Impute the test data with the medians computed from the training data, so
# that no statistic is derived from the test split.
test_df = test_df.fillna(med)

In [4]:
# Impute the categorical Embarked column of the training data with its most
# frequent value.
most = train_df["Embarked"].value_counts().idxmax()
train_df = train_df.fillna({"Embarked": most})

# Reuse the value found on the training data for the test data.
test_df = test_df.fillna({"Embarked": most})

In [5]:
def namecut(s):
    """Extract the salutation from a name string.

    Takes the text after the last comma, keeps the part before the first
    period in it, and strips surrounding whitespace.
    """
    after_comma = s.split(",")[-1]
    before_period = after_comma.split(".")[0]
    return before_period.strip()

# Count how many times each salutation occurs in the training names.
salutation = (
    train_df["Name"]
    .apply(namecut)
    .value_counts()
)
# Keep the salutations that occur more than 30 times.
reserved = salutation.loc[salutation > 30].index
reserved

Index(['Mr', 'Miss', 'Mrs', 'Master'], dtype='object')

In [6]:
def namecut_2(s):
    """Return the salutation of a name if it is one of the kept salutations.

    Parameters
    ----------
    s : str
        Name string in the form parsed by ``namecut`` (salutation after the
        last comma and before the following period).

    Returns
    -------
    str or None
        The salutation when it is contained in the module-level ``reserved``
        collection, otherwise ``None``.
    """
    # Reuse namecut so the parsing rule is defined in exactly one place.
    title = namecut(s)
    return title if title in reserved else None

# namecut_2 keeps the salutations found in `reserved` (those occurring more
# than 30 times in the training data) and maps every other one to None.
# reserved = ['Mr', 'Miss', 'Mrs', 'Master']
train_df["Name"], test_df["Name"] = (
    frame["Name"].apply(namecut_2) for frame in (train_df, test_df)
)

In [7]:
# One-hot encode the categorical features of both splits with the same settings.
train_raw, test_raw = (
    pd.get_dummies(frame, columns=["Name", "Sex", "Embarked"])
    for frame in (train_df, test_df)
)

In [8]:
# Data cleansing: split the encoded frames into features, label and the ids
# needed for the submission file.
x_train = train_raw.drop(columns=["PassengerId", "Survived", "Ticket", "Cabin"])
y_train = train_raw["Survived"]
x_test = test_raw.drop(columns=["PassengerId", "Ticket", "Cabin"])
# The two splits were one-hot encoded separately, so a category missing from
# one of them would give different column sets or order. Align the test
# features to the training columns: dummy columns absent from the test split
# become 0, and columns unknown to the training split are dropped.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)
testid = test_raw["PassengerId"]

### 資料特徵縮放 Feature Scaling
- 以「距離」為基礎的演算法，需要先做 scaling 讓各特徵的尺度/單位一致，否則數值範圍大的特徵會主導距離計算
- 例如：KMeans、KNN

In [9]:
# 資料特徵縮放 Feature Scaling
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features and reuse the fitted scaler for the
# test features. The tuple is evaluated left to right, so fit_transform runs
# before transform.
scaler = MinMaxScaler()
x_train_scaler, x_test_scaler = (
    scaler.fit_transform(x_train),
    scaler.transform(x_test),
)

## step2: 建立訓練模型

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Grid-search the number of neighbours (5 through 99) with 10-fold
# cross-validation, running up to 4 jobs in parallel.
clf = KNeighborsClassifier()
params = dict(n_neighbors=range(5, 100))
cv = GridSearchCV(estimator=clf, param_grid=params, cv=10, n_jobs=4).fit(
    x_train_scaler, y_train
)

print("best params: ", cv.best_params_)
print("best score: ", cv.best_score_)

best params:  {'n_neighbors': 22}
best score:  0.8193508114856428


## step3: 利用模型預測

In [11]:
# With the default refit=True, GridSearchCV.predict delegates to the refitted
# best estimator.
pre = cv.predict(x_test_scaler)
result_df = pd.DataFrame({"PassengerId": testid, "Survived": pre})
result_df.to_csv(
    "titanic/predict_result_KNN.csv",
    encoding="utf-8",
    index=False,
)
result_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
