* [1-2. 遺漏值](#sec2)
    * [1-2-1. 偵測與刪除遺漏值](#sec2)
    * [1-2-2. 填補遺漏值](#sec2_2)
* [1-3. 切割數據集](#sec3) 
    * [1-3-1. 訓練、驗證與測試集](#sec3)
    * [1-3-2. k次交叉驗證](#sec3_2)
***

<a id='sec2'></a>
## 1-2. 遺漏值
#### 1-2-1. 偵測與刪除遺漏值

In [1]:
import pandas as pd
import numpy as np

# Read the example CSV into a DataFrame, then print the per-column summary
# (non-null counts and dtypes) to see which columns contain missing values.
df = pd.read_csv(filepath_or_buffer='ex1.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Number      168 non-null    int64  
 1   Name        168 non-null    object 
 2   Type1       168 non-null    object 
 3   Type2       79 non-null     object 
 4   HP          166 non-null    float64
 5   Attack      168 non-null    int64  
 6   Defense     168 non-null    int64  
 7   SpecialAtk  168 non-null    int64  
 8   SpecialDef  168 non-null    int64  
 9   Speed       168 non-null    int64  
 10  Generation  168 non-null    int64  
 11  Legendary   168 non-null    bool   
dtypes: bool(1), float64(1), int64(7), object(3)
memory usage: 14.7+ KB


In [2]:
# Show the rows whose HP value is missing.
df.loc[df['HP'].isna()]

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
166,153,月桂葉,Grass,,,62,80,63,80,60,2,False
167,166,安瓢蟲,Bug,Flying,,35,50,55,110,85,2,False


In [3]:
# Blank out Type2 on row 167 so that this row, like row 166, is missing both
# Type2 and HP; then list the rows with missing HP again to confirm the change.
df.at[167, 'Type2'] = np.nan
df.loc[df['HP'].isna()]

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
166,153,月桂葉,Grass,,,62,80,63,80,60,2,False
167,166,安瓢蟲,Bug,,,35,50,55,110,85,2,False


In [4]:
# Drop a feature (column): remove Type2 and keep every row.
print(df.shape)
df_drop = df.drop(columns='Type2')
df_drop.head(n=2)

(168, 12)


Unnamed: 0,Number,Name,Type1,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
0,1,妙蛙種子,Grass,45.0,49,49,65,65,45,1,False
1,2,妙蛙草,Grass,60.0,62,63,80,80,60,1,False


In [5]:
# Drop samples (rows): discard every row that contains at least one NaN.
df_drop = df.dropna(axis=0, how='any')
# Alternative: df_drop = df.dropna(thresh=11)  # drop rows with fewer than 11 non-missing values
df_drop.shape

(78, 12)

In [6]:
# Alternative to dropping: select only the rows that satisfy a condition
# (here, rows whose Type2 is present).
has_type2 = df['Type2'].notna()
df_type2 = df.loc[has_type2]
df_type2.shape

(78, 12)

In [7]:
# The original frame is untouched by the drops above: the rows with missing HP
# are still present in `df`.
df.loc[df['HP'].isna()]

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
166,153,月桂葉,Grass,,,62,80,63,80,60,2,False
167,166,安瓢蟲,Bug,,,35,50,55,110,85,2,False


<a id='sec2_2'></a>
#### 1-2-2. 填補遺漏值

In [8]:
# Fill every missing value with the constant 0 and show the last two rows.
df.fillna(value=0).tail(n=2)

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
166,153,月桂葉,Grass,0,0.0,62,80,63,80,60,2,False
167,166,安瓢蟲,Bug,0,0.0,35,50,55,110,85,2,False


In [9]:
# Forward fill: each missing value takes the last valid value above it in the
# same column. `DataFrame.ffill()` replaces `fillna(method='ffill')`, whose
# `method` argument is deprecated in recent pandas releases.
df.ffill().tail(3)

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
165,151,夢幻,Psychic,Fighting,100.0,100,100,100,100,100,1,False
166,153,月桂葉,Grass,Fighting,100.0,62,80,63,80,60,2,False
167,166,安瓢蟲,Bug,Fighting,100.0,35,50,55,110,85,2,False


In [10]:
from sklearn.impute import SimpleImputer

# Impute with the mode (most frequent value); the advantage is that this also
# works for non-numeric columns.
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
data = imp.fit_transform(df)  # the imputer returns an array, not a DataFrame
df_imp = pd.DataFrame(data=data, columns=df.columns)
df_imp.tail(n=3)

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
165,151,夢幻,Psychic,Flying,100,100,100,100,100,100,1,False
166,153,月桂葉,Grass,Flying,65,62,80,63,80,60,2,False
167,166,安瓢蟲,Bug,Flying,65,35,50,55,110,85,2,False


In [11]:
from sklearn.impute import KNNImputer

# KNN imputation: fill each missing value from the 5 nearest samples.
imp = KNNImputer(n_neighbors=5)
# Only the stat columns HP through Speed are passed to the imputer.
data = imp.fit_transform(df.loc[:, 'HP':'Speed'])
# Work on a copy. A plain `df_imp = df` only binds a second name to the same
# object, so the assignment below would also overwrite the raw `df` and change
# what the earlier missing-value cells show when they are re-run.
df_imp = df.copy()
df_imp.loc[:, 'HP':'Speed'] = data
df_imp.tail(2)

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
166,153,月桂葉,Grass,,68.0,62.0,80.0,63.0,80.0,60.0,2,False
167,166,安瓢蟲,Bug,,61.0,35.0,50.0,55.0,110.0,85.0,2,False


<a id='sec3'></a>
## 1-3. 切割數據集
#### 1-3-1. 訓練、驗證與測試集

In [12]:
from sklearn.model_selection import train_test_split

# Features are the stat columns HP..Speed; the target class is Type1.
y = df_imp['Type1']
X = df_imp.loc[:, 'HP':'Speed']
# Split the dataset, stratified on the target:
#   X_train, y_train -> training set and its class labels (proportion 0.8)
#   X_test,  y_test  -> test set and its class labels (proportion 0.2)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Compare the class counts of the full data, the training set and the test set.
df_count = pd.concat(
    [y.value_counts(), y_train.value_counts(), y_test.value_counts()],
    axis=1,
    keys=['y', 'y_train', 'y_test'],
)
df_count.head()

Unnamed: 0,y,y_train,y_test
Water,31,25,6.0
Normal,24,19,5.0
Bug,15,12,3.0
Fire,14,11,3.0
Poison,14,11,3.0


<a id='sec3_2'></a>
#### 1-3-2. k次交叉驗證

In [33]:
# Same features as before, but the target is now the Legendary flag
# (predict whether a sample is legendary).
y = df_imp['Legendary']
X = df_imp.loc[:, 'HP':'Speed']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
# Stratified k-fold: build the splitter first, then obtain the generator of
# (train indices, validation indices) pairs over the training set.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
kfold = splitter.split(X_train, y_train)
score_lst = []  # accuracy of each cross-validation fold

for k, (i_train, i_valid) in enumerate(kfold):
    # Positional indices from the splitter pick this fold's fit/validation rows.
    X_fit, y_fit = X_train.iloc[i_train, :], y_train.iloc[i_train]
    X_val, y_val = X_train.iloc[i_valid, :], y_train.iloc[i_valid]
    # A fresh kNN classifier is initialised for every fold.
    knn = KNeighborsClassifier(n_neighbors=2)
    knn.fit(X_fit, y_fit)
    # Evaluate accuracy on the validation part of the fold.
    score = knn.score(X_val, y_val)
    score_lst.append(score)
    print('%2d-Fold: Acc=%.3f' % (k+1, score))

print('\n10-fold CV accuracy = %.3f, std = %.3f' % 
      (np.mean(score_lst), np.std(score_lst)))

 1-Fold: Acc=0.929
 2-Fold: Acc=0.929
 3-Fold: Acc=0.929
 4-Fold: Acc=0.929
 5-Fold: Acc=1.000
 6-Fold: Acc=1.000
 7-Fold: Acc=1.000
 8-Fold: Acc=1.000
 9-Fold: Acc=0.923
10-Fold: Acc=0.923

10-fold CV accuracy = 0.956, std = 0.036




In [45]:
from sklearn.model_selection import cross_val_score

# Same 10-fold evaluation in a single call, using all CPU cores (n_jobs=-1).
# NOTE(review): an integer `cv` presumably selects unshuffled stratified folds,
# which would explain why the per-fold scores differ from the manual loop that
# shuffles with random_state=42 -- confirm against the scikit-learn docs.
knn = KNeighborsClassifier(n_neighbors=2)
score_lst = cross_val_score(knn, X_train, y_train, cv=10, n_jobs=-1)
print('10-fold CV accuracy scores\n', score_lst)
mean_acc, std_acc = np.mean(score_lst), np.std(score_lst)
print('\n10-fold CV accuracy = %.3f, std = %.3f' % (mean_acc, std_acc))

10-fold CV accuracy scores
 [0.92857143 0.92857143 0.92857143 0.92857143 1.         1.
 1.         1.         1.         0.84615385]

10-fold CV accuracy = 0.956, std = 0.050


