## 1-4. 異常值

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Load the raw table and keep it under its own name so the source dtypes
# remain available after imputation.
df_raw = pd.read_csv('ex1.csv')

# Replace every missing value with the most frequent value of its column.
imp = SimpleImputer(missing_values=np.nan,
                    strategy='most_frequent')
data = imp.fit_transform(df_raw)

# fit_transform returns a plain ndarray, so a frame rebuilt from it no longer
# carries the per-column dtypes of the CSV. Restore them explicitly so the
# numeric columns are numeric again for the later cells.
df = pd.DataFrame(data, columns=df_raw.columns).astype(df_raw.dtypes.to_dict())
df.head(3)

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
0,1,妙蛙種子,Grass,Poison,45.0,49,49,65,65,45,1,False
1,2,妙蛙草,Grass,Poison,60.0,62,63,80,80,60,1,False
2,3,妙蛙花,Grass,Poison,80.0,82,83,100,100,80,1,False


In [2]:
from sklearn.covariance import EllipticEnvelope

# Feature block used for detection: columns HP through Speed, cast explicitly
# to float so the estimator never depends on implicit dtype conversion.
X = df.loc[:, 'HP':'Speed'].astype(float)
# Build the detector. random_state is pinned because the robust covariance
# fit is randomised; without it the flagged rows can differ between runs.
outlier_detect = EllipticEnvelope(contamination=0.01, random_state=42)
# Fit and label every row in one step.
result = outlier_detect.fit_predict(X)
# Positional indices of the rows labelled as outliers (marked -1).
idx = np.flatnonzero(result == -1)
df.iloc[idx]

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
98,91,刺甲貝,Water,Ice,50.0,95,180,85,45,70,1,False
121,113,吉利蛋,Normal,Flying,250.0,5,5,35,105,50,1,False


In [3]:
def outlier_idx(x, k=1.5):
    """Locate outliers with the interquartile-range (IQR) rule.

    A value is an outlier when it lies below ``q1 - k*IQR`` or above
    ``q3 + k*IQR``, where q1/q3 are the 25th/75th percentiles.

    Parameters
    ----------
    x : array-like
        Values to test (list, ndarray, pandas Series, ...). They are coerced
        to float, so object-dtype numeric data is accepted.
    k : float, default 1.5
        Fence multiplier applied to the IQR.

    Returns
    -------
    tuple of ndarray
        The result of ``np.where`` on the outlier mask; element ``[0]`` holds
        the positional indices for one-dimensional input.
    """
    values = np.asarray(x, dtype=float)

    # Quartiles are computed on the non-NaN values only: a single NaN would
    # otherwise turn both fences into NaN and hide every outlier.
    finite = values[~np.isnan(values)]
    if finite.size == 0:
        # Nothing to compare against (empty or all-NaN input): no outliers.
        return np.where(np.zeros(values.shape, dtype=bool))

    q1, q3 = np.percentile(finite, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    # NaN entries compare False on both sides, so they are never reported.
    return np.where((values < lower_bound) | (values > upper_bound))

# Rows whose HP value falls outside the IQR fences.
(idx,) = outlier_idx(df['HP'])
df.iloc[idx]

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
45,40,胖可丁,Normal,Fairy,140.0,70,45,85,50,45,1,False
121,113,吉利蛋,Normal,Flying,250.0,5,5,35,105,50,1,False
142,131,拉普拉斯,Water,Ice,130.0,85,80,85,95,60,1,False
145,134,水伊布,Water,Flying,130.0,65,60,110,95,65,1,False
155,143,卡比獸,Normal,Flying,160.0,110,65,65,110,30,1,False


In [4]:
from sklearn.ensemble import IsolationForest

# Feature block used for detection: columns HP through Speed, as floats.
X = df.loc[:, 'HP':'Speed'].astype(float)
# Every tree is built from all rows (max_samples = number of rows).
# random_state is pinned because the forest is randomised; without it the
# flagged rows can change from one run to the next.
clf = IsolationForest(max_samples=df.shape[0],
                      contamination=0.01,
                      random_state=42)
clf.fit(X)
y_pred_train = clf.predict(X)
# Positional indices of the rows labelled as outliers (marked -1).
idx = np.flatnonzero(y_pred_train == -1)
df.iloc[idx]

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
121,113,吉利蛋,Normal,Flying,250.0,5,5,35,105,50,1,False
164,150,超夢MegaY,Psychic,Flying,106.0,150,70,194,120,140,1,True


In [5]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor over the 20 nearest neighbours of each row.
clf = LocalOutlierFactor(contamination=0.01, n_neighbors=20)
y_pred_train = clf.fit_predict(X)
# Positional indices of the rows labelled as outliers (marked -1).
idx = np.flatnonzero(y_pred_train == -1)
df.iloc[idx]

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
121,113,吉利蛋,Normal,Flying,250.0,5,5,35,105,50,1,False
164,150,超夢MegaY,Psychic,Flying,106.0,150,70,194,120,140,1,True


In [6]:
# Suppose two rows were flagged as suspected outliers (positional row indices).
idx = [121, 164]
# Translate the positions into index labels and drop those rows. Going through
# df.index keeps this correct even when the index is not the default 0..n-1
# range, and avoids a per-row membership scan.
df_ok = df.drop(index=df.index[idx])
df_ok.shape

(166, 12)

In [7]:
# Flag column: 1 for the suspected outlier rows, 0 for all other rows.
df['Outlier'] = df.index.isin(idx).astype('int64')
# Show the first row next to the flagged rows for comparison.
df.iloc[[0] + idx]

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary,Outlier
0,1,妙蛙種子,Grass,Poison,45.0,49,49,65,65,45,1,False,0
121,113,吉利蛋,Normal,Flying,250.0,5,5,35,105,50,1,False,1
164,150,超夢MegaY,Psychic,Flying,106.0,150,70,194,120,140,1,True,1


In [8]:
# Assume the two outliers come from the HP feature: add its natural log as a
# new column. The float cast lets the vectorised np.log work even when HP is
# stored with object dtype.
df['Log_HP'] = np.log(df['HP'].astype(float))
# Pick the columns by name instead of by position so the view does not depend
# on the exact column order of the frame.
df.iloc[[0] + idx][['Number', 'Name', 'Type1', 'Type2', 'HP', 'Outlier', 'Log_HP']]

Unnamed: 0,Number,Name,Type1,Type2,HP,Outlier,Log_HP
0,1,妙蛙種子,Grass,Poison,45.0,0,3.806662
121,113,吉利蛋,Normal,Flying,250.0,1,5.521461
164,150,超夢MegaY,Psychic,Flying,106.0,1,4.663439


In [10]:
# Display df_ok, the frame built earlier without the two suspected outlier rows.
df_ok

Unnamed: 0,Number,Name,Type1,Type2,HP,Attack,Defense,SpecialAtk,SpecialDef,Speed,Generation,Legendary
0,1,妙蛙種子,Grass,Poison,45.0,49,49,65,65,45,1,False
1,2,妙蛙草,Grass,Poison,60.0,62,63,80,80,60,1,False
2,3,妙蛙花,Grass,Poison,80.0,82,83,100,100,80,1,False
3,3,妙蛙花Mega,Grass,Poison,80.0,100,123,122,120,80,1,False
4,4,小火龍,Fire,Flying,39.0,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...
162,150,超夢,Psychic,Flying,106.0,110,90,154,90,130,1,True
163,150,超夢MegaX,Psychic,Fighting,106.0,190,100,154,100,130,1,True
165,151,夢幻,Psychic,Flying,100.0,100,100,100,100,100,1,False
166,153,月桂葉,Grass,Flying,65.0,62,80,63,80,60,2,False


In [11]:
from sklearn import svm

# The smaller nu is, the larger the decision region becomes.
clf = svm.OneClassSVM(nu=0.01, kernel="rbf")
# Train on the outlier-free rows only, with the features cast to float.
clf.fit(df_ok.loc[:, 'HP':'Speed'].astype(float))

# Three new observations. Numeric fields are real numbers and Legendary is a
# real boolean, so nothing relies on strings being coerced implicitly.
new = [[152, '小鋸鱷', 'Water', '', 50, 65, 64,
        44, 48, 43, 2, False],
       [242, '幸福蛋', 'Normal', '', 255, 10, 10,
        75, 135, 55, 2, False],
       [250, '鳳王', 'Fire', 'Flying', 106, 130,
        90, 110, 154, 90, 2, True]]
df_new = pd.DataFrame(new, columns=df_ok.columns)

# Returns 1 for a normal observation and -1 for an anomaly.
clf.predict(df_new.loc[:, 'HP':'Speed'].astype(float))

array([ 1, -1, -1], dtype=int64)

In [12]:
# Local Outlier Factor in novelty mode: fit on the outlier-free rows, then
# score observations that were not part of the training data.
# NOTE(review): contamination is 0.1 here but 0.01 in the earlier detectors;
# confirm the difference is intentional.
clf = LocalOutlierFactor(n_neighbors=20,
                         novelty=True,
                         contamination=0.1)
# Cast both feature blocks to float explicitly so the call does not depend on
# how df_ok / df_new happen to store their values.
clf.fit(df_ok.loc[:, 'HP':'Speed'].astype(float))
clf.predict(df_new.loc[:, 'HP':'Speed'].astype(float))

array([ 1, -1, -1])