# Chapter 15 KNN

### 找出觀察的最近鄰

In [4]:
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
# Load the iris measurements; the class labels are not needed for a pure
# nearest-neighbour lookup.
iris = datasets.load_iris()
features = iris["data"]

# Standardize the features so that every measurement contributes on the same
# scale to the distance computation.
scaler = StandardScaler()
features_standardized = scaler.fit(features).transform(features)

In [5]:
# Index the standardized observations so that the two closest ones to any
# query point can be looked up.
nearest_neighbors = NearestNeighbors(n_neighbors=2)
nearest_neighbors.fit(features_standardized)

# Query point (one value per feature), compared against the standardized data.
new_observation = [1] * 4

In [6]:
# Look up the distances to, and the row indices of, the nearest neighbours.
# kneighbors() takes a batch of observations, hence the enclosing list.
distances, indices = nearest_neighbors.kneighbors(X=[new_observation])

# Display the standardized feature rows of the neighbours that were found.
features_standardized.take(indices, axis=0)

array([[[1.03800476, 0.55861082, 1.10378283, 1.18556721],
        [0.79566902, 0.32841405, 0.76275827, 1.05393502]]])

$$d_{euclidean}=\sqrt{\sum_{i=1}^n(x_i-y_i)^2}$$

$$d_{manhattan}=\sum_{i=1}^n|x_i-y_i|$$

#### 在預設的情況下，NearestNeighbors使用的是閔可夫斯基距離(Minkowski distance)；其超參數$p=1$時即為曼哈頓距離，$p=2$時即為歐幾里德距離

$$d_{minkowski}=(\sum_{i=1}^n|x_i-y_i|^p)^{1/p}$$

In [19]:
# Two-nearest-neighbour index that explicitly uses the Euclidean metric.
nearestneighbors_euclidean = NearestNeighbors(
    metric="euclidean",
    n_neighbors=2,
).fit(features_standardized)

In [20]:
# Show the neighbour distances captured by the earlier kneighbors() call on
# `nearest_neighbors` (the model fitted without an explicit metric); the
# Euclidean model fitted above has not been queried.
distances

array([[0.49140089, 0.74294782]])

In [21]:
# Re-fit the Euclidean index, this time asking for three nearest neighbours.
nearestneighbors_euclidean = NearestNeighbors(
    metric="euclidean",
    n_neighbors=3,
).fit(features_standardized)

In [16]:
# Build the neighbour graph for the training observations themselves: row i
# marks the three nearest neighbours of observation i (itself included).
neighbor_graph = nearestneighbors_euclidean.kneighbors_graph(features_standardized)

# Convert the sparse graph into a dense array for inspection.
nearest_neighbors_with_self = neighbor_graph.toarray()

In [22]:
# Clear the diagonal so that no observation is marked as its own neighbour.
for row_index in range(len(nearest_neighbors_with_self)):
    nearest_neighbors_with_self[row_index, row_index] = 0

# Show the neighbour indicator row of the first observation.
nearest_neighbors_with_self[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

### KNN分類器製作

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
# Load iris and split it into the feature matrix and the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Standardize the features before they are used for distance computations.
scaler = StandardScaler()
X_std = scaler.fit(X).transform(X)

In [25]:
# Two observations to classify, one per row.
new_observation = [[0.75] * 4, [1] * 4]

# Train a KNN classifier that votes among the 5 nearest neighbours
# (n_jobs=-1 lets scikit-learn use all available cores).
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=5).fit(X_std, y)

# Predicted class label for each observation.
knn.predict(new_observation)

array([1, 2])

#### Notes: 在KNN中，給定一個目標類型未知的觀察$x_u$，演算法會依照某個距離指標，先找出k個最接近的觀察(有時稱為$x_u$的鄰點，記為集合$v$)，然後由這k個觀察依據各自所屬的類型投票，選出$x_u$的預測類型。正式地來說，$x_u$為某類型$j$的機率是：
$$\frac{1}{k}\sum_{i \in v}I(y_i=j)$$

In [26]:
# Per-class probabilities for each new observation (one row per observation).
knn.predict_proba(new_observation)

array([[0. , 0.6, 0.4],
       [0. , 0. , 1. ]])

#### 參數說明：
**metric**：設定距離量測指標。<br>
**n_jobs**：設定使用多少電腦運算核心。<br>
**algorithm**：設定用來計算最近鄰點的方法。`KNeighborsClassifier`預設(`algorithm="auto"`)會依資料自動挑選最合適的演算法。

### 找出最近鄰大小(n_neighbors=?)

In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
# Load iris: feature matrix and class labels.
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Standardized copy of the features.
scaler = StandardScaler()
features_standardized = scaler.fit(features).transform(features)

In [31]:
# Chain standardization and the KNN classifier. The step names become the
# prefixes of the grid-search parameter keys, e.g. "knn__n_neighbors".
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=5)
pipe = Pipeline(
    steps=[
        ("scaler", scaler),
        ("knn", knn),
    ]
)

In [34]:
# Candidate neighbourhood sizes k = 1..10 for the "knn" pipeline step.
search_space = [{"knn__n_neighbors": list(range(1, 11))}]

# 5-fold grid search over the candidates.
# Fit on the RAW features: the pipeline's own "scaler" step standardizes the
# data inside every cross-validation fold. Feeding already-standardized data
# made that step learn the statistics of scaled values, so the fitted `clf`
# could not be applied correctly to raw observations. Because the scaler is
# re-fitted on each training fold either way, this should not change the
# cross-validation scores beyond floating-point noise.
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(features, target)

In [37]:
# Result: the neighbourhood size k selected by the grid search.
clf.best_params_["knn__n_neighbors"]

6

#### Notes: 目的在於k值大小會影響到KNN分類器，而在學習當中，我們要試著去權衡bias與variance。例如：k=n，其中n為觀察數，則會有高bias與低variance。若k=1則會有低bias與高variance。

### 半徑型NN分類器製作

In [39]:
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
# Load iris: feature matrix and class labels.
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Standardize the features for the radius-based distance computations.
scaler = StandardScaler()
feature_standardized = scaler.fit(features).transform(features)

In [40]:
# Single observation to classify.
new_observation = [[1] * 4]

# Train a radius-based nearest-neighbour classifier with radius 0.5
# (n_jobs=-1 lets scikit-learn use all available cores).
rnn = RadiusNeighborsClassifier(n_jobs=-1, radius=0.5).fit(feature_standardized, target)

# Predicted class label for the observation.
rnn.predict(new_observation)

array([2])

# Chapter 16 邏輯迴歸

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
iris = datasets.load_iris()

# Keep only the first 100 rows so that just two classes are loaded.
features = iris.data[:100]
target = iris.target[:100]

# Standardize the selected features.
scaler = StandardScaler()
features_standardized = scaler.fit(features).transform(features)

In [44]:
# Fit a logistic regression with the L-BFGS solver and a fixed random_state.
lr = LogisticRegression(random_state=0, solver="lbfgs")
lr.fit(features_standardized, target)

# fit() returns the estimator itself, so `model` and `lr` are the same object.
model = lr

#### Notes: 在邏輯迴歸中，會有一個線性模型(即$\beta_0+\beta_1x$)包含在邏輯函式$\frac{1}{1+e^{-z}}$中，即$P(y_i=1|X)=\frac{1}{1+e^{-(\beta_0+\beta_1x)}}$

In [45]:
# Single observation to classify, one value per feature.
new_observation = [[0.5] * 4]

# Predicted class label for the observation.
model.predict(new_observation)

array([1])

In [48]:
# Predicted probability of each class for the new observation.
model.predict_proba(new_observation)

array([[0.17738424, 0.82261576]])

#### 我們的觀察有18%的機率是類型0，有82%是類型1。

### 訓練多類型分類器

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
# Load the full iris dataset (all three classes).
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Standardize the features.
scaler = StandardScaler()
features_standardized = scaler.fit(features).transform(features)

In [51]:
# One-vs-rest logistic regression; multi_class="ovr" has to be passed
# explicitly to get this behaviour.
lr = LogisticRegression(
    multi_class="ovr",
    random_state=0,
    solver="lbfgs",
)
lr.fit(features_standardized, target)

# fit() returns the estimator itself, so `model` and `lr` are the same object.
model = lr

#### Notes: 
**OVR**：OVR為一對其他的邏輯迴歸，其中會有個別為每一類型訓練的模型，可預測一觀察是不是屬於該類型(變成二元了)。<br>
**MLR**：MLR(多項式邏輯迴歸)的邏輯函式會被替換成歸一化指數函式(softmax function)，對應參數為 `multi_class="multinomial"`<br>
$$P(y_i=k|X)=\frac{e^{\beta_kx_i}}{\sum_{j=1}^{K}e^{\beta_jx_i}}$$

### 以正規化減少變異

#### 調整正規化強度超參數C

In [56]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

In [60]:
# Load the full iris dataset and standardize its features.
iris = datasets.load_iris()
features = iris.data
target = iris.target
scaler = StandardScaler()
feature_standardized = scaler.fit(features).transform(features)

# L2-penalized, one-vs-rest logistic regression that selects the
# regularization strength C by 5-fold cross-validation.
lr = LogisticRegressionCV(
    Cs=10,  # controls the grid of candidate C values that is searched
    cv=5,
    penalty="l2",
    solver="lbfgs",
    multi_class="ovr",
    random_state=0,
    n_jobs=-1,
)
model = lr.fit(feature_standardized, target)

#### Notes: 正規化是一種懲罰複雜模型以降低其變異的方法。明確的說，會有一個懲罰項加進要做最小化的loss function中，通常是L1, L2。

$$L1=\alpha\sum_{j=1}^p|\hat{\beta_j}|$$

$$L2=\alpha\sum_{j=1}^p\hat{\beta_j}^2$$

$\alpha$越大複雜的模型懲罰越大，scikit-learn依循較普遍的做法，即使用C來代替$\alpha$，$C=\frac{1}{\alpha}$

### 超大資料集在分類器的訓練

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
# Load iris: feature matrix and class labels.
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Standardize the features.
scaler = StandardScaler()
features_standardized = scaler.fit(features).transform(features)

#### 這邊使用隨機平均梯度(stochastic average gradient，SAG)

In [65]:
# Same one-vs-rest logistic regression as before; only the solver changes, to
# stochastic average gradient ("sag").
lr = LogisticRegression(solver="sag", random_state=0, multi_class="ovr")

# Train on `features_standardized`, the array built in this section's setup
# cell. The previous `feature_standardized` (no "s") only resolved because it
# was left over from an earlier section, so this cell failed when the section
# was run on its own.
model = lr.fit(features_standardized, target)

### 處理不平衡的分類

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import numpy as np
iris = datasets.load_iris()

# Skip the first 40 observations; the remaining rows are what this section
# uses as its imbalanced dataset.
features = iris.data[40:]
target = iris.target[40:]

# Standardize the remaining features.
scaler = StandardScaler()
features_standardized = scaler.fit(features).transform(features)

In [84]:
# Binarize the labels: class 0 stays 0, every other class becomes 1.
target = np.where(target != 0, 1, 0)

In [87]:
# Logistic regression with class_weight="balanced", so that classes are
# weighted automatically instead of equally.
lr = LogisticRegression(
    class_weight="balanced",
    multi_class="auto",
    random_state=0,
    solver="lbfgs",
)
lr.fit(features_standardized, target)

# fit() returns the estimator itself, so `model` and `lr` are the same object.
model = lr

#### Notes: balanced參數能自動地為類型以與其出現次數成反比的方式加權：$w_j=\frac{n}{kn_j}$