In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# 讀取糖尿病資料集
diabetes = datasets.load_diabetes()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.2, random_state=4)

# 建立一個線性回歸模型
regr = linear_model.LinearRegression()

# 將訓練資料丟進去模型訓練
regr.fit(x_train, y_train)

# 將測試資料丟進模型得到預測結果
y_pred = regr.predict(x_test)

In [3]:
print(regr.coef_)

[  33.40877011 -292.24672884  481.07153405  369.06269614 -966.37849405
  589.81383056  232.61924401  288.3263166   802.72704593   37.81285219]


In [4]:
# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Mean squared error: 2939.42


### LASSO

In [5]:
# 讀取糖尿病資料集
diabetes = datasets.load_diabetes()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.2, random_state=4)

# 建立一個線性回歸模型
lasso = linear_model.Lasso(alpha=1.0)

# 將訓練資料丟進去模型訓練
lasso.fit(x_train, y_train)

# 將測試資料丟進模型得到預測結果
y_pred = lasso.predict(x_test)

In [6]:
# 印出各特徵對應的係數，可以看到許多係數都變成 0，Lasso Regression 的確可以做特徵選取
lasso.coef_

array([  0.        ,  -0.        , 321.203877  ,  57.74744332,
         0.        ,   0.        ,  -0.        ,   0.        ,
       332.41817196,   0.        ])

In [7]:
# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Mean squared error: 3505.84


### Ridge

In [8]:
# 讀取糖尿病資料集
diabetes = datasets.load_diabetes()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.2, random_state=4)

# 建立一個線性回歸模型
ridge = linear_model.Ridge(alpha=1.0)

# 將訓練資料丟進去模型訓練
ridge.fit(x_train, y_train)

# 將測試資料丟進模型得到預測結果
y_pred = regr.predict(x_test)

In [9]:
# 印出 Ridge 的參數，可以很明顯看到比起 Linear Regression，參數的數值都明顯小了許多
print(ridge.coef_)

[  48.8125786   -85.49511577  270.22532535  201.91767903   17.41308665
  -19.04346706 -136.47737574  122.26503311  247.60074795   95.59855598]


In [10]:
# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Mean squared error: 2939.42


可以看見 LASSO 與 Ridge 的結果並沒有比原本的線性回歸來得好，
這是因為目標函數被加上了正規化函數，讓模型不能過於複雜，相當於限制模型擬和資料的能力。因此若沒有發現 Over-fitting 的情況，是可以不需要一開始就加上太強的正規化的。

## [作業重點]
使用 Sklearn 中的 Lasso, Ridge 模型，來訓練各種資料集，務必了解送進去模型訓練的**資料型態**為何，也請了解模型中各項參數的意義。

機器學習的模型非常多種，但要訓練的資料多半有固定的格式，確保你了解訓練資料的格式為何，這樣在應用新模型時，就能夠最快的上手開始訓練！

## 練習時間
請使用其他資料集 (boston, wine)，並調整不同的 alpha 來觀察模型訓練的情形。

### Boston House-Prices Dataset (Regression)

In [11]:
boston = datasets.load_boston()
print("Shape: ", boston.data.shape)
print("Features: ", boston.feature_names)

Shape:  (506, 13)
Features:  ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [12]:
x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.2, random_state=4)

linear = linear_model.LinearRegression()

linear.fit(x_train, y_train)

y_pred = linear.predict(x_test)

In [13]:
linear.coef_

array([-1.15966452e-01,  4.71249231e-02,  8.25980146e-03,  3.23404531e+00,
       -1.66865890e+01,  3.88410651e+00, -1.08974442e-02, -1.54129540e+00,
        2.93208309e-01, -1.34059383e-02, -9.06296429e-01,  8.80823439e-03,
       -4.57723846e-01])

In [14]:
# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Mean squared error: 25.42


### Lasso

In [15]:
lasso = linear_model.Lasso(alpha=1.0)

lasso.fit(x_train, y_train)

y_pred = lasso.predict(x_test)

In [16]:
# 印出各特徵對應的係數，可以看到許多係數都變成 0，Lasso Regression 的確可以做特徵選取
lasso.coef_

array([-0.06494981,  0.04581458, -0.        ,  0.        , -0.        ,
        1.18140024,  0.01109101, -0.73695809,  0.23350042, -0.01551065,
       -0.69270805,  0.00763157, -0.6927848 ])

In [17]:
# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Mean squared error: 28.95


#### Lasso Tune Hyperparameters (Regularization Strength alpha)

In [18]:
lasso_MSE = {}
gap = np.linspace(0, 10, 101) #0, 0.1, 0.2, ..., 10

for x in gap:
    lasso = linear_model.Lasso(alpha=x)

    lasso.fit(x_train, y_train)

    y_pred = lasso.predict(x_test)
    
    lasso_MSE[x] = round(mean_squared_error(y_test, y_pred), 2)

  import sys
  positive)
  positive)


In [19]:
import pandas as pd

lasso_MSE = sorted(lasso_MSE.items(), key=lambda kv: kv[1])
sheet = pd.DataFrame(lasso_MSE)
sheet = sheet.rename(columns={0:'alpha', 1:'MSE'})
sheet

Unnamed: 0,alpha,MSE
0,0.0,25.42
1,0.1,26.45
2,0.2,26.60
3,0.3,26.65
4,0.4,26.76
5,0.5,26.94
6,0.6,27.22
7,0.7,27.59
8,0.8,27.98
9,0.9,28.43


### Ridge

In [20]:
ridge = linear_model.Ridge(alpha=1.0)

ridge.fit(x_train, y_train)

y_pred = ridge.predict(x_test)

In [21]:
# 印出 Ridge 的參數，可以很明顯看到比起 Linear Regression，參數的數值都明顯小了許多
ridge.coef_

array([-1.12499445e-01,  4.79562332e-02, -2.40438147e-02,  2.96199458e+00,
       -9.33966118e+00,  3.93079015e+00, -1.73821202e-02, -1.43347691e+00,
        2.75239392e-01, -1.38920708e-02, -8.31116943e-01,  9.15637729e-03,
       -4.66460539e-01])

In [22]:
# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Mean squared error: 25.74


#### Ridge Tune Hyperparameters (Regularization Strength alpha)

In [23]:
ridge_MSE = {}

for x in gap:
    ridge = linear_model.Lasso(alpha=x)

    ridge.fit(x_train, y_train)

    y_pred = ridge.predict(x_test)
    
    ridge_MSE[x] = round(mean_squared_error(y_test, y_pred), 2)

  
  positive)
  positive)


In [24]:
ridge_MSE = sorted(ridge_MSE.items(), key=lambda kv: kv[1])
sheet = pd.DataFrame(ridge_MSE)
sheet = sheet.rename(columns={0:'alpha', 1:'MSE'})
sheet

Unnamed: 0,alpha,MSE
0,0.0,25.42
1,0.1,26.45
2,0.2,26.60
3,0.3,26.65
4,0.4,26.76
5,0.5,26.94
6,0.6,27.22
7,0.7,27.59
8,0.8,27.98
9,0.9,28.43


### 結論<br>
- Linear Regression MSE: 25.42
- Lasso MSE: 28.95
- Ridge MSE: 25.74
- 在Boston Dataset Regularization 強度愈高 MSE 愈高

### Wine Dataset (Classification)

In [25]:
wine = datasets.load_wine()
print("Class: ", wine.target_names)
print("Shape: ", wine.data.shape)
print("Features: ", wine.feature_names)

Class:  ['class_0' 'class_1' 'class_2']
Shape:  (178, 13)
Features:  ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


### Logistic Regression + L1 Regularization C=1.0
- penalty : str, ‘l1’, ‘l2’, ‘elasticnet’ or ‘none’, optional (default=’l2’)
- C : float, optional (default=1.0) &emsp; Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.

In [26]:
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.2, random_state=4)

LR_L1 = linear_model.LogisticRegression(penalty='l1', C=1.0)

LR_L1.fit(x_train, y_train)

y_pred = LR_L1.predict(x_test)



In [27]:
LR_L1.coef_

array([[-5.63508710e-01,  6.27499902e-01,  8.56613119e-01,
        -5.53220856e-01, -4.32785071e-02,  0.00000000e+00,
         1.21303292e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  7.03594989e-01,
         1.77949145e-02],
       [ 1.07295863e+00, -1.11065398e+00,  0.00000000e+00,
         2.08255236e-01,  7.78607045e-03,  0.00000000e+00,
         4.62976388e-01,  0.00000000e+00,  5.20817090e-01,
        -2.03981772e+00,  7.20650532e-01,  0.00000000e+00,
        -1.46553809e-02],
       [-1.19823305e-01,  2.76294242e-01,  0.00000000e+00,
         9.23617277e-02,  7.44891121e-03,  0.00000000e+00,
        -2.89998760e+00,  0.00000000e+00,  0.00000000e+00,
         9.72307059e-01,  0.00000000e+00, -1.52886011e+00,
        -1.09659903e-04]])

In [28]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.9722222222222222


#### Logistic Regression + L1 Tune Hyperparameters (Regularization Strength alpha)

In [29]:
lasso_auc = {}
gap = [0.001, 0.01, 0.1, 1, 10, 100]

for x in gap:
    LR_L1 = linear_model.LogisticRegression(penalty='l1', C=x)
    
    LR_L1.fit(x_train, y_train)
    
    y_pred = LR_L1.predict(x_test)
    
    lasso_auc[x] = round(accuracy_score(y_test, y_pred), 2)



In [30]:
lasso_auc = sorted(lasso_auc.items(), key=lambda kv: kv[1])
sheet = pd.DataFrame(lasso_auc)
sheet = sheet.rename(columns={0:'Reverse_alpha', 1:'AUC'})
sheet

Unnamed: 0,Reverse_alpha,AUC
0,0.001,0.56
1,0.01,0.58
2,0.1,0.92
3,1.0,0.97
4,10.0,0.97
5,100.0,1.0


### Logistic Regression + L2 Regularization C=1.0 Default Setting

In [31]:
ridge = linear_model.LogisticRegression(penalty='l2', C=1.0) #default setting

ridge.fit(x_train, y_train)

y_pred = ridge.predict(x_test)



In [32]:
ridge.coef_

array([[-6.82864779e-01,  7.19709566e-01,  9.78123238e-01,
        -5.71326897e-01, -3.15688084e-02,  3.00522775e-01,
         1.11716506e+00, -3.43549778e-02, -4.90150215e-01,
        -1.05374113e-02, -1.54185796e-01,  9.61331414e-01,
         1.81479366e-02],
       [ 9.32405991e-01, -1.02836307e+00, -7.03687526e-01,
         2.35034368e-01,  8.51406104e-03,  7.62359762e-02,
         4.71638459e-01,  5.60638803e-01,  6.15085511e-01,
        -1.81947987e+00,  9.33098198e-01,  7.36442197e-02,
        -1.40242413e-02],
       [-4.72180741e-01,  6.31034394e-01, -6.36847579e-02,
         1.56380289e-01,  3.13408128e-02, -7.52374558e-01,
        -1.62587954e+00, -1.31786834e-01, -7.01391158e-01,
         1.03384290e+00, -4.87953685e-01, -1.15357424e+00,
         1.40302540e-04]])

In [33]:
acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.9722222222222222


#### Logistic Regression + L2 Tune Hyperparameters (Regularization Strength alpha)

In [34]:
ridge_auc = {}
gap = [0.001, 0.01, 0.1, 1, 10, 100]

for x in gap:
    LR_L2 = linear_model.LogisticRegression(penalty='l2', C=x)
    
    LR_L2.fit(x_train, y_train)
    
    y_pred = LR_L2.predict(x_test)
    
    ridge_auc[x] = round(accuracy_score(y_test, y_pred), 2)



In [35]:
ridge_auc = sorted(ridge_auc.items(), key=lambda kv: kv[1])
sheet = pd.DataFrame(ridge_auc)
sheet = sheet.rename(columns={0:'Reverse_alpha', 1:'AUC'})
sheet

Unnamed: 0,Reverse_alpha,AUC
0,0.001,0.67
1,0.01,0.86
2,0.1,0.94
3,1.0,0.97
4,10.0,0.97
5,100.0,0.97


### 結論<br>
- Default Logistic Regression + L2 + C=1.0 &emsp; AUC: 97.2%
- Logistic Regression + L1 + C=1.0 &emsp; AUC: 97.2%
- C愈大 Regularization的強度愈低 (不同於Lasso & Ridge的alpha)
- 在Wine Dataset Regularization強度愈高 AUC愈低