# scikit-learnを試す
https://www.codexa.net/scikit-learn-intro/

# 教師あり学習:分類問題

# Irisデータを読み込む
アヤメの種類に関するデータセットで、アヤメの種類の予測を行います。まずは、データを読み込んで中身を確認した後に、訓練データとテストデータに分割しましょう。

In [1]:
import pandas as pd
import sklearn
from sklearn.datasets import load_iris, load_boston

In [4]:
# Load the Iris dataset.
iris = load_iris()
# Store the features in a DataFrame (columns named after iris.feature_names)
# and show the first rows.
iris_features = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_features.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Check the summary statistics of the features.
iris_features.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
# Count the missing values in each feature column.
iris_features.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [8]:
# Store the labels in a Series.
iris_label = pd.Series(iris.target)

# Show the number of samples per label.
iris_label.value_counts()
# Labels 0, 1 and 2 each appear 50 times; each number corresponds to one iris species.

0    50
1    50
2    50
dtype: int64

In [12]:
# Display the label Series itself (it was built from iris.target).
iris_label

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Length: 150, dtype: int64

# 学習データとテストデータに分割

In [13]:
# Split the Iris features and labels into training and test sets.
# test_size=0.5 holds out half of the rows; random_state=0 is passed so the
# split uses a fixed seed.
from sklearn.model_selection import train_test_split
features_train, features_test, label_train, label_test = train_test_split(
    iris_features, iris_label, test_size=0.5, random_state=0)

In [14]:
# Show the shape of every split, one per line, in the order:
# train features, test features, train labels, test labels.
print(
    *(part.shape for part in (features_train, features_test, label_train, label_test)),
    sep="\n",
)

(75, 4)
(75, 4)
(75,)
(75,)


# 分類モデルの実装

In [16]:
from sklearn import svm

# Create a LinearSVC instance.
# NOTE(review): max_iter=3000 is presumably raised so the solver has enough
# iterations to converge on this data — confirm.
Linsvc = svm.LinearSVC(random_state=0, max_iter=3000)

# Fit the LinearSVC on the training split. This is the last expression of the
# cell, so the notebook displays the returned estimator.
Linsvc.fit(features_train,label_train)

LinearSVC(max_iter=3000, random_state=0)

In [17]:
# Predict the iris species for the test split with the fitted LinearSVC.
label_pred_Linsvc = Linsvc.predict(features_test)
print(label_pred_Linsvc)

[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 2 0 0 2 0 0 1 1 0 2 2 0 2 2 1 0
 2 1 1 2 0 2 0 0 1 2 2 1 2 1 2 2 1 2 2 2 2 1 2 2 0 2 1 1 1 1 2 0 0 2 1 0 0
 1]


In [18]:
# For reference: k-nearest neighbours.
from sklearn.neighbors import KNeighborsClassifier

# Build the 5-neighbour classifier and fit it on the training split in one
# expression; fit() hands back the estimator, so Kneighbor is the fitted model.
Kneighbor = KNeighborsClassifier(n_neighbors=5).fit(features_train, label_train)

# Predict the iris species for the test split and print the predictions.
label_pred_KNeighbor = Kneighbor.predict(features_test)
print(label_pred_KNeighbor)

[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2 1 1 2 0 2 0 0 1 2 2 1 2 1 2 1 1 2 2 1 2 1 2 1 0 2 1 1 1 1 2 0 0 2 1 0 0
 1]


In [19]:
# For reference: logistic regression.
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression classifier and fit it on the training split in
# one expression; fit() hands back the estimator, so LogReg is the fitted model.
LogReg = LogisticRegression(random_state=0).fit(features_train, label_train)

# Predict the iris species for the test split and print the predictions.
label_pred_LogReg = LogReg.predict(features_test)
print(label_pred_LogReg)

[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 1 0 0 1 1 0 2 1 0 2 2 1 0
 2 1 1 2 0 2 0 0 1 2 2 1 2 1 2 1 1 2 1 1 2 1 2 1 0 2 1 1 1 1 2 0 0 2 1 0 0
 1]


# 3つの予測モデルを比較する

In [20]:
# Confusion matrix and accuracy of the LinearSVC predictions on the test split.
from sklearn.metrics import confusion_matrix

print(
    'confusion matrix = \n',
    confusion_matrix(label_test, label_pred_Linsvc),
)
print(
    'accuracy = ',
    Linsvc.score(features_test, label_test),
)


confusion matrix = 
 [[21  0  0]
 [ 0 25  5]
 [ 0  1 23]]
accuracy =  0.92


In [22]:
# Confusion matrix and accuracy of the k-nearest-neighbours predictions on the test split.
from sklearn.metrics import confusion_matrix

print(
    'confusion matrix = \n',
    confusion_matrix(label_test, label_pred_KNeighbor),
)
print(
    'accuracy = ',
    Kneighbor.score(features_test, label_test),
)


confusion matrix = 
 [[21  0  0]
 [ 0 29  1]
 [ 0  2 22]]
accuracy =  0.96


In [23]:
# (For reference) confusion matrix and accuracy of the logistic-regression
# predictions on the test split.
from sklearn.metrics import confusion_matrix

print(
    'confusion matrix = \n',
    confusion_matrix(label_test, label_pred_LogReg),
)
print(
    'accuracy = ',
    LogReg.score(features_test, label_test),
)


confusion matrix = 
 [[21  0  0]
 [ 0 29  1]
 [ 0  4 20]]
accuracy =  0.9333333333333333


# 教師あり学習: 回帰

# 回帰とモデル選択の実装（ボストン住宅価格データセット）
ボストンの住宅価格に関するデータセットを用いて、回帰を行い、住宅価格を予測します。さらに、モデル選択についての手法であるクロスバリデーションも適用します。

In [24]:
# Load the Boston housing-price dataset.
# NOTE(review): the output captured for this cell is scikit-learn's warning that
# strongly discourages using this dataset and suggests the California housing
# dataset (fetch_california_housing) or the Ames housing dataset instead.
# Newer scikit-learn releases may no longer provide load_boston — confirm
# against the installed version before re-running.
boston = load_boston()

# Store the features in a DataFrame (columns named after boston.feature_names)
# and show the first rows.
boston_features = pd.DataFrame(data=boston.data, columns=boston.feature_names)
boston_features.head()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [25]:
# Check the summary statistics of the features.
boston_features.describe()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [26]:
# Count the missing values in each feature column.
boston_features.isnull().sum()


CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64

In [27]:
# Store the regression target (boston.target) in a Series.
boston_target = pd.Series(data=boston.target)

# Check the summary statistics of the target.
boston_target.describe()


count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
dtype: float64

In [28]:
# Split the Boston features and target into training and test sets
# (test_size=0.5 holds out half of the rows; random_state=0 fixes the seed).
# NOTE: this rebinds features_train / features_test, which earlier held the
# Iris split, so the classifier cells above must not be re-run after this one
# without re-running the Iris split first.
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(
    boston_features, boston_target, test_size=0.5, random_state=0)


# 回帰のモデル実装
LassoとRidgeRegressionを選んで実装することとします。また、参考に通常の線形回帰モデルでも予測を行います。

In [31]:
# Import the module (same pattern as `from sklearn import svm` / svm.LinearSVC)
# instead of the class: the fitted model below is bound to the name `Lasso`,
# and importing the class under that same name would have it overwritten by
# the instance.
from sklearn import linear_model

# Create the Lasso model with regularization strength alpha=0.1.
# The instance keeps the name `Lasso` because later cells call
# Lasso.score(...) and pass Lasso to cross_val_score.
Lasso = linear_model.Lasso(alpha=0.1, random_state=0)
# Fit the Lasso model on the training split.
Lasso.fit(features_train, target_train)
# Predict house prices for the test split with the fitted Lasso model.
target_pred_Lasso = Lasso.predict(features_test)
print(target_pred_Lasso)


[24.92016544 24.37622341 28.73960915 12.65498587 20.12822066 20.278054
 20.95011001 21.70953097 19.33986144 19.76993978  5.2619872  15.01342281
 18.55060734  5.14910083 40.88235725 32.82536693 22.02311106 36.97949949
 31.39580121 23.37184357 25.00361915 24.96795955 19.91563456 30.413859
 21.81020632 10.75569666 17.47625276 22.31335423 34.73258421 19.92547932
 17.7296372  19.46532614 19.04038153 22.70199974 28.48412881 21.34836541
 10.64171141 25.65179112 16.72496679 13.66075877 25.616441   19.73714296
 22.02373819 14.67513692 23.37737951 24.41194578 18.21764789 26.58215541
 12.38733835 24.26839401 24.83350403 17.88968114 23.78701489 30.59748917
 14.04863284 21.42971232 20.4370754  14.65041284 14.15350119 20.05664239
 18.44037773 21.76402304 32.97351102 31.87895694 17.65296813 32.92082893
 19.72010474 18.21955219 17.78977064 22.91306219 21.3313637  23.19983381
 31.19722097 30.26343896 25.65488982  4.40093688 36.7227301  23.38754183
 26.84605697 19.75879492 28.12397051 19.99664244 19.833

In [33]:

# Import the module (same pattern as `from sklearn import svm` / svm.LinearSVC)
# instead of the class: the fitted model below is bound to the name `Ridge`,
# and importing the class under that same name would have it overwritten by
# the instance.
from sklearn import linear_model

# Create the Ridge model with regularization strength alpha=0.5.
# The instance keeps the name `Ridge` because later cells call
# Ridge.score(...) and pass Ridge to cross_val_score.
Ridge = linear_model.Ridge(alpha=0.5, random_state=0)

# Fit the Ridge model on the training split.
Ridge.fit(features_train, target_train)

# Predict house prices for the test split with the fitted Ridge model.
target_pred_Ridge = Ridge.predict(features_test)
print(target_pred_Ridge)


[24.80360943 24.73539403 30.39780122 12.40553362 20.79803971 19.40388955
 21.18214301 20.99565887 19.02426418 19.99879681  5.36368389 16.21808903
 17.93509271  5.05178706 42.65517908 32.94679998 22.78240002 37.37457445
 31.20174003 23.01044247 24.69486913 25.35061009 20.08583937 30.62107128
 21.81051138 11.07453103 17.03360027 20.36448845 35.19824057 20.31916718
 18.27284982 18.76847174 19.03135313 23.14224138 28.75382621 21.69617143
 10.68363849 26.98612597 17.74210481 14.72099602 26.10056456 19.84916569
 21.91301663 15.41364384 22.76364783 24.53280879 18.95001075 26.00977557
 10.66191027 24.22559263 24.34162346 17.26895079 23.97355545 30.04081602
 13.86711888 21.24051043 20.35188734 14.30903014 14.90674153 21.97960649
 17.58350061 21.40501834 33.26780334 31.69113278 18.02957703 33.33149629
 19.33748019 18.37075237 18.88335454 22.80962814 22.29273219 23.82481841
 31.36615247 29.65358093 26.46411679  4.37177305 36.91933075 23.43689537
 26.88842557 19.28277885 28.38138273 19.80757983 19

In [34]:
# (For reference) ordinary linear regression.
from sklearn.linear_model import LinearRegression

# Build the linear model and fit it on the training split in one expression;
# fit() hands back the estimator, so LinReg is the fitted model.
LinReg = LinearRegression().fit(features_train, target_train)

# Predict house prices for the test split and print the predictions.
target_pred_LinReg = LinReg.predict(features_test)
print(target_pred_LinReg)


[24.70960325 25.12631354 30.76503263 12.41405512 21.29599499 18.9820552
 21.41132996 20.67952823 19.00064129 20.20283837  5.56400169 16.92323348
 17.51322896  5.16858442 42.95215092 32.9806573  23.36140299 37.62826682
 31.09206869 22.89283336 24.57289742 25.81536634 20.26467062 30.79131551
 21.95868456 11.79141163 16.88663056 19.19534708 35.40278965 20.65291048
 18.68119761 18.36807058 19.23453044 23.58815977 29.06218826 21.12413253
 10.68515822 27.15483771 18.39065879 15.3858284  26.49125897 19.99603848
 21.88280039 16.08194711 22.40800324 24.76641999 19.58660105 24.79000679
  9.59564604 24.21884557 23.12098756 16.80153308 24.24031563 29.42927442
 13.851603   21.15932546 20.39800177 14.21419815 15.6911474  22.4954085
 17.15556137 21.23238059 33.42098655 31.5832431  18.3631319  33.63873423
 19.03143924 18.53466863 19.51152135 22.78181836 23.02677155 24.29954349
 31.56802819 29.23584248 27.05755582  4.38763735 36.91142859 23.52551148
 26.99615168 19.06117058 28.61936286 19.70539962 20.2

# 決定係数を求める
3つのモデルで予測ができたところで、モデルの性能を評価するために評価指標を計算します。ここでは、決定係数を用いることとします。なお、決定係数とは、簡単に言えば「実際の値のばらつきのうち予測モデルで説明できた割合」を表す評価指標です。最大値は１で、通常は０〜１の範囲の値をとりますが、テストデータに対しては予測が悪いと負の値になることもあります。値が大きければ大きいほどモデルの性能がいいということになります。

In [36]:
# Evaluate the Lasso regression on the test split (prints the coefficient of determination, R^2).
print("R-squared : ",Lasso.score(features_test,target_test))

R-squared :  0.6537204778207003


In [38]:
# Evaluate the Ridge regression on the test split (prints the coefficient of determination, R^2).
print("R-squared : ",Ridge.score(features_test,target_test))

R-squared :  0.6625759807022442


In [39]:
# (For reference) evaluate the plain linear regression on the test split
# (prints the coefficient of determination, R^2).
print("R-squared : ", LinReg.score(features_test, target_test))


R-squared :  0.6662719929919427


# モデルの評価と選択：クロスバリデーション（交差検証）の実装



In [40]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Define cv: 5 independent random shuffles of the data, each holding out 20%
# of the rows as the test part (random_state=0 fixes the seed).
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# Run cross-validation of the Lasso model with cv.
scores = cross_val_score(Lasso, boston_features, boston_target, cv=cv)

# Show the per-split scores and their mean to two decimals.
# The replacement field uses the ASCII index 0; the full-width space inside the
# label is part of the printed text and is kept as is.
print(scores)
print("R-squared_Average　: {0:.2f}".format(scores.mean()))


[0.55697477 0.65577592 0.59410602 0.78246293 0.78063311]
R-squared_Average　: 0.67


In [41]:
# Run cross-validation of the Ridge model with the cv splitter defined earlier.
scores = cross_val_score(Ridge, boston_features, boston_target, cv=cv)

# Show the per-split scores and their mean to two decimals.
# The replacement field uses the ASCII index 0; the full-width space inside the
# label is part of the printed text and is kept as is.
print(scores)
print("R-squared_Average　: {0:.2f}".format(scores.mean()))


[0.58364982 0.67498977 0.62322026 0.79457322 0.77713899]
R-squared_Average　: 0.69


In [42]:
# Run cross-validation of the plain linear model with the cv splitter defined earlier.
scores = cross_val_score(LinReg, boston_features, boston_target, cv=cv)

# Show the per-split scores and their mean to two decimals.
# The replacement field uses the ASCII index 0; the full-width space inside the
# label is part of the printed text and is kept as is.
print(scores)
print("R-squared_Average　: {0:.2f}".format(scores.mean()))


[0.58922238 0.67790515 0.62474713 0.79455314 0.7751921 ]
R-squared_Average　: 0.69
