### 機能要件
- 2値の分類（Classification）タスクを扱える◯
- カテゴリカル変数を指定するとone-hotエンコードを実行する ◯
- モデル用データマートに施したのと同一データ前処理をスコア用データマートに対しても適用される◯
- モデル選択の評価指標を選択できる◯
- 複数アルゴリズムから指定の評価指標に従いベストモデルを選択できる◯
- 学習済みモデルを保存できる◯
- アルゴリズムランキングと性能評価指標が出力される◯
- 学習済みモデル（保存したモデル）を呼び出しスコア用データに対し予測確率を付与できる◯

## データの読み込み

- ### スコア用データが組み込まれたcsvファイル

In [1]:
# csvには読み込むcsvファイルのパスを指定して下さい
def read_data_contain_score(csv='./data/av_loan_u6lujuX_CVtuZ9i.csv'):
    import pandas as pd
    # csvファイルから読み込み
    df = pd.read_csv(csv, header=0)
    return df
    
df = read_data_contain_score()

- ###  学習用データとスコア用データで分割して渡される場合

In [2]:
# csvには読み込むcsvファイルのパスを指定して下さい
def read_data_not_contain_score(csv='./data/av_loan_u6lujuX_CVtuZ9i.csv', csv_test='./data/av_loan_test_Y3wMUE5_7gLdaTN.csv'):
    import pandas as pd
    
    # csvファイルから読み込み
    df = pd.read_csv(csv, header=0)
    # テストデータを型を訓練データに統一して読み込み
    df_score = pd.read_csv(csv_test, header=0, dtype=dict(df.dtypes))
    
    return df, df_score
    
df, df_score = read_data_not_contain_score()

- ### データの確認

In [3]:
def verify_data(df, df_score):
    import pandas as pd
    print('学習用データ')
    display(df.head())
    print('スコア用データ')
    display(df_score.head())
    # 欠損値データの合計数を確認
    display(pd.concat([df.isnull().sum(), df_score.isnull().sum()], axis=1, keys=['学習用データ','スコア用データ'], sort=False))
    # データ型確認
    print(f'shapeの確認:{df.shape},{df_score.shape}')
    print('データ型確認:')
    # concatで横に結合
    display(pd.concat([df.dtypes, df_score.dtypes], axis=1, keys=['学習用データ','スコア用データ'], sort=False))
    return df, df_score
    
df, df_score = verify_data(df, df_score)

学習用データ


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


スコア用データ


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546.0,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0.0,78.0,360.0,1.0,Urban


Unnamed: 0,学習用データ,スコア用データ
Loan_ID,0,0.0
Gender,13,11.0
Married,3,0.0
Dependents,15,15.0
Education,0,0.0
Self_Employed,32,20.0
ApplicantIncome,0,0.0
CoapplicantIncome,0,0.0
LoanAmount,22,5.0
Loan_Amount_Term,14,5.0


shapeの確認:(614, 13),(333, 12)
データ型確認:


Unnamed: 0,学習用データ,スコア用データ
Loan_ID,object,object
Gender,object,object
Married,object,object
Dependents,object,object
Education,object,object
Self_Employed,object,object
ApplicantIncome,int64,int64
CoapplicantIncome,float64,float64
LoanAmount,float64,float64
Loan_Amount_Term,float64,float64


- ### 不要なデータを削除

In [4]:
df = df.drop('Loan_ID', axis=1)
df_score = df_score.drop('Loan_ID', axis=1)

- ### 特徴量Xと正解データyに分割

In [5]:
import pandas as pd
def split_into_Xy(df, y_column='Loan_Status'):
    
    import pandas as pd
    # y_column以外を特徴量Xとする
    X = df.drop(y_column, axis=1)
    display(X.head())
    # y_columnを正解データyとする
    y = df.drop(X.columns, axis=1)
    display(y.head())
    return X, y

X, y = split_into_Xy(df, 'Loan_Status')

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


Unnamed: 0,Loan_Status
0,Y
1,N
2,Y
3,Y
4,Y


## 前処理

- ### 正解カラムを数値化

In [6]:
def binarization(y, column='Loan_Status', zero='Y', one='N'):
    
    import pandas as pd
    # dictで変換法則を指定
    class_mapping = {zero:0, one:1}
    # map関数はSeriesに対して効果があるためilocでSeries化して適応する
    y[column] = y[column].map(class_mapping)
    display(y.head())
    return y

y = binarization(y,'Loan_Status', 'Y', 'N')

Unnamed: 0,Loan_Status
0,0
1,1
2,0
3,0
4,0


- ### one-hotエンコーディング

In [7]:
# ダミー化の自動化
def dummy(X, column_dtypes='object'):
    
    columns = X.select_dtypes(include=[column_dtypes]).columns
    print(columns)
    
    # カテゴリカルデータへ変換
    for column in columns:
        # nanを除いた重複のないデータを抽出している
        print(column),
        print(X[column].value_counts().index.tolist())
        # カテゴリカルデータ化
        X[column] = pd.Categorical(X[column], categories=X[column].value_counts().index.tolist(), ordered=False)
        
    display(X.dtypes)
    
    # get_dummies()関数を利用してダミー化,dummy_na=Trueの引数を与える。
    X = pd.get_dummies(X, columns=columns, dummy_na=True, drop_first=True)
    display(X.head())
    display(X.shape)
    display(X.dtypes)
    
    return X

X = dummy(X, 'object')
X_score = dummy(df_score, 'object')

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')
Gender
['Male', 'Female']
Married
['Yes', 'No']
Dependents
['0', '1', '2', '3+']
Education
['Graduate', 'Not Graduate']
Self_Employed
['No', 'Yes']
Property_Area
['Semiurban', 'Urban', 'Rural']


Gender               category
Married              category
Dependents           category
Education            category
Self_Employed        category
ApplicantIncome         int64
CoapplicantIncome     float64
LoanAmount            float64
Loan_Amount_Term      float64
Credit_History        float64
Property_Area        category
dtype: object

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_nan,Married_No,Married_nan,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Urban,Property_Area_Rural,Property_Area_nan
0,5849,0.0,,360.0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,4583,1508.0,128.0,360.0,1.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2,3000,0.0,66.0,360.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
3,2583,2358.0,120.0,360.0,1.0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
4,6000,0.0,141.0,360.0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0


(614, 20)

ApplicantIncome             int64
CoapplicantIncome         float64
LoanAmount                float64
Loan_Amount_Term          float64
Credit_History            float64
Gender_Female               uint8
Gender_nan                  uint8
Married_No                  uint8
Married_nan                 uint8
Dependents_1                uint8
Dependents_2                uint8
Dependents_3+               uint8
Dependents_nan              uint8
Education_Not Graduate      uint8
Education_nan               uint8
Self_Employed_Yes           uint8
Self_Employed_nan           uint8
Property_Area_Urban         uint8
Property_Area_Rural         uint8
Property_Area_nan           uint8
dtype: object

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')
Gender
['Male', 'Female', 'Unknown']
Married
['Yes', 'No']
Dependents
['0', '2', '1']
Education
['Graduate', 'Not Graduate']
Self_Employed
['No', 'Yes']
Property_Area
['Urban', 'Rural', 'Semiurban']


Gender               category
Married              category
Dependents           category
Education            category
Self_Employed        category
ApplicantIncome         int64
CoapplicantIncome     float64
LoanAmount            float64
Loan_Amount_Term      float64
Credit_History        float64
Property_Area        category
dtype: object

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Unknown,Gender_nan,Married_No,Married_nan,Dependents_2,Dependents_1,Dependents_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_nan
0,5720,0.0,110.0,360.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3076,1500.0,126.0,360.0,1.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,5000,1800.0,208.0,360.0,1.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,2340,2546.0,100.0,360.0,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,3276,0.0,78.0,360.0,1.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0


(333, 20)

ApplicantIncome              int64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Gender_Female                uint8
Gender_Unknown               uint8
Gender_nan                   uint8
Married_No                   uint8
Married_nan                  uint8
Dependents_2                 uint8
Dependents_1                 uint8
Dependents_nan               uint8
Education_Not Graduate       uint8
Education_nan                uint8
Self_Employed_Yes            uint8
Self_Employed_nan            uint8
Property_Area_Rural          uint8
Property_Area_Semiurban      uint8
Property_Area_nan            uint8
dtype: object

- ### 数値データの欠損値を補完

In [8]:
def impute(X, strategy='mean'):
    
    import pandas as pd
    from sklearn.preprocessing import Imputer
    # imputerを使用し数値欠損値を補完する
    imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
    imp.fit(X)
    X = pd.DataFrame(imp.transform(X), columns=X.columns.values)
    display(X.head())
    return X
    
X = impute(X, 'mean')
X_score = impute(X_score, 'mean')



Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_nan,Married_No,Married_nan,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Urban,Property_Area_Rural,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0




Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Unknown,Gender_nan,Married_No,Married_nan,Dependents_2,Dependents_1,Dependents_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_nan
0,5720.0,0.0,110.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2340.0,2546.0,100.0,360.0,0.831715,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


- ### 学習用データとスコア用データの不整合を解消する

In [9]:
def mismatch_resolution(X, X_score):

    # Pythonの集合型変数を利用する
    X_columns = set(X.columns.values)
    X_score_columns = set(X_score.columns.values)

    # 学習用にはあってスコアには無いデータ項目
    diff1 = X_columns - X_score_columns
    print('学習用のみのカラム:', diff1)

    # スコアにはあって学習用には無いデータ項目
    diff2 = X_score_columns - X_columns
    print('Scoreのみのカラム:',diff2)

    # カラム名のみのdfを作成する
    X_columns_only = pd.DataFrame(None, columns=X_columns, dtype=float)
    display(X_columns_only)

    # 上記カラムにスコア用データをconcatする、該当しないカラムは欠損状態で結合される
    X_score_columns2 = pd.concat([X_columns_only, X_score], sort=False)
    print(X_score_columns2.shape)
    display(X_score_columns2.head())

    # スコアリングに登場したGender_Unknownを削除する
    X_score_columns3 = X_score_columns2.drop(list(diff2), axis=1)
    print(X_score_columns3.shape)
#     display(X_score_columns3.head())

    # モデルに登場したDepwndents_3+のNaNをゼロ埋めする
    X_score_columns3.loc[:,list(diff1)] = X_score_columns3.loc[:,list(diff1)].fillna(0, axis=1)
#     display(X_score_columns3.head())

    # モデルでfitさせたRFEをスコアで使用する為に、順序をモデルと統一する
    X_score_columns3 = X_score_columns3.reindex(X.columns.values, axis=1)
    
    display(X_score_columns3.head())
    
    return X_score_columns3

X_score = mismatch_resolution(X, X_score)

学習用のみのカラム: {'Dependents_3+', 'Property_Area_Urban'}
Scoreのみのカラム: {'Property_Area_Semiurban', 'Gender_Unknown'}


Unnamed: 0,Married_No,Dependents_nan,Dependents_3+,Gender_nan,Self_Employed_nan,Property_Area_Rural,Education_Not Graduate,LoanAmount,Property_Area_nan,CoapplicantIncome,Property_Area_Urban,Married_nan,Dependents_2,Self_Employed_Yes,Loan_Amount_Term,ApplicantIncome,Education_nan,Dependents_1,Credit_History,Gender_Female


(333, 22)


Unnamed: 0,Married_No,Dependents_nan,Dependents_3+,Gender_nan,Self_Employed_nan,Property_Area_Rural,Education_Not Graduate,LoanAmount,Property_Area_nan,CoapplicantIncome,...,Dependents_2,Self_Employed_Yes,Loan_Amount_Term,ApplicantIncome,Education_nan,Dependents_1,Credit_History,Gender_Female,Gender_Unknown,Property_Area_Semiurban
0,0.0,0.0,,0.0,0.0,0.0,0.0,110.0,0.0,0.0,...,0.0,0.0,360.0,5720.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,,0.0,0.0,0.0,0.0,126.0,0.0,1500.0,...,0.0,0.0,360.0,3076.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,208.0,0.0,1800.0,...,1.0,0.0,360.0,5000.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,,0.0,0.0,0.0,0.0,100.0,0.0,2546.0,...,1.0,0.0,360.0,2340.0,0.0,0.0,0.831715,0.0,0.0,0.0
4,1.0,0.0,,0.0,0.0,0.0,1.0,78.0,0.0,0.0,...,0.0,0.0,360.0,3276.0,0.0,0.0,1.0,0.0,0.0,0.0


(333, 20)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_nan,Married_No,Married_nan,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Urban,Property_Area_Rural,Property_Area_nan
0,5720.0,0.0,110.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2340.0,2546.0,100.0,360.0,0.831715,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## 学習

- ### Hyperoptの動作確認用データ準備

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer()

print('dataset.keys():',dataset.keys())

X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.Series(dataset.target, name='y')

# 中身の確認
print('------------------------------')
print(f'X shape:{X.shape}')
print('------------------------------')
display(X.join(y).head())

# ホールドアウト
X, X_score, y, y_score = train_test_split(X, y, test_size=0.20, random_state=1)
print('X_shape:', X.shape)
print('y_shape:', y.shape)
print('X_score_shape:', X_score.shape)
print('y_score_shape:', y_score.shape)

dataset.keys(): dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
------------------------------
X shape:(569, 30)
------------------------------


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


X_shape: (455, 30)
y_shape: (455,)
X_score_shape: (114, 30)
y_score_shape: (114,)


- ### Hyperopt構築

便利な関数定義

In [11]:
# 最適化履歴を詳細に抽出する関数
def f_wrap_space_eval(hp_space, trial):
    return space_eval(hp_space, {k: v[0] for (k, v) in trial['misc']['vals'].items() if len(v) > 0})


# dict型を展開してくれる関数定義
def f_unpack_dict(dct):
    res = {}
    for (k, v) in dct.items():
        if isinstance(v, dict):
            res = {**res, **f_unpack_dict(v)}
        else:
            res[k] = v
            
    return res

必要ライブラリのインポート

In [12]:
# 計算
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score, f1_score

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample
from functools import partial
# モデル
from sklearn.neighbors import KNeighborsClassifier#K近傍法
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier #決定木
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier #ランダムフォレストと勾配ブースティング

Hyperoptに使用するランダムなパラメータの組み合わせ定義

In [13]:
# 複数モデルとハイパーパラメータをいっぺんに比較したい場合の指定方法
hp_space_clf = {
    # hp.choiceでランダムに選択
    'clf_type': hp.choice('clf_type', [
#       k近傍法、距離の近いkの数を1〜50の整数値に設定
        {'type': 'knn', 'reduct': {'n_components': ho_scope.int(hp.quniform('reduct.n_components', low=5, high=20, q=1))},
                        'clf': {'n_neighbors': ho_scope.int(hp.quniform('knn.n_neighbors', low=1, high=50, q=1))}},
#       サポートベクターマシン、ソフトマージン分類のペナルティCの数を1.0〜25.0のfloatに設定、カーネルトリックの方法は2種類試す
        {'type': 'rsvc', 'reduct': {'n_components': ho_scope.int(hp.quniform('rsvc.n_components', low=5, high=20, q=1))},
                         'clf': {'C': hp.uniform('rsvc.C', low=0.1, high=25.0), 
                                 'class_weight': hp.choice('rsvc.class_weight', [None, 'balanced']), 
                                 'gamma': hp.choice('rsvc.gamma', ['auto', 'scale']), 
                                 'kernel': hp.choice('rsvc.kernel', ['rbf', 'poly'])}},
#       線形サポートベクターマシン、ソフトマージン分類のペナルティCの数を1.0〜25.0のfloatに設定
        {'type': 'lsvc', 'reduct': {'n_components': ho_scope.int(hp.quniform('lsvc.n_components', low=5, high=20, q=1))},
                         'clf': {'C': hp.uniform('lsvc.C', low=1.0, high=25.0), 
                                 'class_weight': hp.choice('lsvc.class_weight', [None, 'balanced']),
                                 'gamma': hp.choice('lsvc.gamma', ['auto', 'scale'])}},
#       決定木、不純度の計算方法2種類で設定
        {'type': 'tree', 'reduct': {'n_components': ho_scope.int(hp.quniform('tree.n_components', low=5, high=20, q=1))},
                         'clf': {'criterion': hp.choice('tree.criterion', ['gini', 'entropy'])}},
#       ランダムフォレスト、不純度の計算方法2種類を試す、復元抽出本数を100〜300本25本飛ばしで設定
        {'type': 'rf', 'reduct': {'n_components': ho_scope.int(hp.quniform('rf.n_components', low=5, high=20, q=1))},
                       'clf': {'n_estimators': ho_scope.int(hp.quniform('rf.n_estimators', low=100, high=500, q=10)), 
                               'criterion': hp.choice('rf.criterion', ['gini', 'entropy'])}},
#       勾配ブースティング、学習回数を100〜300回25回飛ばし、ロスを二種類、各ツリーの影響度を0.01〜0.5で設定
        {'type': 'gbr', 'reduct': {'n_components': ho_scope.int(hp.quniform('gbr.n_components', low=5, high=20, q=1))},
                        'clf': {'loss': hp.choice('gbr.loss', ['deviance', 'exponential']),
                                'n_estimators': ho_scope.int(hp.quniform('gbr.n_estimators', low=100, high=500, q=10)),
                                'learning_rate': hp.uniform('gbr.learning_rate', low=0.01, high=10.0)}}])} 
ho_sample(hp_space_clf)

{'clf_type': {'clf': {'n_neighbors': 7},
  'reduct': {'n_components': 12},
  'type': 'knn'}}

hp_space_clfで指定されたパラメータでfitする為のPipelineを設定

In [14]:
# 探索スペースから生成される組み合わせで条件分岐を用いて各モデルを分割し、パイプラインでfitさせていく
def f_clf(hps):
    # knnの場合
    if hps['clf_type']['type'] == 'knn':
        model = Pipeline([
            ('scale', StandardScaler()),
            ('reduct', PCA(**hps['clf_type']['reduct'], random_state=1)),
            ('clf', KNeighborsClassifier(**f_unpack_dict(hps['clf_type']['clf'])))])
    # サポートベクターマシンの場合
    elif hps['clf_type']['type'] == 'rsvc':
        model = Pipeline([
            ('scale', StandardScaler()),
            ('reduct', PCA(**hps['clf_type']['reduct'], random_state=1)),
            ('clf', SVC(**f_unpack_dict(hps['clf_type']['clf']), random_state=1, probability=True))])
    # 線形サポートベクターマシンの場合
    elif hps['clf_type']['type'] == 'lsvc':
        model = Pipeline([
            ('scale', StandardScaler()),
            ('reduct', PCA(**hps['clf_type']['reduct'], random_state=1)),
#           LinearSVC()はpredict_probaが使用できないため, SVC()にkernel='linear',probability=Trueを適用することで代用する
            ('clf', SVC(**f_unpack_dict(hps['clf_type']['clf']), random_state=1, kernel='linear',probability=True))])
    # 決定木の場合
    elif hps['clf_type']['type'] == 'tree':
        model = Pipeline([
#             ('scale', StandardScaler()),
            ('reduct', PCA(**hps['clf_type']['reduct'], random_state=1)),
            ('clf', DecisionTreeClassifier(**f_unpack_dict(hps['clf_type']['clf']), random_state=1))])
    # ランダムフォレスト
    elif hps['clf_type']['type'] == 'rf':
        model = Pipeline([
#             ('scale', StandardScaler()),
            ('reduct', PCA(**hps['clf_type']['reduct'], random_state=1)),
            ('clf', RandomForestClassifier(**f_unpack_dict(hps['clf_type']['clf']), max_depth=5, random_state=1))])
    # 勾配ブースティング
    elif hps['clf_type']['type'] == 'gbr':
        model = Pipeline([
#             ('scale', StandardScaler()),
            ('reduct', PCA(**hps['clf_type']['reduct'], random_state=1)),
            ('clf', GradientBoostingClassifier(**f_unpack_dict(hps['clf_type']['clf']), random_state=1))])
    else:
        raise KeyError('Unknown classifier type hyperparameter value: {0}'.format(hps['clf_type']['type']))
    
    return model

最小化するスコアを設定

In [15]:
# 最小化する関数を定義する
def f_to_min(hps, X, y, ncv=5):
    
#   https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
#   scoringの指標は上記から選ぶ
    scoring = 'roc_auc'
    
    model = f_clf(hps)
    cv_res = cross_val_score(model, X, y, cv=StratifiedKFold(ncv, random_state=1), scoring=scoring, n_jobs=-1)
    
    # 今回はdict型で結果を返すようにする
    return {
        'loss': -cv_res.mean(), 
        # 標準偏差を追加することで一貫性をもたせる
        'cv_std': cv_res.std(), 
        'status': STATUS_OK
    }

探索実行

In [16]:
# 最適化の履歴を記録
trials_clf = Trials()
# 最適化
# 500回実行する
best_clf = fmin(partial(f_to_min, X=X, y=y), 
                 hp_space_clf, algo=tpe.suggest, max_evals=500, 
                 trials=trials_clf, rstate=np.random.RandomState(1))

100%|██████████| 500/500 [01:22<00:00,  6.07it/s, best loss: -0.9955624355005159]


ベストスコア、スコア順位確認

In [17]:
# 正解データで検証
# ベストスコアのパラメータを確認
print('Best parameters:')
print(space_eval(hp_space_clf, best_clf))
# 最良のパラメータでモデルを構築
clf = f_clf(space_eval(hp_space_clf, best_clf)).fit(X, y)

# trials_clfからfor文を使用して履歴を取り出す
clf_dict = []
for num, item in enumerate(trials_clf.trials):
    vals = item['misc']['vals']
    result = item['result']
    model_list = ['knn', 'rsvc', 'lsvc', 'tree', 'rf', 'gbr']
    clf_dict.append({'model_name': model_list[vals['clf_type'][0]]})
    clf_dict[num].update(result)
#     print(clf_dict[num])

# lossの低い順に並び替え
scores_sorted = np.array(sorted(clf_dict, key=lambda x:x['loss']))
print(scores_sorted[:10])

Best parameters:
{'clf_type': {'clf': {'C': 2.7860086041040955, 'class_weight': 'balanced', 'gamma': 'auto'}, 'reduct': {'n_components': 12}, 'type': 'lsvc'}}
[{'model_name': 'lsvc', 'loss': -0.9955624355005159, 'cv_std': 0.005186721103458854, 'status': 'ok'}
 {'model_name': 'lsvc', 'loss': -0.9954592363261094, 'cv_std': 0.0053663570691434796, 'status': 'ok'}
 {'model_name': 'lsvc', 'loss': -0.9954592363261094, 'cv_std': 0.004699813224658693, 'status': 'ok'}
 {'model_name': 'lsvc', 'loss': -0.9954592363261094, 'cv_std': 0.004888635410661328, 'status': 'ok'}
 {'model_name': 'lsvc', 'loss': -0.9954592363261094, 'cv_std': 0.004699813224658693, 'status': 'ok'}
 {'model_name': 'lsvc', 'loss': -0.9954592363261094, 'cv_std': 0.004888635410661328, 'status': 'ok'}
 {'model_name': 'lsvc', 'loss': -0.9953560371517028, 'cv_std': 0.004895166656607422, 'status': 'ok'}
 {'model_name': 'lsvc', 'loss': -0.9953560371517028, 'cv_std': 0.004895166656607422, 'status': 'ok'}
 {'model_name': 'lsvc', 'loss': 

- ### スコアデータを用いて評価を行う

最良のモデルを使用しテストデータにフィット、スコアを確認する

In [24]:
# 最良のモデルを使用し、テストデータをで再現する
clf_val_score = roc_auc_score(y_score, clf.predict(X_score))
print('Cross-val score: {0:.5f} +/- {1:.5f}; validation score: {2:.5f}'.\
format(-trials_clf.best_trial['result']['loss'], trials_clf.best_trial['result']['cv_std'], clf_val_score))

Cross-val score: 0.99556 +/- 0.00519; validation score: 0.96925


In [19]:
# 最良のモデルを使用し、テストデータをで再現する
clf_val_score = f1_score(y_score, clf.predict(X_score))
print('Cross-val score: {0:.5f} +/- {1:.5f}; validation score: {2:.5f}'.\
format(-trials_clf.best_trial['result']['loss'], trials_clf.best_trial['result']['cv_std'], clf_val_score))

Cross-val score: 0.99556 +/- 0.00519; validation score: 0.97931


In [20]:
# 最良のモデルを使用し、テストデータをで再現する
clf_val_score = r2_score(y_score, clf.predict(X_score))
print('Cross-val score: {0:.5f} +/- {1:.5f}; validation score: {2:.5f}'.\
format(-trials_clf.best_trial['result']['loss'], trials_clf.best_trial['result']['cv_std'], clf_val_score))

Cross-val score: 0.99556 +/- 0.00519; validation score: 0.88690


- ### ベストモデルの永続化

In [21]:
import pickle

# 永続化したいデータをpickle.dumps()に指定する
s = pickle.dumps(clf)

# 永続化したデータを使用する場合はpickle.loads()で呼び出せる
clf2 = pickle.loads(s)
print(clf2.predict_proba(X_score)[:10])
print(clf2.predict(X_score)[:10])

[[0.16135254 0.83864746]
 [0.95407137 0.04592863]
 [0.01995397 0.98004603]
 [0.98632057 0.01367943]
 [0.93927395 0.06072605]
 [0.99235522 0.00764478]
 [0.9987342  0.0012658 ]
 [0.82231907 0.17768093]
 [0.01384904 0.98615096]
 [0.05522497 0.94477503]]
[1 0 1 0 0 0 0 0 1 1]
