In [1]:
import numpy as np
import pandas as pd
from rdkit import rdBase, Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors
import mordred

In [2]:
df = pd.read_csv('./data/origin.csv',index_col=0)
df

Unnamed: 0,entry,R1-,organocatalyst,organocatalyst(mol%),temp(℃),time(h),yield,R2_C,R2_H,under_O2,under_air,solvent_CH3CN,solvent_MeOH,solvent_toluene
0,1,[*]C1=CC=CC=C1,ClC1=CC=C(C(O)=O)C(O)=C1,5.0,90,24.0,81.0,0,1,1,0,0,0,1
1,2,[*]C1=CC=CC=C1,ClC1=CC=C(C(O)=O)C(O)=C1,5.0,90,12.0,14.0,0,1,1,0,0,0,1
2,3,[*]C1=CC=CC=C1,OC1=CC(C)=CC=C1C(O)=O,5.0,90,24.0,74.0,0,1,1,0,0,0,1
3,4,[*]C1=CC=CC=C1,OC1=CC(C)=CC=C1C(O)=O,5.0,90,12.0,15.0,0,1,1,0,0,0,1
4,5,[*]C1=CC=CC=C1,OC1=CC(OC)=CC=C1C(O)=O,5.0,90,24.0,90.0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2i,[*]C1=CC(Br)=CC=C1,O=C1C(C=C(OC)C(C(C)(C)C)=C1)=O,10.0,23,72.0,95.0,1,0,1,0,1,0,0
116,2j,[*]C1=CC=CC(OC)=C1,O=C1C(C=C(OC)C(C(C)(C)C)=C1)=O,10.0,23,48.0,89.0,1,0,1,0,1,0,0
117,2k,[*]C1=CC(C)=C(C)C=C1,O=C1C(C=C(OC)C(C(C)(C)C)=C1)=O,10.0,23,72.0,99.0,1,0,1,0,1,0,0
118,2m,[*]C1=CSC=C1,O=C1C(C=C(OC)C(C(C)(C)C)=C1)=O,10.0,60,48.0,83.0,1,0,1,0,1,0,0


In [3]:
mols_r1 = list(Chem.MolFromSmiles(smile) for smile in df['R1-'].values)
mols_organocatalyst = list(Chem.MolFromSmiles(smile) for smile in df['organocatalyst'].values)

# Rdkit記述子に変換

In [4]:
def toFingerRdkit(mols,prefix):
    descriptor_names = [(descriptor_name[0] + '_' + prefix) for descriptor_name in Descriptors.descList]
    descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
    rdkit_descriptors_results = [descriptor_calculator.CalcDescriptors(mol) for mol in mols]
    df_rdkit = pd.DataFrame(rdkit_descriptors_results, columns=descriptor_names)
    df_rdkit = df_rdkit[df_rdkit.columns[~df_rdkit.isnull().any()]]
    return df_rdkit

In [5]:
df_rdkit_r1 = toFingerRdkit(mols_r1, 'r1')
df_rdkit_organocatalyst = toFingerRdkit(mols_organocatalyst, 'organocatalyst')

In [6]:
print(df_rdkit_r1.shape)
print(df_rdkit_organocatalyst.shape)

(120, 208)
(120, 208)


# Mordred記述子の変換を行う

In [7]:
def toFingerMordred(mols):
    mordered_calculator = Calculator(descriptors,ignore_3D=True)
    df_mordred = mordered_calculator.pandas(pd.Series(mols))
    
    for column in df_mordred.columns:
        if df_mordred[column].dtypes == object:
            df_mordred[column] = df_mordred[column].values.astype(np.float32)
    
    df_mordered = df_mordred[df_mordred.columns[~df_mordred.isnull().any()]]
    return df_mordered

In [8]:
df_mordred_r1 = toFingerMordred(mols_r1)
df_mordred_organocatalyst = toFingerMordred(mols_organocatalyst)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [00:19<00:00,  6.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [00:29<00:00,  4.06it/s]


In [41]:
df_rdkit_r1.to_csv('./data/method4/rdkit_r1.csv')
df_rdkit_organocatalyst.to_csv('./data/method4/rdkit_organocatalyst.csv')
df_mordred_r1.to_csv('./data/method4/mordred_r1.csv')
df_mordred_organocatalyst.to_csv('./data/method4/mordred_organocatalyst.csv')

In [42]:
df_dropped = df.drop(columns=['entry', 'R1-', 'organocatalyst'])
r1_names = ['rdkit_r1','mordred_r1']
organo_names = ['rdkit_organo', 'mordred_organo']

for r1_name, df_r1 in zip(r1_names, [df_rdkit_r1,df_mordred_r1]):
    for organo_name, df_organocatalyst in zip(organo_names, [df_rdkit_organocatalyst, df_mordred_organocatalyst]):
        _df = pd.concat([df_dropped,df_r1, df_organocatalyst], axis=1)
        _df.to_csv(f'./data/method4/dataset_{r1_name}_{organo_name}.csv')


KeyError: "['entry', 'R1-', 'organocatalyst'] not found in axis"

# 特徴量選択
1. 分散0の説明変数を除去
2. 9割以上が同じ値になる記述子を削除
3. 0.95以上の説明変数の組みがある場合どちらかを除去
4. オートスケーリング

In [18]:
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

def selectFeature(df):
    select = VarianceThreshold()
    select_values = select.fit_transform(df.values)
    df_selected = df[df.columns[select.get_support()]]
    
    st_threshold = 0.9
    selected = []
    for column in df_selected:
        value_counts = df_selected[column].value_counts(sort=True)
        # 全体に対して何割同じ値があるか
        score = value_counts.values[0] / len(df_selected[column].values)
        if score < st_threshold:
            selected.append(column)
    df_selected = df_selected[selected]
    
    # 相関が0.95以上のカラムを除去
    df_selected = deleteHighCorrColumn(df_selected)
    
    return df_selected
        

def deleteHighCorrColumn(df):
    threshold = 0.95
    df_corr = df.corr()
    df_corr = abs(df_corr)
    columns = df_corr.columns

    # 対角線の値を0にする
    for i in range(0, len(columns)):
        df_corr.iloc[i, i] = 0

    while True:
        columns = df_corr.columns
        max_corr = 0.0
        query_column = None
        target_column = None

        df_max_column_value = df_corr.max()
        max_corr = df_max_column_value.max()
        query_column = df_max_column_value.idxmax()
        target_column = df_corr[query_column].idxmax()

        if max_corr < threshold:
            # しきい値を超えるものがなかったため終了
            break
        else:
            # しきい値を超えるものがあった場合
            delete_column = None
            saved_column = None

            # その他との相関の絶対値が大きい方を除去
            if sum(df_corr[query_column]) <= sum(df_corr[target_column]):
                delete_column = target_column
                saved_column = query_column
            else:
                delete_column = query_column
                saved_column = target_column

            # 除去すべき特徴を相関行列から消す（行、列）
            df_corr.drop([delete_column], axis=0, inplace=True)
            df_corr.drop([delete_column], axis=1, inplace=True)

    return df[df_corr.columns]


In [43]:
df1 = pd.read_csv('./data/method4/dataset_mordred_r1_mordred_organo.csv',index_col=0)
df2 = pd.read_csv('./data/method4/dataset_rdkit_r1_rdkit_organo.csv',index_col=0)
df3 = pd.read_csv('./data/method4/dataset_rdkit_r1_mordred_organo.csv',index_col=0)
df4 = pd.read_csv('./data/method4/dataset_mordred_r1_rdkit_organo.csv',index_col=0)

In [44]:
selectFeature(df1).to_csv('./data/method4/mordred_r1_mordred_organo.csv')
selectFeature(df2).to_csv('./data/method4/rdkit_r1_rdkit_organo.csv')
selectFeature(df3).to_csv('./data/method4/rdkit_r1_mordred_organo.csv')
selectFeature(df4).to_csv('./data/method4/mordred_r1_rdkit_organo.csv')

In [52]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler


def scale(df):
    # scaling
    X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['yield']), df['yield'], test_size=0.2, random_state=0)
    ss = StandardScaler()
    X_train_scaled = ss.fit_transform(X_train.values)
    X_test_scaled = ss.transform(X_test.values)
    return [X_train_scaled,X_test_scaled,y_train,y_test]

import optuna
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
def objective(trial, x, t, cv):
    # 1. ハイパーパラメータごとに探索範囲を指定
    n_estimaters = trial.suggest_int('n_estimators', 1, 100)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    num_leaves = trial.suggest_int('num_leaves', 2, 10)
    min_child_weight = trial.suggest_loguniform("min_child_weight", 0.1, 10)
    subsample = trial.suggest_uniform("subsample",0.55, 0.95)    
    colsample_bytree = trial.suggest_uniform("subsample",0.55, 0.95)    

    # 2. 学習に使用するアルゴリズムを指定
    estimator = LGBMRegressor(
        n_estimators=n_estimaters,
        max_depth=max_depth,
        num_leaves=num_leaves,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree= colsample_bytree,
        random_state=43
    )

    # 3. 学習の実行、検証結果の表示
    print('Current_params : ', trial.params)
    r2 = cross_val_score(estimator, x, t, cv=cv, scoring="r2").mean()
    return r2

In [53]:
# study オブジェクトの作成（最大化）
study = optuna.create_study(direction='maximize')
# 10分割交差検証によるチューニング
cv = 10
[X_train_scaled,X_test_scaled,y_train,y_test] = scale(df1)
study.optimize(lambda trial: objective(trial, X_train_scaled, y_train, cv), n_trials=50)

[32m[I 2022-12-05 08:29:25,170][0m A new study created in memory with name: no-name-0655b805-3dec-49d2-9aa6-ca3ca4ae4162[0m


Current_params :  {'n_estimators': 13, 'max_depth': 6, 'num_leaves': 10, 'min_child_weight': 2.2223139003619967, 'subsample': 0.6161277965056239}


[32m[I 2022-12-05 08:29:25,788][0m Trial 0 finished with value: -0.13268061833247513 and parameters: {'n_estimators': 13, 'max_depth': 6, 'num_leaves': 10, 'min_child_weight': 2.2223139003619967, 'subsample': 0.6161277965056239}. Best is trial 0 with value: -0.13268061833247513.[0m


Current_params :  {'n_estimators': 76, 'max_depth': 1, 'num_leaves': 10, 'min_child_weight': 0.7259630007704551, 'subsample': 0.6268481590938403}


[32m[I 2022-12-05 08:29:26,391][0m Trial 1 finished with value: -0.1314168738595139 and parameters: {'n_estimators': 76, 'max_depth': 1, 'num_leaves': 10, 'min_child_weight': 0.7259630007704551, 'subsample': 0.6268481590938403}. Best is trial 1 with value: -0.1314168738595139.[0m


Current_params :  {'n_estimators': 41, 'max_depth': 6, 'num_leaves': 6, 'min_child_weight': 6.628995133124165, 'subsample': 0.884115199496866}


[32m[I 2022-12-05 08:29:27,040][0m Trial 2 finished with value: -0.08209443517222217 and parameters: {'n_estimators': 41, 'max_depth': 6, 'num_leaves': 6, 'min_child_weight': 6.628995133124165, 'subsample': 0.884115199496866}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 31, 'max_depth': 1, 'num_leaves': 3, 'min_child_weight': 3.4273791466892836, 'subsample': 0.9078443525832435}


[32m[I 2022-12-05 08:29:27,822][0m Trial 3 finished with value: -0.14464250472149365 and parameters: {'n_estimators': 31, 'max_depth': 1, 'num_leaves': 3, 'min_child_weight': 3.4273791466892836, 'subsample': 0.9078443525832435}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 49, 'max_depth': 4, 'num_leaves': 2, 'min_child_weight': 0.45961421900507454, 'subsample': 0.922322040875989}


[32m[I 2022-12-05 08:29:28,738][0m Trial 4 finished with value: -0.12937447925434073 and parameters: {'n_estimators': 49, 'max_depth': 4, 'num_leaves': 2, 'min_child_weight': 0.45961421900507454, 'subsample': 0.922322040875989}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 13, 'max_depth': 4, 'num_leaves': 3, 'min_child_weight': 2.164114711034151, 'subsample': 0.8749350578431518}


[32m[I 2022-12-05 08:29:29,273][0m Trial 5 finished with value: -0.15849148250042927 and parameters: {'n_estimators': 13, 'max_depth': 4, 'num_leaves': 3, 'min_child_weight': 2.164114711034151, 'subsample': 0.8749350578431518}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 87, 'max_depth': 3, 'num_leaves': 2, 'min_child_weight': 1.9309588586763147, 'subsample': 0.7528255888987714}


[32m[I 2022-12-05 08:29:30,078][0m Trial 6 finished with value: -0.13010766525938933 and parameters: {'n_estimators': 87, 'max_depth': 3, 'num_leaves': 2, 'min_child_weight': 1.9309588586763147, 'subsample': 0.7528255888987714}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 17, 'max_depth': 6, 'num_leaves': 7, 'min_child_weight': 1.6085481831069233, 'subsample': 0.6505111514901745}


[32m[I 2022-12-05 08:29:30,676][0m Trial 7 finished with value: -0.13436132412239404 and parameters: {'n_estimators': 17, 'max_depth': 6, 'num_leaves': 7, 'min_child_weight': 1.6085481831069233, 'subsample': 0.6505111514901745}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 21, 'max_depth': 5, 'num_leaves': 7, 'min_child_weight': 0.1503791837963412, 'subsample': 0.7027201521837856}


[32m[I 2022-12-05 08:29:31,351][0m Trial 8 finished with value: -0.10361013622349871 and parameters: {'n_estimators': 21, 'max_depth': 5, 'num_leaves': 7, 'min_child_weight': 0.1503791837963412, 'subsample': 0.7027201521837856}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 77, 'max_depth': 3, 'num_leaves': 8, 'min_child_weight': 0.8531963244346612, 'subsample': 0.6287344980374171}


[32m[I 2022-12-05 08:29:32,406][0m Trial 9 finished with value: -0.176991132722046 and parameters: {'n_estimators': 77, 'max_depth': 3, 'num_leaves': 8, 'min_child_weight': 0.8531963244346612, 'subsample': 0.6287344980374171}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 54, 'max_depth': 9, 'num_leaves': 5, 'min_child_weight': 8.976260342011727, 'subsample': 0.808590025834832}


[32m[I 2022-12-05 08:29:33,505][0m Trial 10 finished with value: -0.13196866130727739 and parameters: {'n_estimators': 54, 'max_depth': 9, 'num_leaves': 5, 'min_child_weight': 8.976260342011727, 'subsample': 0.808590025834832}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 40, 'max_depth': 8, 'num_leaves': 5, 'min_child_weight': 0.1361497811200107, 'subsample': 0.7214942254449304}


[32m[I 2022-12-05 08:29:34,241][0m Trial 11 finished with value: -0.0825369511696596 and parameters: {'n_estimators': 40, 'max_depth': 8, 'num_leaves': 5, 'min_child_weight': 0.1361497811200107, 'subsample': 0.7214942254449304}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 43, 'max_depth': 9, 'num_leaves': 5, 'min_child_weight': 0.10206342978039998, 'subsample': 0.8229472085263971}


[32m[I 2022-12-05 08:29:34,983][0m Trial 12 finished with value: -0.12294070562351707 and parameters: {'n_estimators': 43, 'max_depth': 9, 'num_leaves': 5, 'min_child_weight': 0.10206342978039998, 'subsample': 0.8229472085263971}. Best is trial 2 with value: -0.08209443517222217.[0m


Current_params :  {'n_estimators': 36, 'max_depth': 8, 'num_leaves': 5, 'min_child_weight': 0.23713341675518004, 'subsample': 0.7256323946838023}


[32m[I 2022-12-05 08:29:35,648][0m Trial 13 finished with value: -0.0797730075891511 and parameters: {'n_estimators': 36, 'max_depth': 8, 'num_leaves': 5, 'min_child_weight': 0.23713341675518004, 'subsample': 0.7256323946838023}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 62, 'max_depth': 7, 'num_leaves': 6, 'min_child_weight': 0.32203581575342005, 'subsample': 0.5608103427995922}


[32m[I 2022-12-05 08:29:36,568][0m Trial 14 finished with value: -0.16805798580542475 and parameters: {'n_estimators': 62, 'max_depth': 7, 'num_leaves': 6, 'min_child_weight': 0.32203581575342005, 'subsample': 0.5608103427995922}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 3, 'max_depth': 10, 'num_leaves': 4, 'min_child_weight': 9.738053857060404, 'subsample': 0.7967207736905482}


[32m[I 2022-12-05 08:29:37,259][0m Trial 15 finished with value: -0.43308926869175524 and parameters: {'n_estimators': 3, 'max_depth': 10, 'num_leaves': 4, 'min_child_weight': 9.738053857060404, 'subsample': 0.7967207736905482}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 31, 'max_depth': 7, 'num_leaves': 7, 'min_child_weight': 0.2879686986088663, 'subsample': 0.8599400957395895}


[32m[I 2022-12-05 08:29:38,751][0m Trial 16 finished with value: -0.09069311444986457 and parameters: {'n_estimators': 31, 'max_depth': 7, 'num_leaves': 7, 'min_child_weight': 0.2879686986088663, 'subsample': 0.8599400957395895}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 62, 'max_depth': 8, 'num_leaves': 8, 'min_child_weight': 4.199549958822171, 'subsample': 0.7707330774867037}


[32m[I 2022-12-05 08:29:40,185][0m Trial 17 finished with value: -0.1500198439328095 and parameters: {'n_estimators': 62, 'max_depth': 8, 'num_leaves': 8, 'min_child_weight': 4.199549958822171, 'subsample': 0.7707330774867037}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 31, 'max_depth': 10, 'num_leaves': 6, 'min_child_weight': 5.161391268941661, 'subsample': 0.6856347596088876}


[32m[I 2022-12-05 08:29:40,957][0m Trial 18 finished with value: -0.0806523532878326 and parameters: {'n_estimators': 31, 'max_depth': 10, 'num_leaves': 6, 'min_child_weight': 5.161391268941661, 'subsample': 0.6856347596088876}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 29, 'max_depth': 10, 'num_leaves': 4, 'min_child_weight': 0.513720448950051, 'subsample': 0.6837728549431834}


[32m[I 2022-12-05 08:29:41,689][0m Trial 19 finished with value: -0.09148902517911861 and parameters: {'n_estimators': 29, 'max_depth': 10, 'num_leaves': 4, 'min_child_weight': 0.513720448950051, 'subsample': 0.6837728549431834}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 98, 'max_depth': 9, 'num_leaves': 9, 'min_child_weight': 1.2003793175489559, 'subsample': 0.5622266251788491}


[32m[I 2022-12-05 08:29:43,077][0m Trial 20 finished with value: -0.18375553849649373 and parameters: {'n_estimators': 98, 'max_depth': 9, 'num_leaves': 9, 'min_child_weight': 1.2003793175489559, 'subsample': 0.5622266251788491}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 40, 'max_depth': 7, 'num_leaves': 6, 'min_child_weight': 5.459290341166786, 'subsample': 0.7295876954969629}


[32m[I 2022-12-05 08:29:43,891][0m Trial 21 finished with value: -0.09639847853620712 and parameters: {'n_estimators': 40, 'max_depth': 7, 'num_leaves': 6, 'min_child_weight': 5.459290341166786, 'subsample': 0.7295876954969629}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 31, 'max_depth': 8, 'num_leaves': 6, 'min_child_weight': 5.6448044062201985, 'subsample': 0.6717743284580773}


[32m[I 2022-12-05 08:29:44,502][0m Trial 22 finished with value: -0.11387448255905212 and parameters: {'n_estimators': 31, 'max_depth': 8, 'num_leaves': 6, 'min_child_weight': 5.6448044062201985, 'subsample': 0.6717743284580773}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 55, 'max_depth': 10, 'num_leaves': 4, 'min_child_weight': 6.9491668293287265, 'subsample': 0.9450082728087235}


[32m[I 2022-12-05 08:29:45,527][0m Trial 23 finished with value: -0.11619918744207754 and parameters: {'n_estimators': 55, 'max_depth': 10, 'num_leaves': 4, 'min_child_weight': 6.9491668293287265, 'subsample': 0.9450082728087235}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 24, 'max_depth': 9, 'num_leaves': 6, 'min_child_weight': 3.3112255911545514, 'subsample': 0.775676562040901}


[32m[I 2022-12-05 08:29:46,260][0m Trial 24 finished with value: -0.09495564979331281 and parameters: {'n_estimators': 24, 'max_depth': 9, 'num_leaves': 6, 'min_child_weight': 3.3112255911545514, 'subsample': 0.775676562040901}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 45, 'max_depth': 5, 'num_leaves': 5, 'min_child_weight': 0.216266785285344, 'subsample': 0.8376468054020726}


[32m[I 2022-12-05 08:29:47,039][0m Trial 25 finished with value: -0.12083658562624625 and parameters: {'n_estimators': 45, 'max_depth': 5, 'num_leaves': 5, 'min_child_weight': 0.216266785285344, 'subsample': 0.8376468054020726}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 35, 'max_depth': 8, 'num_leaves': 7, 'min_child_weight': 1.1838317565851835, 'subsample': 0.7283242037176905}


[32m[I 2022-12-05 08:29:47,679][0m Trial 26 finished with value: -0.08129183321343682 and parameters: {'n_estimators': 35, 'max_depth': 8, 'num_leaves': 7, 'min_child_weight': 1.1838317565851835, 'subsample': 0.7283242037176905}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 2, 'max_depth': 8, 'num_leaves': 8, 'min_child_weight': 0.6254474204867901, 'subsample': 0.7318166884758787}


[32m[I 2022-12-05 08:29:48,155][0m Trial 27 finished with value: -0.5332215947093375 and parameters: {'n_estimators': 2, 'max_depth': 8, 'num_leaves': 8, 'min_child_weight': 0.6254474204867901, 'subsample': 0.7318166884758787}. Best is trial 13 with value: -0.0797730075891511.[0m


Current_params :  {'n_estimators': 34, 'max_depth': 10, 'num_leaves': 7, 'min_child_weight': 1.1647222787859262, 'subsample': 0.6969301783904213}


[32m[I 2022-12-05 08:29:48,982][0m Trial 28 finished with value: -0.07416216404075347 and parameters: {'n_estimators': 34, 'max_depth': 10, 'num_leaves': 7, 'min_child_weight': 1.1647222787859262, 'subsample': 0.6969301783904213}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 9, 'max_depth': 10, 'num_leaves': 9, 'min_child_weight': 0.37056020737919326, 'subsample': 0.6635832365033287}


[32m[I 2022-12-05 08:29:49,638][0m Trial 29 finished with value: -0.17220548392786195 and parameters: {'n_estimators': 9, 'max_depth': 10, 'num_leaves': 9, 'min_child_weight': 0.37056020737919326, 'subsample': 0.6635832365033287}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 23, 'max_depth': 10, 'num_leaves': 7, 'min_child_weight': 2.871584453965961, 'subsample': 0.6025385158342295}


[32m[I 2022-12-05 08:29:50,283][0m Trial 30 finished with value: -0.1380442332861475 and parameters: {'n_estimators': 23, 'max_depth': 10, 'num_leaves': 7, 'min_child_weight': 2.871584453965961, 'subsample': 0.6025385158342295}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 36, 'max_depth': 9, 'num_leaves': 7, 'min_child_weight': 1.0921067905624047, 'subsample': 0.6936425116770533}


[32m[I 2022-12-05 08:29:51,320][0m Trial 31 finished with value: -0.07897990201184191 and parameters: {'n_estimators': 36, 'max_depth': 9, 'num_leaves': 7, 'min_child_weight': 1.0921067905624047, 'subsample': 0.6936425116770533}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 36, 'max_depth': 9, 'num_leaves': 8, 'min_child_weight': 1.4679901109157771, 'subsample': 0.6958294616627694}


[32m[I 2022-12-05 08:29:52,184][0m Trial 32 finished with value: -0.07765763635005755 and parameters: {'n_estimators': 36, 'max_depth': 9, 'num_leaves': 8, 'min_child_weight': 1.4679901109157771, 'subsample': 0.6958294616627694}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 50, 'max_depth': 9, 'num_leaves': 9, 'min_child_weight': 1.3748010709911984, 'subsample': 0.7034124675252665}


[32m[I 2022-12-05 08:29:53,561][0m Trial 33 finished with value: -0.0998570404020758 and parameters: {'n_estimators': 50, 'max_depth': 9, 'num_leaves': 9, 'min_child_weight': 1.3748010709911984, 'subsample': 0.7034124675252665}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 37, 'max_depth': 9, 'num_leaves': 10, 'min_child_weight': 0.8960984861494086, 'subsample': 0.6282221142579825}


[32m[I 2022-12-05 08:29:54,341][0m Trial 34 finished with value: -0.14585182193680682 and parameters: {'n_estimators': 37, 'max_depth': 9, 'num_leaves': 10, 'min_child_weight': 0.8960984861494086, 'subsample': 0.6282221142579825}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 63, 'max_depth': 7, 'num_leaves': 8, 'min_child_weight': 0.9675287400959531, 'subsample': 0.6462972113615431}


[32m[I 2022-12-05 08:29:55,536][0m Trial 35 finished with value: -0.1688618638789519 and parameters: {'n_estimators': 63, 'max_depth': 7, 'num_leaves': 8, 'min_child_weight': 0.9675287400959531, 'subsample': 0.6462972113615431}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 26, 'max_depth': 9, 'num_leaves': 9, 'min_child_weight': 0.6148465413091372, 'subsample': 0.5931320934305157}


[32m[I 2022-12-05 08:29:56,318][0m Trial 36 finished with value: -0.11811887174392424 and parameters: {'n_estimators': 26, 'max_depth': 9, 'num_leaves': 9, 'min_child_weight': 0.6148465413091372, 'subsample': 0.5931320934305157}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 48, 'max_depth': 8, 'num_leaves': 8, 'min_child_weight': 1.6358623470868763, 'subsample': 0.753118507438866}


[32m[I 2022-12-05 08:29:57,448][0m Trial 37 finished with value: -0.12349229170733918 and parameters: {'n_estimators': 48, 'max_depth': 8, 'num_leaves': 8, 'min_child_weight': 1.6358623470868763, 'subsample': 0.753118507438866}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 15, 'max_depth': 10, 'num_leaves': 7, 'min_child_weight': 2.3672398920456668, 'subsample': 0.7038469984060182}


[32m[I 2022-12-05 08:29:58,230][0m Trial 38 finished with value: -0.12031596680957471 and parameters: {'n_estimators': 15, 'max_depth': 10, 'num_leaves': 7, 'min_child_weight': 2.3672398920456668, 'subsample': 0.7038469984060182}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 37, 'max_depth': 6, 'num_leaves': 3, 'min_child_weight': 0.7130004337142303, 'subsample': 0.7807265508592639}


[32m[I 2022-12-05 08:29:59,384][0m Trial 39 finished with value: -0.09826710948545489 and parameters: {'n_estimators': 37, 'max_depth': 6, 'num_leaves': 3, 'min_child_weight': 0.7130004337142303, 'subsample': 0.7807265508592639}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 18, 'max_depth': 1, 'num_leaves': 10, 'min_child_weight': 1.7828322986470595, 'subsample': 0.7543661606471636}


[32m[I 2022-12-05 08:30:00,078][0m Trial 40 finished with value: -0.14768828552290708 and parameters: {'n_estimators': 18, 'max_depth': 1, 'num_leaves': 10, 'min_child_weight': 1.7828322986470595, 'subsample': 0.7543661606471636}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 34, 'max_depth': 10, 'num_leaves': 6, 'min_child_weight': 1.2800766877837557, 'subsample': 0.6870220130314253}


[32m[I 2022-12-05 08:30:01,013][0m Trial 41 finished with value: -0.07693624631169597 and parameters: {'n_estimators': 34, 'max_depth': 10, 'num_leaves': 6, 'min_child_weight': 1.2800766877837557, 'subsample': 0.6870220130314253}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 35, 'max_depth': 9, 'num_leaves': 7, 'min_child_weight': 1.375592033109232, 'subsample': 0.6528865957742256}


[32m[I 2022-12-05 08:30:01,883][0m Trial 42 finished with value: -0.1204782761086852 and parameters: {'n_estimators': 35, 'max_depth': 9, 'num_leaves': 7, 'min_child_weight': 1.375592033109232, 'subsample': 0.6528865957742256}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 44, 'max_depth': 10, 'num_leaves': 6, 'min_child_weight': 2.247996541811737, 'subsample': 0.6898629469743338}


[32m[I 2022-12-05 08:30:02,742][0m Trial 43 finished with value: -0.09172032349387345 and parameters: {'n_estimators': 44, 'max_depth': 10, 'num_leaves': 6, 'min_child_weight': 2.247996541811737, 'subsample': 0.6898629469743338}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 27, 'max_depth': 9, 'num_leaves': 5, 'min_child_weight': 1.1164076557695497, 'subsample': 0.7119534103319296}


[32m[I 2022-12-05 08:30:03,385][0m Trial 44 finished with value: -0.10115456026337191 and parameters: {'n_estimators': 27, 'max_depth': 9, 'num_leaves': 5, 'min_child_weight': 1.1164076557695497, 'subsample': 0.7119534103319296}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 55, 'max_depth': 8, 'num_leaves': 8, 'min_child_weight': 0.8617608005597318, 'subsample': 0.6695783777575723}


[32m[I 2022-12-05 08:30:04,195][0m Trial 45 finished with value: -0.14281344185713363 and parameters: {'n_estimators': 55, 'max_depth': 8, 'num_leaves': 8, 'min_child_weight': 0.8617608005597318, 'subsample': 0.6695783777575723}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 20, 'max_depth': 10, 'num_leaves': 7, 'min_child_weight': 0.47101458493880993, 'subsample': 0.7408577310159663}


[32m[I 2022-12-05 08:30:04,788][0m Trial 46 finished with value: -0.11404454959145746 and parameters: {'n_estimators': 20, 'max_depth': 10, 'num_leaves': 7, 'min_child_weight': 0.47101458493880993, 'subsample': 0.7408577310159663}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 10, 'max_depth': 2, 'num_leaves': 6, 'min_child_weight': 0.6852119488257786, 'subsample': 0.6440043860484134}


[32m[I 2022-12-05 08:30:05,295][0m Trial 47 finished with value: -0.15720256916531472 and parameters: {'n_estimators': 10, 'max_depth': 2, 'num_leaves': 6, 'min_child_weight': 0.6852119488257786, 'subsample': 0.6440043860484134}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 39, 'max_depth': 9, 'num_leaves': 5, 'min_child_weight': 1.4480764130269856, 'subsample': 0.6883661959502225}


[32m[I 2022-12-05 08:30:05,984][0m Trial 48 finished with value: -0.08436940367715481 and parameters: {'n_estimators': 39, 'max_depth': 9, 'num_leaves': 5, 'min_child_weight': 1.4480764130269856, 'subsample': 0.6883661959502225}. Best is trial 28 with value: -0.07416216404075347.[0m


Current_params :  {'n_estimators': 44, 'max_depth': 7, 'num_leaves': 7, 'min_child_weight': 1.997991859135557, 'subsample': 0.7137121747521387}


[32m[I 2022-12-05 08:30:06,738][0m Trial 49 finished with value: -0.09316500418623883 and parameters: {'n_estimators': 44, 'max_depth': 7, 'num_leaves': 7, 'min_child_weight': 1.997991859135557, 'subsample': 0.7137121747521387}. Best is trial 28 with value: -0.07416216404075347.[0m


In [64]:
best_params = study.best_params
best_model = LGBMRegressor(**best_params)
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score

# 予測値を算出)
best_model.fit(X_train_scaled, y_train)
train_y_in_cv = cross_val_predict(best_model, X_train_scaled, y_train, cv=10)
predict_y = best_model.predict(X_test_scaled)

fig = plt.figure(figsize=(18.0, 5.0))
ax_cv.set_title("Q2")
ax_cv.set_xlabel('pred') 
ax_cv.set_ylabel('exp')  

ax_ext.set_title("External Validation")
ax_ext.set_xlabel('pred')
ax_ext.set_ylabel('exp') 

ax_cv.scatter(train_y_in_cv, y_train)
ax_ext.scatter(predict_y, y_test)


print('$R^{2}$=' + str(round(r2_score(y_train, train_y_in_cv), 3)))
print('$R^{2}$=' + str(round(r2_score(predict_y, y_test), 3)))
fig.show()

$R^{2}$=0.319
$R^{2}$=-0.818


  fig.show()


<Figure size 1800x500 with 0 Axes>