### 演習13）分子動力学シミュレーションのサンプルデータにある300種類のポリマーの構造物性相関データを用いて，密度，定圧熱容量，熱伝導率，線膨張係数の予測モデルを構築せよ．以下の説明やサンプルコードを参考にモデルの作成方法を自ら工夫し，その結果を考察せよ．

ライブラリの導入

In [1]:
import os
import numpy as np
import pandas as pd
import pickle as pk

import matplotlib.pyplot as plt

from xenonpy.descriptor import Fingerprints
from xenonpy.datatools import Splitter, Scaler

from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score


データの導入

In [2]:
# Load the polymer dataset from CSV. Later cells read its 'SMILES' column as
# the descriptor input and treat every other column as a target property.
dir_file = 'data/Book_data_MD.csv'
data = pd.read_csv(dir_file)


ディスクリプタ/フィンガープリントを用意する

モデル入力のために、カウント型ECFPとMordredを組み合わせる。これらを別々に計算するのではなく、XenonPyモジュールを用いてカスタマイズされた記述子計算機を作成する。これにより、同じ記述子セットの再計算が容易になる。

In [3]:
# prepare customized descriptor function

from xenonpy.descriptor.base import BaseDescriptor
from xenonpy.descriptor import ECFP
from xenonpy.contrib.extend_descriptors.descriptor import Mordred2DDescriptor

class CustomDesc(BaseDescriptor):
    """Descriptor calculator combining count-type ECFP with Mordred 2D descriptors.

    Parameters
    ----------
    n_jobs
        Stored on the instance and forwarded to the ECFP featurizer.
    on_errors
        Error-handling mode forwarded to both featurizers.
    input_type
        Input format forwarded to the ECFP featurizer.
    """

    def __init__(self, n_jobs=-1, on_errors='nan', input_type='smiles'):
        super().__init__()
        self.n_jobs = n_jobs

        # options shared by both featurizers
        common = dict(on_errors=on_errors, return_type='df')

        ecfp = ECFP(n_jobs, input_type=input_type, radius=3, n_bits=2048, counting=True, **common)
        mordred = Mordred2DDescriptor(**common)

        # NOTE(review): both featurizers are deliberately bound to the same
        # attribute name. XenonPy's BaseDescriptor appears to collect featurizers
        # assigned to one name into a single group instead of overwriting the
        # earlier one -- confirm against the XenonPy documentation.
        # ECFP is registered first, then Mordred.
        for featurizer in (ecfp, mordred):
            self.rdkit_fp = featurizer


fp_fcn = CustomDesc()


In [4]:
# calculate descriptors for every entry of the 'SMILES' column using the
# customized descriptor calculator defined above
# NOTE(review): with on_errors='nan', failed calculations presumably end up as
# NaN entries in `fp` -- confirm against the XenonPy documentation
fp = fp_fcn.transform(data['SMILES'])


  4%|█▌                                        | 11/300 [00:02<00:51,  5.58it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 13%|█████▍                                    | 39/300 [00:02<00:10, 24.47it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 37%|███████████████▏                         | 111/300 [00:04<00:05, 32.90it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 47%|███████████████████▍                     | 142/300 [00:05<00:03, 40.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 75%|██████████████████████████████▌          | 224/300 [00:07<00:01, 38.89it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|█████████████████████████████████████████| 300/300 [00:08<00:00, 34.28it/s]


In [5]:
# check NA values in the descriptors (count once, reuse for the mask and the report)
na_counts = fp.isna().sum()
idx_na = na_counts > 0
print(na_counts[idx_na])

# flag descriptors whose value is zero for every sample
idx_0 = (fp == 0).all()

# collect descriptors whose column dtype is neither INT nor FLOAT
# (i.e., columns holding an error message from Mordred instead of numbers).
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
idx_error = [key for key, val in fp.dtypes.items() if not (val == int or val == float)]

# drop NA-containing and all-zero descriptors first, then the non-numeric ones;
# errors='ignore' keeps this working when a non-numeric column was already
# removed by the NA / all-zero mask
fp_filtered = fp.loc[:, ~(idx_na | idx_0)].drop(idx_error, axis=1, errors='ignore')
print(f'Final num. of descriptors = {fp_filtered.shape[1]}')


ETA_alpha         300
AETA_alpha        300
ETA_shape_p       300
ETA_shape_y       300
ETA_shape_x       300
ETA_eta           300
AETA_eta          300
ETA_eta_L         300
AETA_eta_L        300
ETA_eta_F         300
AETA_eta_F        300
ETA_eta_FL        300
AETA_eta_FL       300
ETA_dAlpha_A      300
ETA_dAlpha_B      300
ETA_epsilon_1     300
ETA_epsilon_2     300
ETA_epsilon_4     300
ETA_epsilon_5     300
ETA_dEpsilon_A    300
ETA_dEpsilon_B    300
ETA_dEpsilon_C    300
ETA_dEpsilon_D    300
ETA_psi_1         300
ETA_dPsi_A        300
ETA_dPsi_B        300
VMcGowan          300
apol              300
bpol              300
dtype: int64
Final num. of descriptors = 1929


予測結果のプロット用関数を用意する。

In [6]:
# x-axis: prediction, y-axis: observation
def plot_prediction(x_tr, y_tr, x_te, y_te, dir_file):
    """Save a prediction-vs-observation scatter plot to an image file.

    Parameters
    ----------
    x_tr, x_te
        1D arrays plotted on the x-axis (labelled 'Prediction') for the
        training and test sets.
    y_tr, y_te
        1D arrays plotted on the y-axis (labelled 'Observation') for the
        training and test sets.
    dir_file
        Path of the image file to write.
    """
    # common axis range for the diagonal reference line
    values = np.concatenate([x_tr, x_te, y_tr, y_te])
    xy_min = values.min()
    xy_max = values.max()

    # tick label sizes are taken from rcParams when the axes are created, so
    # they must be set before the figure is drawn to affect this plot
    plt.rc('xtick', labelsize=14)
    plt.rc('ytick', labelsize=14)

    plt.figure(figsize=(5, 5))
    try:
        plt.scatter(x_tr, y_tr, s=20, c='k', marker='x', alpha=0.5, label='Training')
        plt.scatter(x_te, y_te, s=20, c='k', alpha=0.6, label='Test')
        plt.xlabel('Prediction', fontsize=14)
        plt.ylabel('Observation', fontsize=14)
        plt.legend(fontsize=14)
        # y = x reference line
        plt.plot([xy_min, xy_max], [xy_min, xy_max], ls="--", c='k')
        plt.savefig(dir_file, dpi=500, bbox_inches="tight")
    finally:
        # always release the figure, even if drawing or saving fails
        plt.close()
    

データの各物性に対してランダムフォレストモデルを学習させる。

[実行に数分かかる]

In [7]:
%%time

# grids for hyperparameters
n_tree = [50, 100, 200]
max_feat_r = [0.1, 0.3, 0.5, 0.7]  # fractions of the number of descriptors

# directories
dir_plot = 'output/演習13/Results'
dir_mdl = 'output/演習13/Models'
os.makedirs(dir_plot, exist_ok=True)
os.makedirs(dir_mdl, exist_ok=True)

# setup: the same descriptors are used for every property;
# every column except 'SMILES' is treated as a target property
x_all = fp_filtered
targets = data.drop('SMILES', axis=1)
summary = pd.DataFrame(index=targets.columns, columns=['RMSE_train', 'R2_train', 'RMSE_test', 'R2_test'])

# fixing random seed for reproducibility
np.random.seed(202202)

# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent
for prop, y_all in targets.items():
    # split data into training (all) and test
    sp_test = Splitter(len(y_all), test_size=0.2)
    x_train, x_test, y_train, y_test = sp_test.split(x_all, y_all.to_frame())

    # standardize the target values; the scaler is fitted on the training part only
    y_scaler = Scaler().standard()
    y_train_s = y_scaler.fit_transform(y_train)
    y_test_s = y_scaler.transform(y_test)

    # train model with hyperparameter tuning
    # (max_features is given as an absolute number of descriptors)
    max_feat = np.round([x*x_train.shape[1] for x in max_feat_r]).astype(int)
    parameters = {'n_estimators': n_tree, 'max_features': max_feat}
    mdl = GridSearchCV(RFR(), parameters, scoring='neg_mean_squared_error')
    mdl.fit(x_train, y_train_s.values.flatten()) # requires 1D vector for model training

    # save trained model together with everything needed to reuse it
    with open(f'{dir_mdl}/{prop}.pkl', 'wb') as f:
        pk.dump({'model': mdl, 'scaler': y_scaler, 'splitter': sp_test, 'descriptor': x_train.columns.values}, f)

    # make predictions for training and test data (back on the original scale)
    prd_train = y_scaler.inverse_transform(mdl.predict(x_train).reshape(-1,1)).flatten()
    prd_test = y_scaler.inverse_transform(mdl.predict(x_test).reshape(-1,1)).flatten()

    # get performance statistics
    # r2_score is not symmetric in its arguments: observations (y_true) go
    # first, predictions (y_pred) second
    obs_train = y_train.values.flatten()
    obs_test = y_test.values.flatten()
    summary.loc[prop, 'RMSE_train'] = np.sqrt(mean_squared_error(obs_train, prd_train))
    summary.loc[prop, 'R2_train'] = r2_score(obs_train, prd_train)
    summary.loc[prop, 'RMSE_test'] = np.sqrt(mean_squared_error(obs_test, prd_test))
    summary.loc[prop, 'R2_test'] = r2_score(obs_test, prd_test)

    # save plot (x-axis: prediction, y-axis: observation)
    file_name = f'{dir_plot}/{prop}.png'
    plot_prediction(prd_train, obs_train, prd_test, obs_test, file_name)

# save statistics
summary.to_csv(f'{dir_plot}/prediction_summary.csv')


CPU times: user 1min 8s, sys: 357 ms, total: 1min 8s
Wall time: 1min 8s


In [8]:
# save the prediction-vs-observation plot for the property left in `prop`
# (and its predictions) when the training loop above ended
file_name = f'{dir_plot}/{prop}.png'
plot_prediction(prd_train, y_train.values.flatten(), prd_test, y_test.values.flatten(), file_name)
