In [36]:
import pandas as pd

# CSV読み込み
df = pd.read_csv("modified_molecules.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SMILES,SMILES_mody,Unnamed: 3
0,279.0,C=CC(=O)OCc1ccccc1,C=CC(=O)OCc1ccccc1,Poly(benzyl acrylate)
1,383.0,C=CC(=O)Oc2ccc(c1ccccc1)cc2,C=CC(=O)Oc2ccc(c1ccccc1)cc2,Poly(4-biphenyl acrylate)
2,219.0,CCCCOC(=O)C=C,CCCCOC(=O)C=C,Poly(butyl acrylate)
3,250.0,CC(OC(=O)C=C)CC,CC(OC(=O)C=C)CC,Poly(sec-butyl acrylate)
4,345.0,C=CC(=O)Oc1ccccc1C(C)(C)C,C=CC(=O)Oc1ccccc1C(C)(C)C,Poly(2-tertbutylphenyl acrylate)


In [37]:
# カラムの名前変更
df = df.rename(columns={'Unnamed: 0': 'Tg', 'Unnamed: 3':'polymer_name'})

In [38]:
# 前処理前shape
df.shape

(392, 4)

In [39]:
# NaN, 空白消去
df = df.dropna()
df.shape

(352, 4)

In [40]:
# ユニークなSMILESの数(？！)
len(df["SMILES"].unique())

225

In [41]:
# ユニークなpolymer_nameの数(？！)
len(df["polymer_name"].unique())

235

In [42]:
# polymer_nameを全部小文字にする(表記ブレを防止したかった)
df["polymer_name"] = [s.lower() for s in df["polymer_name"]]

In [43]:
# あまり減らない...
len(df["polymer_name"].unique())

234

In [44]:
# 重複行を削除
df = (df.drop_duplicates())

In [45]:
# 前処理後shape
df.shape

(235, 4)

In [46]:
# ライブラリ読み込み
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import Lasso
import numpy as np

In [47]:
# SMILESをRdKitのMOLオブジェクトに変換
mols = []
for i in df["SMILES"]:
    mol = Chem.MolFromSmiles(i)
    mols.append(mol)

In [48]:
# フィンガープリントを生成: 半径20のMorganフィンガープリント
fps = [AllChem.GetMorganFingerprintAsBitVect(m, 20) for m in mols]

In [49]:
# RDKitの明示的なベクトルをNumpyのアレイに変換
np_fps = []
for fp in fps:
  arr = np.zeros((1,))
  DataStructs.ConvertToNumpyArray(fp, arr)
  np_fps.append(arr)

In [50]:
# データ分割
from sklearn.model_selection import train_test_split

X = np_fps
Y = df["Tg"]
(x_train, x_test,
 y_train, y_test) = train_test_split(X, Y, train_size=0.9,test_size=0.1, random_state=0)

In [51]:
# 100個のツリーをもつランダムフォレスト分類器を手に入れます
rf = RandomForestRegressor(n_estimators=100, random_state=0)
# ランダムフォレストを訓練
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [52]:
# ランダムフォレストの決定係数
from sklearn.metrics import r2_score
y_pred = rf.predict(x_test)
r2_score(y_test, y_pred)

0.7020715513431172

In [53]:
# SVCを訓練
svc = LinearSVR()
svc.fit(x_train,y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [54]:
# SVRの決定係数
y_pred = svc.predict(x_test)
r2_score(y_test, y_pred)

-0.09558595612933285

In [55]:
# Lassoを訓練
lasso = Lasso(alpha=0.01)
lasso.fit(x_train, y_train)

  positive)


Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [56]:
# Lassoの決定係数
y_pred = lasso.predict(x_test)
r2_score(y_test, y_pred)

0.8881898852765312