## Modeling
- data preparation for modeling
- design machine learning model

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore')

In [2]:
from glob import glob
from tqdm import tqdm

In [3]:
path = 'data/'

sample = pd.read_csv(path + 'sample_submission.csv')
train = pd.read_csv(path + 'train_set.ReorgE.csv')
test = pd.read_csv(path + 'test_set.csv')

In [4]:
train

Unnamed: 0,index,SMILES,Reorg_g,Reorg_ex
0,train_0,CC[C@H]1CCCCN1C(=O)[C@@H](C)OC(=O)c1c(C)oc(-n2...,0.631486,0.535060
1,train_1,O[C@@H](CNC1CC1)CN1CCc2sccc2C1,0.825901,1.116781
2,train_2,N#CCCNC(=O)[C@@]1(O)CCSC1,1.463943,0.964848
3,train_3,COC[C@H]1CN(c2ccc(OCC[C@@H](C)O)cc2)C(=O)O1,0.166669,0.161458
4,train_4,N#Cc1c(-c2ccccc2OCC(N)=O)[nH]c(C(N)=O)c1N,0.313820,0.338862
...,...,...,...,...
18152,train_18152,CC(=O)Nc1ccc2ccc3cccc4ccc1c2c34,0.146917,0.143084
18153,train_18153,CC(C)(C)c1ccccc1N(c1ccccc1)c1ccc(S(=O)(=O)c2cc...,0.612898,0.500668
18154,train_18154,CN(C)c1ccc(C(=O)Nc2ccccc2)cc1,1.218777,1.048954
18155,train_18155,c1ccc(N(c2ccccc2)c2ccc(-c3ncc(-c4ccc(-c5cnc(-c...,0.145292,0.182589


### Preparing Data for Modeling
- reorganize train dataset
    - define feature and label
    - train test split

In [5]:
train_g = pd.read_csv('train_g.csv')
train_ex = pd.read_csv('train_ex.csv')

In [6]:
train_g['Reorg'] = train['Reorg_g']
train_ex['Reorg'] = train['Reorg_ex']

In [7]:
train_g.head()

Unnamed: 0,dist,at_n,at_w,el_neg,ion_en,el_aff,mp,bp,den,len,bond_en,Reorg
0,7.421214,3.849057,7.234868,2.479811,12.726113,1.02434,1528.964906,1644.488302,0.898466,1.274554,416.872727,0.631486
1,4.567176,3.675676,6.821243,2.412162,12.740162,1.001057,1366.06,1476.818108,0.852533,1.289784,384.307692,0.825901
2,7.972815,4.24,8.01064,2.4936,12.7968,1.050043,1254.924,1363.2092,0.808378,1.298106,409.16,1.463943
3,9.343431,3.761905,7.031786,2.492619,12.787667,1.026439,1380.237143,1486.291429,0.809888,1.263025,406.348837,0.166669
4,6.102327,4.457143,8.551143,2.566286,12.798229,1.062233,1548.010286,1665.515429,0.907135,1.263479,435.333333,0.31382


In [8]:
train_ex.head()

Unnamed: 0,dist,at_n,at_w,el_neg,ion_en,el_aff,mp,bp,den,len,bond_en,Reorg
0,7.419384,3.849057,7.234868,2.479811,12.726113,1.02434,1528.964906,1644.488302,0.898466,1.277462,416.872727,0.53506
1,4.48008,3.675676,6.821243,2.412162,12.740162,1.001057,1366.06,1476.818108,0.852533,1.292906,384.307692,1.116781
2,7.975893,4.24,8.01064,2.4936,12.7968,1.050043,1254.924,1363.2092,0.808378,1.303473,409.16,0.964848
3,9.34453,3.761905,7.031786,2.492619,12.787667,1.026439,1380.237143,1486.291429,0.809888,1.265426,406.348837,0.161458
4,6.113824,4.457143,8.551143,2.566286,12.798229,1.062233,1548.010286,1665.515429,0.907135,1.265641,435.333333,0.338862


In [9]:
train_set = pd.concat([train_g, train_ex], ignore_index=True)
train_set

Unnamed: 0,dist,at_n,at_w,el_neg,ion_en,el_aff,mp,bp,den,len,bond_en,Reorg
0,7.421214,3.849057,7.234868,2.479811,12.726113,1.024340,1528.964906,1644.488302,0.898466,1.274554,416.872727,0.631486
1,4.567176,3.675676,6.821243,2.412162,12.740162,1.001057,1366.060000,1476.818108,0.852533,1.289784,384.307692,0.825901
2,7.972815,4.240000,8.010640,2.493600,12.796800,1.050043,1254.924000,1363.209200,0.808378,1.298106,409.160000,1.463943
3,9.343431,3.761905,7.031786,2.492619,12.787667,1.026439,1380.237143,1486.291429,0.809888,1.263025,406.348837,0.166669
4,6.102327,4.457143,8.551143,2.566286,12.798229,1.062233,1548.010286,1665.515429,0.907135,1.263479,435.333333,0.313820
...,...,...,...,...,...,...,...,...,...,...,...,...
36309,6.685352,4.121212,7.857818,2.453939,12.351697,1.062406,2094.273939,2248.339394,1.236662,1.295104,435.972222,0.143084
36310,6.538838,4.018868,7.595132,2.434906,12.452491,1.051346,1820.125849,1960.608113,1.108458,1.312676,424.123810,0.500668
36311,6.649195,3.764706,7.067824,2.440294,12.622176,1.014688,1698.430000,1824.688235,1.000305,1.263567,425.971429,1.048954
36312,13.415828,4.517647,8.670588,2.442353,12.262494,1.092062,2090.795294,2254.348824,1.299893,1.318533,426.408602,0.182589


In [10]:
train_set.to_csv('train_set.csv', index=False)

#### define feature and label

In [11]:
train_set = pd.read_csv('train_set.csv')

In [12]:
train_set.columns

Index(['dist', 'at_n', 'at_w', 'el_neg', 'ion_en', 'el_aff', 'mp', 'bp', 'den',
       'len', 'bond_en', 'Reorg'],
      dtype='object')

In [8]:
features = ['dist', 'at_n', 'at_w', 'el_neg', 'ion_en',
            'el_aff', 'mp', 'bp', 'den', 'len', 'bond_en']
label = 'Reorg'

#### train test split
- systematic sampling

In [14]:
train_df = train_set[0::2].reset_index()
test_df = train_set[1::2].reset_index()

In [15]:
X_train, y_train = train_df[features], train_df[label]
X_test, y_test = test_df[features], test_df[label]

### Modeling and Evaluation

In [6]:
from sklearn.ensemble import RandomForestRegressor as rf

In [17]:
model = rf(n_jobs=-1)
model.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1)

In [18]:
model.score(X_train, y_train), model.score(X_test, y_test)

(0.8801287164244126, 0.5375262692158845)

#### log transformation

In [19]:
train_set['Reorg'] = np.log(train_set['Reorg'])

In [20]:
train_df = train_set[0::2].reset_index()
test_df = train_set[1::2].reset_index()

In [22]:
X_train, y_train = train_df[features], train_df[label]
X_test, y_test = test_df[features], test_df[label]

In [23]:
log_m = rf(n_jobs=-1)
log_m.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1)

In [24]:
log_m.score(X_train, y_train), log_m.score(X_test, y_test)

(0.8811399177784986, 0.5697468009809621)

## Submission
- prepare test_set
- modeling and evaluation

### Preparing test_set

In [25]:
test_g = pd.read_csv('test_g.csv')
test_ex = pd.read_csv('test_ex.csv')

In [26]:
test_set = pd.concat([test_g, test_ex], ignore_index=True)

In [27]:
test_set.to_csv('test_set.csv', index=False)

### Modeling and Evaluation

In [28]:
train_set = pd.read_csv('train_set.csv')
test_set = pd.read_csv('test_set.csv')

In [29]:
train_set.head()

Unnamed: 0,dist,at_n,at_w,el_neg,ion_en,el_aff,mp,bp,den,len,bond_en,Reorg
0,7.421214,3.849057,7.234868,2.479811,12.726113,1.02434,1528.964906,1644.488302,0.898466,1.274554,416.872727,0.631486
1,4.567176,3.675676,6.821243,2.412162,12.740162,1.001057,1366.06,1476.818108,0.852533,1.289784,384.307692,0.825901
2,7.972815,4.24,8.01064,2.4936,12.7968,1.050043,1254.924,1363.2092,0.808378,1.298106,409.16,1.463943
3,9.343431,3.761905,7.031786,2.492619,12.787667,1.026439,1380.237143,1486.291429,0.809888,1.263025,406.348837,0.166669
4,6.102327,4.457143,8.551143,2.566286,12.798229,1.062233,1548.010286,1665.515429,0.907135,1.263479,435.333333,0.31382


In [30]:
test_set.head()

Unnamed: 0,dist,at_n,at_w,el_neg,ion_en,el_aff,mp,bp,den,len,bond_en
0,7.426456,4.44,8.570584,2.4898,12.45606,1.19658,2106.6196,2296.4882,1.270908,1.330743,436.148148
1,4.764177,4.25,8.091417,2.56,12.976333,1.0283,1295.1425,1394.86,0.756032,1.265567,412.84
2,6.825332,4.139535,7.847698,2.45093,12.555395,1.046525,1710.036279,1843.89186,1.049928,1.299365,410.782609
3,16.311959,3.294118,6.038225,2.394118,12.691098,0.983584,1509.683922,1622.687451,0.889138,1.270826,406.127451
4,10.627192,4.439024,8.475951,2.418293,12.265366,1.087375,1984.332683,2144.283415,1.262043,1.328075,423.227273


In [31]:
X_train, y_train = train_set[features], train_set[label]
X_test = test_set

In [32]:
model = rf(n_jobs=-1)
model.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1)

In [33]:
pred = model.predict(X_test)

In [34]:
sample['Reorg_g'] = pred[:457]
sample['Reorg_ex'] = pred[457:]

In [35]:
sample.to_csv('submission_2.csv', index=False)

#### log transformed model

In [21]:
sample = pd.read_csv(path + 'sample_submission.csv')
train_set = pd.read_csv('train_set.csv')
test_set = pd.read_csv('test_set.csv')

In [22]:
X_train, y_train = train_set[features], np.log1p(train_set[label])
X_test = test_set

In [23]:
model = rf(n_jobs=-1)
model.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1)

In [24]:
pred = np.expm1(model.predict(X_test))

In [25]:
sample['Reorg_g'] = pred[:457]
sample['Reorg_ex'] = pred[457:]

In [26]:
sample.to_csv('submission_3.csv', index=False)