In [1]:
import numpy as np
import pandas as pd
import feather
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# At prediction time no data from the same compound is available, so strictly speaking
# the folds should be split per compound using GroupKFold.

In [3]:
# Names of the feature groups; each one is used below to build the path of a
# "<name>_train.feather" / "<name>_test.feather" file.
features = [
    "Atom",
    "AtomPosition",
    "AtomDistance",
    "CouplingType",
    "RdkitDescriptors",
    "AtomEnvironment",
    "AtomNeighbors",
]

In [4]:
def _read_feature_set(suffix):
    """Read every feature group file ending in `suffix` and join them column-wise.

    One feather file is read per entry of `features`, from the "../features/"
    directory; the resulting frames are concatenated along axis=1.
    """
    frames = []
    for name in features:
        frames.append(feather.read_dataframe("../features/" + name + suffix))
    return pd.concat(frames, axis=1)


X_train = _read_feature_set("_train.feather")
X_test = _read_feature_set("_test.feather")

In [5]:
# Display the (rows, columns) shapes of the concatenated train and test feature frames.
X_train.shape, X_test.shape

((4658147, 281), (2505542, 281))

In [6]:
# Preview the first rows of the concatenated training features.
X_train.head()

Unnamed: 0,atom_0,atom_1,x_0,x_1,y_0,y_1,z_0,z_1,atom_distance,type,...,a1_neighbor_in_ring4_mean,a1_neighbor_in_ring4_count,a1_neighbor_in_ring5_mean,a1_neighbor_in_ring5_count,a1_neighbor_in_ring6_mean,a1_neighbor_in_ring6_count,a1_neighbor_in_ring7_mean,a1_neighbor_in_ring7_count,a1_neighbor_in_ring8_mean,a1_neighbor_in_ring8_count
0,H,C,0.00215,-0.012698,-0.006031,1.085804,0.001976,0.008001,1.544255,1JHC,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,H,H,0.00215,1.011731,-0.006031,1.463751,0.001976,0.000277,2.521712,2JHH,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,H,H,0.00215,-0.540815,-0.006031,1.447527,0.001976,-0.876644,2.521751,2JHH,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,H,H,0.00215,-0.523814,-0.006031,1.437933,0.001976,0.906397,2.521764,2JHH,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,H,C,1.011731,-0.012698,1.463751,1.085804,0.000277,0.008001,1.544253,1JHC,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Load the scalar coupling contributions table; its fc / sd / pso / dso columns
# are used as regression targets further down.
contri = feather.read_dataframe("../data/input/scalar_coupling_contributions.feather")

In [8]:
# Preview the first rows of the contributions table.
contri.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,fc,sd,pso,dso
0,dsgdb9nsd_000001,1,0,1JHC,83.0224,0.254579,1.25862,0.27201
1,dsgdb9nsd_000001,1,2,2JHH,-11.0347,0.352978,2.85839,-3.4336
2,dsgdb9nsd_000001,1,3,2JHH,-11.0325,0.352944,2.85852,-3.43387
3,dsgdb9nsd_000001,1,4,2JHH,-11.0319,0.352934,2.85855,-3.43393
4,dsgdb9nsd_000001,2,0,1JHC,83.0222,0.254585,1.25861,0.272013


In [9]:
# --- Clean the feature frames and encode categoricals -----------------------
# 'PropertyFunctor' is always NaN, so it is removed from both frames.
X_train = X_train.drop('PropertyFunctor', axis=1)
X_test = X_test.drop('PropertyFunctor', axis=1)

# Train rows containing any NaN are dropped; NaNs in test are replaced with the
# train median (numeric columns) or train mode (categorical columns).
# The null mask of the full frame is computed once and reused for both the
# column-wise and the row-wise reduction.
train_null_mask = X_train.isnull()
nan_cols = list(X_train.columns.values[train_null_mask.any(axis=0)])
rows_complete = ~train_null_mask.any(axis=1)
del train_null_mask  # large boolean frame, no longer needed

# Keep the coupling types and the target table aligned with the filtered X_train.
coupling_types = X_train['type'][rows_complete]
contri = contri[rows_complete]
X_train = X_train[rows_complete]
categorical_cols = list(X_train.columns[X_train.dtypes == object])

for col in nan_cols:
    # X_train no longer contains NaN at this point, so mode/median are taken
    # over complete rows only.
    if col in categorical_cols:
        fill_value = X_train[col].mode()[0]
    else:
        fill_value = X_train[col].median()
    # Assign the filled column back explicitly: a chained
    # `X_test[col].fillna(..., inplace=True)` acts on a temporary selection and
    # does not update X_test when pandas Copy-on-Write is active.
    X_test[col] = X_test[col].fillna(fill_value)

for col in categorical_cols:
    # Fit the encoder on train only, then apply the same mapping to test.
    le = LabelEncoder()
    print(f'Starting {col}')
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

Starting atom_0
Starting atom_1
Starting type
Starting a1_is_aromatic
Starting a1_in_ring
Starting a1_in_ring3
Starting a1_in_ring4
Starting a1_in_ring5
Starting a1_in_ring6
Starting a1_in_ring7
Starting a1_in_ring8


In [11]:
# --- Out-of-fold LightGBM predictions for each coupling contribution --------
# For every target column a model is trained per GroupKFold fold (grouped by
# molecule_name, so a molecule never appears in both train and validation).
# `oof_train` collects the validation-fold predictions for the training rows,
# `oof_test` the test predictions averaged over the folds.

# Settings shared by all targets are defined once, outside the loop.
SEED = 42
NUM_ROUNDS = 10000

params = {
    "num_leaves": 100,
    "min_data_in_leaf": 100,
    "objective": "regression",
    "max_depth": 10,
    "learning_rate": 0.2,
    "boosting_type": "gbdt",
    "subsample_freq": 1,
    "subsample": 0.9,
    "metric": "mae",
    "reg_alpha": 0.1,
    "reg_lambda": 0.3,
    "colsample_bytree": 0.9,
    # Pass SEED explicitly so the row/column subsampling is tied to it rather
    # than to LightGBM's built-in default seeds.
    "seed": SEED,
}

kf = GroupKFold(n_splits=3)

oof_train = pd.DataFrame()
oof_test = pd.DataFrame()

for col in ['fc', 'sd', 'pso', 'dso']:
    oof = np.zeros(len(X_train))
    pred = np.zeros(len(X_test))

    target = contri[col]

    for train_idx, val_idx in kf.split(X_train, groups=contri['molecule_name']):
        train_data = lgb.Dataset(X_train.iloc[train_idx], label=target.iloc[train_idx],
                                 categorical_feature=categorical_cols)
        val_data = lgb.Dataset(X_train.iloc[val_idx], label=target.iloc[val_idx],
                               categorical_feature=categorical_cols)
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of
        # lgb.train in the LightGBM release this notebook was run with; newer releases
        # expect the equivalent callbacks instead - confirm before upgrading.
        clf = lgb.train(params, train_data, NUM_ROUNDS, valid_sets=[train_data, val_data],
                        verbose_eval=1000, early_stopping_rounds=100)
        # Positional assignment: val_idx are row positions within X_train.
        oof[val_idx] = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration)
        # Average the test predictions over the folds.
        pred += clf.predict(X_test, num_iteration=clf.best_iteration) / kf.n_splits

    oof_train[col] = oof
    oof_test[col] = pred

    print(f'{col} done!')



Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.918869	valid_1's l1: 0.974821
[2000]	training's l1: 0.81509	valid_1's l1: 0.91028
[3000]	training's l1: 0.751751	valid_1's l1: 0.879558
[4000]	training's l1: 0.704045	valid_1's l1: 0.860976
[5000]	training's l1: 0.664313	valid_1's l1: 0.847119
[6000]	training's l1: 0.630933	valid_1's l1: 0.83706
[7000]	training's l1: 0.601017	valid_1's l1: 0.828804
[8000]	training's l1: 0.574628	valid_1's l1: 0.822035
[9000]	training's l1: 0.550884	valid_1's l1: 0.816702
[10000]	training's l1: 0.528969	valid_1's l1: 0.811871
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.528969	valid_1's l1: 0.811871
Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.919683	valid_1's l1: 0.971284
[2000]	training's l1: 0.815164	valid_1's l1: 0.906163
[3000]	training's l1: 0.753073	valid_1's l1: 0.876797
[4000]	training's l1: 0.704774	valid_1's l1: 0.857182
[5000]	training's



Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.00984684	valid_1's l1: 0.0104491
[2000]	training's l1: 0.00877	valid_1's l1: 0.00978372
[3000]	training's l1: 0.00808549	valid_1's l1: 0.00943917
[4000]	training's l1: 0.00757717	valid_1's l1: 0.00921769
[5000]	training's l1: 0.00717109	valid_1's l1: 0.00907399
[6000]	training's l1: 0.00681693	valid_1's l1: 0.00895798
[7000]	training's l1: 0.00650564	valid_1's l1: 0.00886467
[8000]	training's l1: 0.00623364	valid_1's l1: 0.00879146
[9000]	training's l1: 0.00597616	valid_1's l1: 0.0087193
[10000]	training's l1: 0.00574673	valid_1's l1: 0.00866101
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.00574673	valid_1's l1: 0.00866101
Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.00992103	valid_1's l1: 0.0105172
[2000]	training's l1: 0.00887564	valid_1's l1: 0.00987564
[3000]	training's l1: 0.0081979	valid_1's l1: 0.00953169
[4000]	training's l



Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.0344358	valid_1's l1: 0.0369588
[2000]	training's l1: 0.0302255	valid_1's l1: 0.0344019
[3000]	training's l1: 0.0276345	valid_1's l1: 0.0331416
[4000]	training's l1: 0.0256775	valid_1's l1: 0.0323211
[5000]	training's l1: 0.0241156	valid_1's l1: 0.0317672
[6000]	training's l1: 0.0227956	valid_1's l1: 0.0313441
[7000]	training's l1: 0.021669	valid_1's l1: 0.0310239
[8000]	training's l1: 0.0206528	valid_1's l1: 0.0307498
[9000]	training's l1: 0.0197312	valid_1's l1: 0.0305226
[10000]	training's l1: 0.0188988	valid_1's l1: 0.0303364
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.0188988	valid_1's l1: 0.0303364
Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.0345528	valid_1's l1: 0.0369387
[2000]	training's l1: 0.030372	valid_1's l1: 0.034415
[3000]	training's l1: 0.0277453	valid_1's l1: 0.0331244
[4000]	training's l1: 0.0258004	valid_1's l



Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.029544	valid_1's l1: 0.0324425
[2000]	training's l1: 0.0253644	valid_1's l1: 0.0300342
[3000]	training's l1: 0.0228093	valid_1's l1: 0.0288385
[4000]	training's l1: 0.0209716	valid_1's l1: 0.0281394
[5000]	training's l1: 0.0194905	valid_1's l1: 0.0276247
[6000]	training's l1: 0.0182605	valid_1's l1: 0.0272322
[7000]	training's l1: 0.0172242	valid_1's l1: 0.0269589
[8000]	training's l1: 0.0163028	valid_1's l1: 0.0267168
[9000]	training's l1: 0.0155087	valid_1's l1: 0.0265414
[10000]	training's l1: 0.0147768	valid_1's l1: 0.0263787
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.0147768	valid_1's l1: 0.0263787
Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.0296131	valid_1's l1: 0.0323109
[2000]	training's l1: 0.0255577	valid_1's l1: 0.0299892
[3000]	training's l1: 0.0229508	valid_1's l1: 0.0287445
[4000]	training's l1: 0.0210964	valid_1's

In [12]:
# Display the shapes of the out-of-fold train predictions and the test predictions.
oof_train.shape, oof_test.shape

((4625786, 4), (2505542, 4))

In [13]:
# Preview the out-of-fold predictions for the four contribution columns.
oof_train.head()

Unnamed: 0,fc,sd,pso,dso
0,82.644941,0.189357,0.856272,0.46597
1,-10.347675,0.363358,2.725744,-2.897002
2,-10.771037,0.372108,2.576495,-2.857371
3,-10.953115,0.368697,2.628033,-2.867212
4,80.794899,0.186046,0.827075,0.469087


In [15]:
# Preview the contribution targets again, for comparison with the out-of-fold predictions.
contri.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,fc,sd,pso,dso
0,dsgdb9nsd_000001,1,0,1JHC,83.0224,0.254579,1.25862,0.27201
1,dsgdb9nsd_000001,1,2,2JHH,-11.0347,0.352978,2.85839,-3.4336
2,dsgdb9nsd_000001,1,3,2JHH,-11.0325,0.352944,2.85852,-3.43387
3,dsgdb9nsd_000001,1,4,2JHH,-11.0319,0.352934,2.85855,-3.43393
4,dsgdb9nsd_000001,2,0,1JHC,83.0222,0.254585,1.25861,0.272013


In [16]:
# Insert NaN at the rows where X_train contains NaN (to match the format of the others).

In [33]:
# Reload the training features from disk and remove the 'PropertyFunctor'
# column, which is always NaN.
train_feature_paths = ["../features/" + feature + "_train.feather" for feature in features]
X_train = pd.concat(
    [feather.read_dataframe(path) for path in train_feature_paths],
    axis=1,
).drop('PropertyFunctor', axis=1)

In [34]:
# Preview the reloaded training features.
X_train.head()

Unnamed: 0,atom_0,atom_1,x_0,x_1,y_0,y_1,z_0,z_1,atom_distance,type,...,a1_neighbor_in_ring4_mean,a1_neighbor_in_ring4_count,a1_neighbor_in_ring5_mean,a1_neighbor_in_ring5_count,a1_neighbor_in_ring6_mean,a1_neighbor_in_ring6_count,a1_neighbor_in_ring7_mean,a1_neighbor_in_ring7_count,a1_neighbor_in_ring8_mean,a1_neighbor_in_ring8_count
0,H,C,0.00215,-0.012698,-0.006031,1.085804,0.001976,0.008001,1.544255,1JHC,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,H,H,0.00215,1.011731,-0.006031,1.463751,0.001976,0.000277,2.521712,2JHH,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,H,H,0.00215,-0.540815,-0.006031,1.447527,0.001976,-0.876644,2.521751,2JHH,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,H,H,0.00215,-0.523814,-0.006031,1.437933,0.001976,0.906397,2.521764,2JHH,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,H,C,1.011731,-0.012698,1.463751,1.085804,0.000277,0.008001,1.544253,1JHC,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Display the shape of the reloaded training features (after dropping 'PropertyFunctor').
X_train.shape

(4658147, 280)

In [36]:
# Index labels of every training row, and of the rows that contain at least one NaN.
all_index = X_train.index
row_has_nan = X_train.isnull().any(axis=1)
nan_index = all_index[row_has_nan]

# The rows without NaN must be exactly the rows that received OOF predictions.
n_complete_rows = len(all_index) - len(nan_index)
assert n_complete_rows == len(oof_train)

In [44]:
# Re-expand the out-of-fold predictions to one row per entry of all_index:
# rows listed in nan_index get NaN in every column, all other rows receive the
# rows of oof_train in their original order.
# This is done with a single masked array assignment instead of a Python loop
# over every row with scalar .iloc reads.
contribution_cols = ['fc', 'sd', 'pso', 'dso']

# True where the row was kept for training (i.e. is not listed in nan_index).
kept_rows = ~all_index.isin(nan_index)
if kept_rows.sum() != len(oof_train):
    raise ValueError(
        f'{kept_rows.sum()} rows without NaN, but oof_train has {len(oof_train)} rows'
    )

oof_values = np.full((len(all_index), len(contribution_cols)), np.nan)
# The first four oof_train columns are taken by position and mapped to
# fc, sd, pso, dso in that order.
oof_values[kept_rows] = oof_train.iloc[:, :len(contribution_cols)].values

new_train_oof = pd.DataFrame(oof_values, columns=contribution_cols)

In [45]:
# Display the shape of the NaN-padded out-of-fold frame.
new_train_oof.shape

(4658147, 4)

In [46]:
# Preview the NaN-padded out-of-fold predictions.
new_train_oof.head()

Unnamed: 0,fc,sd,pso,dso
0,82.644941,0.189357,0.856272,0.46597
1,-10.347675,0.363358,2.725744,-2.897002
2,-10.771037,0.372108,2.576495,-2.857371
3,-10.953115,0.368697,2.628033,-2.867212
4,80.794899,0.186046,0.827075,0.469087


In [47]:
# Write the NaN-padded train out-of-fold predictions and the test predictions
# to feather files in the features directory.
new_train_oof.to_feather('../features/ScalarCouplingContributionsOof_train.feather')
oof_test.to_feather('../features/ScalarCouplingContributionsOof_test.feather')