### make meta features using Mulleken charge data only available for train set

In [1]:
import sys
sys.path.insert(0, "/home/kohei3/anaconda3/envs/tensorflow/lib/python3.6/site-packages/")

import numpy as np
import pandas as pd
import feather
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm

In [2]:
features = [
        "Atom",
        "AtomPosition",
        "AtomDistance",
        "CouplingType",
        "AtomEnvironment",
        "AtomNeighbors",
        "BruteForce",
        "ScalarCouplingContributionsOof2",
        "DistanceFromClosest",
        "ElectroNegFromClosest",
        "ACSF"]

In [3]:
X_train = pd.concat([feather.read_dataframe("../features/" + feature + "_train.feather") for feature in features], axis=1)
X_test = pd.concat([feather.read_dataframe("../features/" + feature + "_test.feather") for feature in features], axis=1)

### a0, a1のみの原子ごとの特徴のみを取り出して電荷予測の特徴として用いる

In [4]:
train = feather.read_dataframe('../data/input/train.feather')
test = feather.read_dataframe('../data/input/test.feather')

In [5]:
train.shape, test.shape

((4658147, 6), (2505542, 5))

In [6]:
X_train.shape, X_test.shape

((4658147, 435), (2505542, 435))

In [7]:
X_train = pd.concat([X_train, train], axis=1)
X_test = pd.concat([X_test, test], axis=1)

In [8]:
struc = feather.read_dataframe('../data/input/structures.feather')
struc.head(20)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602


In [9]:
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095


In [10]:
############somethigs wrong
def map_atom_info(df, atom_idx):
    df = pd.merge(df, struc, how='left', left_on=['molecule_name', f'atom_index_{atom_idx}'],
                  right_on=['molecule_name', 'atom_index'])
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}'})
    return df

train = map_atom_info(train, 0)

In [11]:
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x,y,z
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541,H,1.011731,1.463751,0.000277
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548,H,1.011731,1.463751,0.000277
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,H,-0.540815,1.447527,-0.876644
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543,H,-0.540815,1.447527,-0.876644
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,H,-0.523814,1.437933,0.906397


In [12]:
train = map_atom_info(train, 1)

In [13]:
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_x,y_x,z_x,atom_1,x_y,y_y,z_y
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541,H,1.011731,1.463751,0.000277,H,-0.540815,1.447527,-0.876644
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548,H,1.011731,1.463751,0.000277,H,-0.523814,1.437933,0.906397
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,H,-0.540815,1.447527,-0.876644,C,-0.012698,1.085804,0.008001
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543,H,-0.540815,1.447527,-0.876644,H,-0.523814,1.437933,0.906397
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,H,-0.523814,1.437933,0.906397,C,-0.012698,1.085804,0.008001


In [14]:
X_train

Unnamed: 0,atom_0,atom_1,x_0,x_1,y_0,y_1,z_0,z_1,atom_distance,type,...,acsf_121_a1,acsf_122_a1,acsf_123_a1,acsf_124_a1,id,molecule_name,atom_index_0,atom_index_1,type.1,scalar_coupling_constant
0,H,C,0.002150,-0.012698,-0.006031,1.085804,0.001976,0.008001,1.544255,1JHC,...,0.0,0.0,0.0,0.0,0,dsgdb9nsd_000001,1,0,1JHC,84.807600
1,H,H,0.002150,1.011731,-0.006031,1.463751,0.001976,0.000277,2.521712,2JHH,...,0.0,0.0,0.0,0.0,1,dsgdb9nsd_000001,1,2,2JHH,-11.257000
2,H,H,0.002150,-0.540815,-0.006031,1.447527,0.001976,-0.876644,2.521751,2JHH,...,0.0,0.0,0.0,0.0,2,dsgdb9nsd_000001,1,3,2JHH,-11.254800
3,H,H,0.002150,-0.523814,-0.006031,1.437933,0.001976,0.906397,2.521764,2JHH,...,0.0,0.0,0.0,0.0,3,dsgdb9nsd_000001,1,4,2JHH,-11.254300
4,H,C,1.011731,-0.012698,1.463751,1.085804,0.000277,0.008001,1.544253,1JHC,...,0.0,0.0,0.0,0.0,4,dsgdb9nsd_000001,2,0,1JHC,84.807400
5,H,H,1.011731,-0.540815,1.463751,1.447527,0.000277,-0.876644,2.521766,2JHH,...,0.0,0.0,0.0,0.0,5,dsgdb9nsd_000001,2,3,2JHH,-11.254100
6,H,H,1.011731,-0.523814,1.463751,1.437933,0.000277,0.906397,2.521753,2JHH,...,0.0,0.0,0.0,0.0,6,dsgdb9nsd_000001,2,4,2JHH,-11.254800
7,H,C,-0.540815,-0.012698,1.447527,1.085804,-0.876644,0.008001,1.544245,1JHC,...,0.0,0.0,0.0,0.0,7,dsgdb9nsd_000001,3,0,1JHC,84.809300
8,H,H,-0.540815,-0.523814,1.447527,1.437933,-0.876644,0.906397,2.521752,2JHH,...,0.0,0.0,0.0,0.0,8,dsgdb9nsd_000001,3,4,2JHH,-11.254300
9,H,C,-0.523814,-0.012698,1.437933,1.085804,0.906397,0.008001,1.544247,1JHC,...,0.0,0.0,0.0,0.0,9,dsgdb9nsd_000001,4,0,1JHC,84.809500


In [15]:
a0_cols = [col for col in X_train.columns if (col in ['atom_0', 'x_0', 'y_0', 'z_0'])
                                       or ('a0_neighbor' in col)
                                       or ('dist_from' in col and 'a0' in col)
                                       or ('en_from' in col and 'a0' in col)
                                       or ('acsf' in col and 'a0' in col)]

a1_cols = [col for col in X_train.columns if (col in ['atom_1', 'x_1', 'y_1', 'z_1'])
                                       or ('a1_neighbor' in col)
                                       or ('dist_from' in col and 'a1' in col)
                                       or ('en_from' in col and 'a1' in col)
                                       or ('acsf' in col and 'a1' in col)]

assert len(a0_cols) == len(a1_cols)

In [16]:
mulliken = feather.read_dataframe("../data/input/mulliken_charges.feather")
mulliken.shape

(1533537, 3)

In [17]:
mulliken.head(20)

Unnamed: 0,molecule_name,atom_index,mulliken_charge
0,dsgdb9nsd_000001,0,-0.535689
1,dsgdb9nsd_000001,1,0.133921
2,dsgdb9nsd_000001,2,0.133922
3,dsgdb9nsd_000001,3,0.133923
4,dsgdb9nsd_000001,4,0.133923
5,dsgdb9nsd_000002,0,-0.707143
6,dsgdb9nsd_000002,1,0.235712
7,dsgdb9nsd_000002,2,0.235712
8,dsgdb9nsd_000002,3,0.23572
9,dsgdb9nsd_000003,0,-0.589706


In [18]:
X_mulliken_a0 = X_train.groupby(['molecule_name', 'atom_index_0']).first()[a0_cols].reset_index()

In [19]:
X_mulliken_a0.columns = ['molecule_name', 'atom_index'] + [f'col_{i}' for i in range(180)]

In [20]:
X_mulliken_a1 = X_train.groupby(['molecule_name', 'atom_index_1']).first()[a1_cols].reset_index()
X_mulliken_a1.columns = ['molecule_name', 'atom_index'] + [f'col_{i}' for i in range(180)]

In [21]:
X_mulliken = pd.concat([X_mulliken_a0, X_mulliken_a1], axis=0).drop_duplicates(keep='first')

In [22]:
X_mulliken.shape

(1405126, 182)

In [23]:
X_mulliken.head(20)

Unnamed: 0,molecule_name,atom_index,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,...,col_170,col_171,col_172,col_173,col_174,col_175,col_176,col_177,col_178,col_179
0,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,dsgdb9nsd_000003,1,H,0.064766,0.020572,0.001535,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,dsgdb9nsd_000005,2,H,-0.027803,2.198949,0.014154,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,dsgdb9nsd_000007,2,H,0.994873,1.939743,0.002941,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
X_mulliken[X_mulliken['molecule_name'] == 'dsgdb9nsd_000001']

Unnamed: 0,molecule_name,atom_index,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,...,col_170,col_171,col_172,col_173,col_174,col_175,col_176,col_177,col_178,col_179
0,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
mulliken = mulliken.merge(X_mulliken, on=['molecule_name', 'atom_index'], how='left')

In [26]:
mulliken.head(30)

Unnamed: 0,molecule_name,atom_index,mulliken_charge,col_0,col_1,col_2,col_3,col_4,col_5,col_6,...,col_170,col_171,col_172,col_173,col_174,col_175,col_176,col_177,col_178,col_179
0,dsgdb9nsd_000001,0,-0.535689,C,-0.012698,1.085804,0.008001,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dsgdb9nsd_000001,1,0.133921,H,0.00215,-0.006031,0.001976,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dsgdb9nsd_000001,2,0.133922,H,1.011731,1.463751,0.000277,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dsgdb9nsd_000001,3,0.133923,H,-0.540815,1.447527,-0.876644,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dsgdb9nsd_000001,4,0.133923,H,-0.523814,1.437933,0.906397,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,dsgdb9nsd_000002,0,-0.707143,N,-0.040426,1.024108,0.062564,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,dsgdb9nsd_000002,1,0.235712,H,0.017257,0.012545,-0.027377,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,dsgdb9nsd_000002,2,0.235712,H,0.915789,1.358745,-0.028758,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,dsgdb9nsd_000002,3,0.23572,H,-0.520278,1.343532,-0.775543,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,dsgdb9nsd_000003,0,-0.589706,,,,,,,,...,,,,,,,,,,


In [27]:
#全部NaNのはそもそもtrainに出てこない原子なので落とす
drop_cols = mulliken['col_0'].isnull() 
mulliken = mulliken[~drop_cols]
mulliken.shape

(1405126, 183)

In [28]:
X_mulliken_a0_te = X_test.groupby(['molecule_name', 'atom_index_0']).first()[a0_cols].reset_index()
X_mulliken_a0_te.columns = ['molecule_name', 'atom_index'] + [f'col_{i}' for i in range(180)]
X_mulliken_a1_te = X_test.groupby(['molecule_name', 'atom_index_1']).first()[a1_cols].reset_index()
X_mulliken_a1_te.columns = ['molecule_name', 'atom_index'] + [f'col_{i}' for i in range(180)]
X_mulliken_te = pd.concat([X_mulliken_a0_te, X_mulliken_a1_te], axis=0).drop_duplicates(keep='first')

X_mulliken_te.head(30)

Unnamed: 0,molecule_name,atom_index,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,...,col_170,col_171,col_172,col_173,col_174,col_175,col_176,col_177,col_178,col_179
0,dsgdb9nsd_000004,2,H,-1.661639,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dsgdb9nsd_000004,3,H,1.661639,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dsgdb9nsd_000015,3,H,1.005284,1.810158,0.004656,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dsgdb9nsd_000015,4,H,-0.546896,1.793435,-0.872511,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dsgdb9nsd_000015,5,H,-0.530029,1.72292,0.911017,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,dsgdb9nsd_000015,6,H,0.139938,-0.255993,-2.050984,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,dsgdb9nsd_000015,7,H,1.692653,-0.238684,-1.174777,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,dsgdb9nsd_000015,8,H,0.599594,-1.641802,-1.024076,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,dsgdb9nsd_000016,3,H,-0.305415,2.017021,0.925332,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,dsgdb9nsd_000016,4,H,-0.322755,2.026802,-0.893478,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
mulliken.shape

(1405126, 183)

In [30]:
X_mulliken_te.shape

(756113, 182)

In [31]:
#mulliken['col_0'].fillna('H', inplace=True)
#X_mulliken_te['col_0'].fillna('H', inplace=True)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
mulliken['col_0'] = le.fit_transform(mulliken['col_0'])
X_mulliken_te['col_0'] = le.transform(X_mulliken_te['col_0'])

In [32]:
train_mulliken_oof = np.zeros(len(mulliken))
test_mulliken_pred = np.zeros(len(X_mulliken_te))


kf = GroupKFold(n_splits=2)

target = mulliken['mulliken_charge']
mulliken.drop('mulliken_charge', axis=1, inplace=True)

cols = [col for col in mulliken.columns if 'col_' in col]

SEED = 42
NUM_ROUNDS = 10000

params = {
        "num_leaves": 90,
        "min_data_in_leaf": 100,
        "objective": "regression",
        "max_depth": 8,
        "learning_rate": 0.2,
        "boosting_type": "gbdt",
        "subsample_freq": 1,
        "subsample": 0.9,
        "metric": "mae",
        "reg_alpha": 0.1,
        "reg_lambda": 0.3, 
        "colsample_bytree": 0.9
        }

for train_idx, val_idx in kf.split(mulliken, groups=mulliken['molecule_name']):
        train_data = lgb.Dataset(mulliken[cols].iloc[train_idx], label=target.iloc[train_idx])
        val_data = lgb.Dataset(mulliken[cols].iloc[val_idx], label=target.iloc[val_idx])
        clf = lgb.train(params, train_data, NUM_ROUNDS, valid_sets=[train_data, val_data],
                        verbose_eval=1000, early_stopping_rounds=100)
        train_mulliken_oof[val_idx] = clf.predict(mulliken[cols].iloc[val_idx], num_iteration=clf.best_iteration)
        test_mulliken_pred += clf.predict(X_mulliken_te[cols], num_iteration=clf.best_iteration) / kf.n_splits

Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.00501168	valid_1's l1: 0.00650204
[2000]	training's l1: 0.00376953	valid_1's l1: 0.00590568
[3000]	training's l1: 0.00307828	valid_1's l1: 0.00563997
[4000]	training's l1: 0.00262009	valid_1's l1: 0.00548653
[5000]	training's l1: 0.00228892	valid_1's l1: 0.00538888
[6000]	training's l1: 0.00203765	valid_1's l1: 0.00532187
[7000]	training's l1: 0.00183723	valid_1's l1: 0.00527154
[8000]	training's l1: 0.00167604	valid_1's l1: 0.00523526
[9000]	training's l1: 0.00154229	valid_1's l1: 0.00520716
[10000]	training's l1: 0.00143053	valid_1's l1: 0.00518477
Did not meet early stopping. Best iteration is:
[10000]	training's l1: 0.00143053	valid_1's l1: 0.00518477
Training until validation scores don't improve for 100 rounds.
[1000]	training's l1: 0.00502161	valid_1's l1: 0.00650142
[2000]	training's l1: 0.00376686	valid_1's l1: 0.00589065
[3000]	training's l1: 0.00307813	valid_1's l1: 0.00562384
[4000]	train

In [33]:
mulliken_train = mulliken[['molecule_name', 'atom_index']]
mulliken_train['mulliken_pred'] = train_mulliken_oof
mulliken_test = X_mulliken_te[['molecule_name', 'atom_index']]
mulliken_test['mulliken_pred'] = test_mulliken_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [34]:
mulliken_train.head()

Unnamed: 0,molecule_name,atom_index,mulliken_pred
0,dsgdb9nsd_000001,0,-0.302682
1,dsgdb9nsd_000001,1,0.078548
2,dsgdb9nsd_000001,2,0.078675
3,dsgdb9nsd_000001,3,0.07577
4,dsgdb9nsd_000001,4,0.075893


In [35]:
mulliken_test.head()

Unnamed: 0,molecule_name,atom_index,mulliken_pred
0,dsgdb9nsd_000004,2,0.165332
1,dsgdb9nsd_000004,3,0.168108
2,dsgdb9nsd_000015,3,0.107479
3,dsgdb9nsd_000015,4,0.107
4,dsgdb9nsd_000015,5,0.128567


In [36]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_x,y_x,z_x,atom_1,x_y,y_y,z_y
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001


In [37]:

train = train.merge(mulliken_train, left_on=['molecule_name', 'atom_index_0'], right_on=['molecule_name', 'atom_index'], how='left')
train = train.merge(mulliken_train, left_on=['molecule_name', 'atom_index_1'], right_on=['molecule_name', 'atom_index'], how='left', suffixes=['_a0', '_a1'])

test = test.merge(mulliken_test, left_on=['molecule_name', 'atom_index_0'], right_on=['molecule_name', 'atom_index'], how='left')
test = test.merge(mulliken_test, left_on=['molecule_name', 'atom_index_1'], right_on=['molecule_name', 'atom_index'], how='left', suffixes=['_a0', '_a1'])


In [38]:
train[['mulliken_pred_a0', 'mulliken_pred_a1']].to_feather('../features/MullikenChargePred_train.feather')
test[['mulliken_pred_a0', 'mulliken_pred_a1']].to_feather('../features/MullikenChargePred_test.feather')