In [1]:
import pandas as pd
import numpy as np
import setting
from lightgbm import LGBMRegressor
import gc

gc.enable()

DATA = '~/Data/Molecular'

data = pd.concat([
    pd.read_pickle(f"{DATA}/basic.gz"),
    #pd.read_pickle(f"{DATA}/angle_feature.gz"),
    #pd.read_pickle(f"{DATA}/criskiev_distance_feature.gz"),
    #pd.read_pickle(f"{DATA}/qm9.gz")
], axis = 1)

data.drop(columns=[
    'id',
    'molecule_name',
    'atom_index_0',
    'atom_index_1',
    'type',
    'atom_0',
    'x_0',
    'y_0',
    'z_0',
    'atom_1',
    'x_1',
    'y_1',
    'z_1'
], inplace=True)
tdata = data.iloc[4658147:, :]
tdata = tdata.reset_index(drop=True)
data = data.iloc[:4658147, :]



In [3]:
train = pd.read_csv(f"{DATA}/train.csv", dtype={
    'molecule_name': 'category',
    'atom_index_0': 'int8',
    'atom_index_1': 'int8',
    'type': 'category',
    'scalar_coupling_constant': 'float32'
}, usecols=['molecule_name', 'type', 'scalar_coupling_constant'])
test = pd.read_csv(f"{DATA}/test.csv", dtype={
    'molecule_name': 'category',
    'atom_index_0': 'int8',
    'atom_index_1': 'int8',
    'type': 'category'
}, usecols=['type'])
y = train.scalar_coupling_constant

In [24]:
submission = pd.read_csv(f"{DATA}/sample_submission.csv")

In [31]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

for bond in pd.unique(train['type']):
    X = data[train['type'] == bond]
    Y = y[train['type'] == bond]
    y_pred = np.zeros(tdata.shape[0], dtype='float32')
    gkf = GroupKFold(2)
    for i, (it, iv) in enumerate(gkf.split(X, Y, groups=train[train['type']==bond].molecule_name)):
        lgbm = LGBMRegressor(objective='regression_l1', n_estimators=10000, learning_rate=0.1, subsample_freq=1, \
                         feature_fraction=0.7, subsample=0.7, reg_alpha=0.1, reg_lambda=0.3, device_type='gpu',
                        **setting.param['1JHC'])
        lgbm.fit(
            X.iloc[it],
            Y.iloc[it],
            eval_set=[(X.iloc[it], Y.iloc[it]), (X.iloc[iv], Y.iloc[iv])],
            eval_metric='regression_l1',
            verbose=100,
            early_stopping_rounds=200
        )
        print(f"In fold {i}, a model training stopped at iteration {lgbm.best_iteration_} with score {lgbm.best_score_['valid_1']}. Dumping model as {bond}_fold{i}.lightgbm")
        lgbm.booster_.save_model(f"{bond}_fold{i}.lightgbm")
        y_pred += lgbm.predict(tdata[test['type'] == bond])
    y_pred /= 2
    submission.loc[test['type'] == bond, 'scalar_coupling_constant'] = y_pred

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's l1: 1.29481	valid_1's l1: 1.54186
[200]	valid_0's l1: 1.09396	valid_1's l1: 1.46876
[300]	valid_0's l1: 0.986395	valid_1's l1: 1.44148
[400]	valid_0's l1: 0.9104	valid_1's l1: 1.42368
[500]	valid_0's l1: 0.855832	valid_1's l1: 1.41157
[600]	valid_0's l1: 0.816453	valid_1's l1: 1.40404
[700]	valid_0's l1: 0.785048	valid_1's l1: 1.39858
[800]	valid_0's l1: 0.759677	valid_1's l1: 1.39467
[900]	valid_0's l1: 0.737345	valid_1's l1: 1.39049
[1000]	valid_0's l1: 0.718415	valid_1's l1: 1.38737
[1100]	valid_0's l1: 0.702783	valid_1's l1: 1.38489
[1200]	valid_0's l1: 0.688465	valid_1's l1: 1.3826
[1300]	valid_0's l1: 0.675825	valid_1's l1: 1.38064
[1400]	valid_0's l1: 0.664991	valid_1's l1: 1.37912
[1500]	valid_0's l1: 0.65475	valid_1's l1: 1.37774
[1600]	valid_0's l1: 0.645104	valid_1's l1: 1.3762
[1700]	valid_0's l1: 0.63655	valid_1's l1: 1.37495
[1800]	valid_0's l1: 0.628379	valid_1's l1: 1.37369
[1900]	valid_0's l

ValueError: operands could not be broadcast together with shapes (2505542,) (380609,) (2505542,) 

In [30]:
X.iloc[it]

Unnamed: 0,dist_x,dist_y,dist_z,dist,dist_x_molecule_mean,dist_x_molecule_std,dist_x_molecule_min,dist_x_molecule_max,dist_x_molecule_median,dist_x_molecule_skew,...,neighbour_1JHN_1,neighbour_2JHC_1,neighbour_2JHH_1,neighbour_2JHN_1,neighbour_3JHC_1,neighbour_3JHH_1,neighbour_3JHN_1,neighbour_C_1,neighbour_H_1,neighbour_N_1
19,1.013577,0.414161,0.007492,1.094954,0.832736,0.488885,0.016835,2.006349,0.544191,0.542955,...,0.0,0.500000,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0
26,0.523372,0.398029,0.875550,1.094958,0.832736,0.488885,0.016835,2.006349,0.544191,0.542955,...,0.0,0.500000,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0
32,0.506537,0.388591,0.889591,1.094968,0.832736,0.488885,0.016835,2.006349,0.544191,0.542955,...,0.0,0.500000,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0
38,0.523383,0.398026,0.875545,1.094958,0.832736,0.488885,0.016835,2.006349,0.544191,0.542955,...,0.0,0.500000,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0
42,1.013580,0.414152,0.007510,1.094953,0.832736,0.488885,0.016835,2.006349,0.544191,0.542955,...,0.0,0.500000,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0
45,0.506522,0.388588,0.889600,1.094968,0.832736,0.488885,0.016835,2.006349,0.544191,0.542955,...,0.0,0.500000,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0
56,1.015643,0.409896,0.007488,1.095263,0.624872,0.502691,0.013976,1.540026,0.543674,0.511793,...,0.0,0.000000,0.0,0.0,0.250000,0.0,0.0,0.0,1.0,0.0
61,0.524383,0.393654,0.877306,1.095266,0.624872,0.502691,0.013976,1.540026,0.543674,0.511793,...,0.0,0.000000,0.0,0.0,0.250000,0.0,0.0,0.0,1.0,0.0
65,0.507512,0.383986,0.891387,1.095256,0.624872,0.502691,0.013976,1.540026,0.543674,0.511793,...,0.0,0.000000,0.0,0.0,0.250000,0.0,0.0,0.0,1.0,0.0
70,0.013976,1.061343,0.005755,1.061451,0.624872,0.502691,0.013976,1.540026,0.543674,0.511793,...,0.0,0.000000,0.0,0.0,0.750000,0.0,0.0,0.0,1.0,0.0


In [28]:
iv

array([     0,      1,      2, ..., 709390, 709391, 709392])

In [20]:
for it, iv in gkf.split(data[train['type']=='3JHC'], y[train['type']=='3JHC'], groups=train[train['type']=='3JHC'].molecule_name):
    pass

In [21]:
it

array([     11,      12,      13, ..., 1510376, 1510377, 1510378])

In [22]:
iv

array([      0,       1,       2, ..., 1510316, 1510317, 1510318])