In [1]:
import pandas as pd
import numpy as np
import setting
import gc

# Make sure the cyclic garbage collector is switched on before the feature
# tables are loaded.
gc.enable()

# Directory that holds the pickled feature tables and the CSV files read by
# the cells below.
DATA = '~/Data/Molecular'

# Read every pickled feature table, in the order listed, and join them
# column-wise (axis=1) into a single frame.
# NOTE(review): a column-wise concat aligns rows on the index, so this
# presumably relies on all four tables sharing the same index — confirm.
data = pd.concat(
    [
        pd.read_pickle(f"{DATA}/{pickle_name}")
        for pickle_name in (
            "basic.gz",
            "angle_feature.gz",
            "criskiev_distance_feature.gz",
            "qm9.gz",
        )
    ],
    axis=1,
)

In [2]:
# Positional row at which the combined table is cut into its two parts.
# NOTE(review): presumably the number of rows in train.csv — confirm.
N_TRAIN_ROWS = 4658147

# Remove the columns listed below; what remains is what the later cells feed
# to the models.
data = data.drop(columns=[
    'id',
    'molecule_name',
    'atom_index_0',
    'atom_index_1',
    'type',
    'atom_0',
    'x_0',
    'y_0',
    'z_0',
    'atom_1',
    'x_1',
    'y_1',
    'z_1',
])

# Rows from N_TRAIN_ROWS onward become `test`; the rows before it stay in
# `data`. Both slices are taken from the same frame before `data` is rebound.
test, data = data.iloc[N_TRAIN_ROWS:, :], data.iloc[:N_TRAIN_ROWS, :]

In [3]:
# Only the coupling type and the target column are read from train.csv (see
# `usecols`); the dtype mapping is passed through exactly as before, including
# its entries for columns that are not loaded.
train = pd.read_csv(
    f"{DATA}/train.csv",
    usecols=['type', 'scalar_coupling_constant'],
    dtype={
        'molecule_name': 'category',
        'atom_index_0': 'int8',
        'atom_index_1': 'int8',
        'type': 'category',
        'scalar_coupling_constant': 'float32',
    },
)

# Regression target used by the training cell.
y = train['scalar_coupling_constant']

In [None]:
from lightgbm import LGBMRegressor

# One fitted regressor per coupling type, keyed by the value found in
# train['type'].
model = {}

for bond in pd.unique(train['type']):
    # Build the row mask once and reuse it for both the features and the target.
    is_bond = train['type'] == bond
    X = data[is_bond]
    Y = y[is_bond]

    # Hyper-parameters shared by every type; setting.param[bond] supplies the
    # per-type additions.
    # NOTE(review): a key that appears both here and in setting.param[bond]
    # would raise TypeError (duplicate keyword argument) — confirm the
    # per-type dict only holds other parameters.
    lgbm = LGBMRegressor(
        objective='regression_l1',
        n_estimators=5000,
        learning_rate=0.1,
        subsample_freq=1,
        feature_fraction=0.7,
        subsample=0.7,
        reg_alpha=0.1,
        reg_lambda=0.3,
        device_type='gpu',
        **setting.param[bond]
    )

    # `verbose` is intentionally not passed to fit(): no eval_set is supplied,
    # so there is nothing to log, and LightGBM >= 4.0 rejects the argument.
    lgbm.fit(X, Y, eval_metric='regression_l1')
    model[bond] = lgbm

In [11]:
# Only the coupling type is read from test.csv (see `usecols`); the dtype
# mapping is passed through exactly as before, including its entries for
# columns that are not loaded.
test_ = pd.read_csv(
    f"{DATA}/test.csv",
    usecols=['type'],
    dtype={
        'molecule_name': 'category',
        'atom_index_0': 'int8',
        'atom_index_1': 'int8',
        'type': 'category',
    },
)

In [14]:
# Give `test` a fresh 0..n-1 index; drop=True discards the old index instead
# of keeping it as a column. `test` was cut out of `data` with iloc, so until
# this point it still carries that frame's row labels.
# NOTE(review): presumably required so that boolean masks built from `test_`
# line up with `test` by label in the prediction cell — confirm.
test = test.reset_index(drop=True)

In [16]:
# Start from the sample submission and overwrite its target column with the
# per-type model predictions.
submission = pd.read_csv(f"{DATA}/sample_submission.csv")

# The masks below are applied by position, so the three tables must contain
# the same number of rows.
# NOTE(review): they are also assumed to list the rows in the same order —
# confirm against how the feature tables were built.
if not (len(test) == len(test_) == len(submission)):
    raise ValueError(
        "test features, test.csv and sample_submission.csv have different "
        f"row counts: {len(test)}, {len(test_)}, {len(submission)}"
    )

for bond in pd.unique(train['type']):
    # A plain boolean array selects rows by position, so the selection does
    # not depend on the index labels of `test` or `submission` matching those
    # of `test_`.
    is_bond = (test_['type'] == bond).values
    if not is_bond.any():
        # No test rows of this type: nothing to predict.
        continue
    X = test[is_bond]
    Y = model[bond].predict(X)
    submission.loc[is_bond, 'scalar_coupling_constant'] = Y

In [17]:
# Write the filled-in submission table to a CSV file (path is relative to the
# current working directory); index=False leaves the row index out of the file.
submission.to_csv("submission_08_25_01.csv", index=False)