In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import gc
gc.enable()

In [2]:
DATA = '/home/ike/Data/Molecular/'

data = pd.concat([
    pd.read_pickle("basic.gz"),
    pd.read_pickle("angle_feature.gz")
], axis = 1)
data.drop(columns=[
    'id',
    'molecule_name',
    'atom_index_0',
    'atom_index_1',
    'type',
    'atom_0',
    'x_0',
    'y_0',
    'z_0',
    'atom_1',
    'x_1',
    'y_1',
    'z_1'
    
], inplace=True)
X = data.iloc[:4658147, :]
train = pd.read_csv(f"{DATA}train.csv")

In [4]:
y = train.scalar_coupling_constant

In [5]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

Xtrain, Xtest, y_train, y_test = train_test_split(X[train['type'] == '1JHC'], y[train['type'] == '1JHC'], test_size=0.3)

In [21]:
lgbm = LGBMRegressor(objective='regression_l1', n_estimators=5000, num_leaves=512, min_child_samples=40, max_depth=16, learning_rate=0.075, subsample_freq=1, \
                     feature_fraction=0.7, subsample=0.7, reg_alpha=0.1, reg_lambda=0.3)

In [None]:
lgbm.fit(Xtrain, y_train, eval_set=[(Xtrain, y_train), (Xtest, y_test)], eval_metric='l1', verbose=100)

[100]	training's l1: 1.35097	training's l1: 1.35097	valid_1's l1: 1.46763	valid_1's l1: 1.46763
[200]	training's l1: 1.14758	training's l1: 1.14758	valid_1's l1: 1.3379	valid_1's l1: 1.3379


In [4]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

params = {'n_estimators': [2000, 3000, 4000, 5000]}
folds = KFold(n_splits=5, random_state=49)
lgbm = LGBMRegressor(objective='huber', n_estimators=5000, num_leaves=512, min_child_samples=80, learning_rate=0.1, subsample_freq=1, \
                     feature_fraction=0.7, bagging_fraction=0.7, reg_alpha=0.1, reg_lambda=0.3)

y = train.scalar_coupling_constant

In [5]:
loss = []
model = {}

for bond in pd.unique(train['type']):
    Xb = X[train['type'] == bond]
    Yb = y[train['type'] == bond]
    grid_rr = GridSearchCV(lgbm, params, scoring='neg_mean_absolute_error', cv=folds, return_train_score=True, verbose=True)
    grid_rr.fit(Xb, Yb, verbose=100)
    model[bond] = grid_rr
    loss.append(grid_rr.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 459.9min finished


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [6]:
print(f"Final loss: {sum(map(lambda itm: np.log(-itm), loss))/8}")

Final loss: -0.5274424526205538


In [7]:
print(f"Final loss: {sum(map(lambda itm: np.log(-itm), loss))/8}")

Final loss: -0.5274424526205538


In [None]:
model

In [8]:
Xtest = pd.read_csv("test_with_feature.csv").drop(columns=['type'])
test = pd.read_csv(f"{DATA}test.csv")

In [9]:
submission = pd.read_csv(f"{DATA}sample_submission.csv")
for bond in pd.unique(train['type']):
    Xt = Xtest[test['type'] == bond]
    Yt = model[bond].predict(Xt)
    submission.loc[test['type'] == bond, 'scalar_coupling_constant'] = Yt

In [10]:
submission.to_csv("submission_08_01_01.csv", index=False)

In [12]:
model[0].estimator

KeyError: 0

In [19]:
model['1JHC'].best_estimator_.booster_.save_model('mode.txt')

<lightgbm.basic.Booster at 0x7f86b81777b8>

In [21]:
model['1JHC'].best_estimator_

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1,
       importance_type='split', learning_rate=0.2, max_depth=9,
       min_child_samples=80, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=5000, n_jobs=-1, num_leaves=128,
       objective='regression_l1', random_state=None, reg_alpha=0.1,
       reg_lambda=0.3, silent=True, subsample=0.9,
       subsample_for_bin=200000, subsample_freq=1)

In [20]:
model['1JHC'].estimator

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1,
       importance_type='split', learning_rate=0.2, max_depth=9,
       min_child_samples=79, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=5000, n_jobs=-1, num_leaves=128,
       objective='regression_l1', random_state=None, reg_alpha=0.1,
       reg_lambda=0.3, silent=True, subsample=0.9,
       subsample_for_bin=200000, subsample_freq=1)

In [22]:
for bond in pd.unique(train['type']):
    model[bond].best_estimator_.booster_.save_model(f"{bond}_model.txt")