# Main Script for LGBM Model

## Imports

In [2]:
# Imports
import numpy as np
import pandas as pd
import datetime as dt
from sklearn import tree
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Graphics
import matplotlib.pyplot as plt
import seaborn as sns
# Map
import importlib
mpl_toolkits = importlib.import_module('mpl_toolkits')
import mpl_toolkits
#from mpl_toolkits.basemap import Basemap
# Figures inline and set visualization style
%matplotlib inline
sns.set() #Different type of visualization

# Show multiple statements at once
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Gradient Boosting
import lightgbm as lgb

# Scikit-learn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Preprocessing
from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer,LabelEncoder,PolynomialFeatures

## EDA

In [3]:
# Train Data
train = pd.read_csv('input/train.csv')

# Test Data
test = pd.read_csv('input/test.csv')

# Submission Data
sub = pd.read_csv('input/sample_submission.csv')

In [4]:
# We take a first look at the dataset
train.info()
print ('#################################################')
print ('#################################################')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4658147 entries, 0 to 4658146
Data columns (total 6 columns):
id                          int64
molecule_name               object
atom_index_0                int64
atom_index_1                int64
type                        object
scalar_coupling_constant    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 213.2+ MB
#################################################
#################################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2505542 entries, 0 to 2505541
Data columns (total 5 columns):
id               int64
molecule_name    object
atom_index_0     int64
atom_index_1     int64
type             object
dtypes: int64(3), object(2)
memory usage: 95.6+ MB


### Label Encoding

In [5]:
categoricals = train.select_dtypes(include='object').columns
categoricals = test.select_dtypes(include='object').columns

In [6]:
for c in categoricals:
    lbl = LabelEncoder()
    lbl.fit(list(train[c].values))
    train[c] = lbl.transform(list(train[c].values))

LabelEncoder()

LabelEncoder()

In [7]:
for c in categoricals:
    lbl = LabelEncoder()
    lbl.fit(list(test[c].values))
    test[c] = lbl.transform(list(test[c].values))

LabelEncoder()

LabelEncoder()

## Modeling

In [9]:
# We define de label
y = train['scalar_coupling_constant']

In [10]:
# We save the molecule names
molecules = train.pop('molecule_name')

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4658147 entries, 0 to 4658146
Data columns (total 5 columns):
id                          int64
atom_index_0                int64
atom_index_1                int64
type                        int64
scalar_coupling_constant    float64
dtypes: float64(1), int64(4)
memory usage: 177.7 MB


We drop the useless columns for training

In [13]:
X = train.drop(['id', 'scalar_coupling_constant'], axis=1)
X_test = test.drop(['id', 'molecule_name'], axis=1)

### Set K Folds

In [20]:
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = KFold(n_splits=5, shuffle=True, random_state=8)

In [16]:
params = {'num_leaves': 128,
          'min_child_samples': 79,
          'objective': 'regression',
          'max_depth': 13,
          'learning_rate': 0.2,
          "boosting_type": "gbdt",
          "subsample_freq": 1,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 0.1,
          'reg_lambda': 0.3,
          'colsample_bytree': 1.0
         }

In [19]:
dtrain = lgb.Dataset(train, label=y)

In [21]:
res_lgbm = lgb.cv(params,
             dtrain,
             num_boost_round = 4000,
             folds=skf,
             seed=2019,
             early_stopping_rounds = 10,
             verbose_eval = 1)

[1]	cv_agg's l1: 19.7862 + 0.0113885
[2]	cv_agg's l1: 15.83 + 0.00892235
[3]	cv_agg's l1: 12.6656 + 0.00683997
[4]	cv_agg's l1: 10.1343 + 0.00527953
[5]	cv_agg's l1: 8.10949 + 0.00398672
[6]	cv_agg's l1: 6.4907 + 0.00309477
[7]	cv_agg's l1: 5.19759 + 0.00243788
[8]	cv_agg's l1: 4.16511 + 0.00209148
[9]	cv_agg's l1: 3.34116 + 0.00182206
[10]	cv_agg's l1: 2.68333 + 0.00163302
[11]	cv_agg's l1: 2.15801 + 0.00143636
[12]	cv_agg's l1: 1.73858 + 0.00131678
[13]	cv_agg's l1: 1.40368 + 0.00121837
[14]	cv_agg's l1: 1.13662 + 0.00110321
[15]	cv_agg's l1: 0.923933 + 0.000953468
[16]	cv_agg's l1: 0.754688 + 0.000853028
[17]	cv_agg's l1: 0.620207 + 0.00082931
[18]	cv_agg's l1: 0.513872 + 0.000844547
[19]	cv_agg's l1: 0.430299 + 0.000915743
[20]	cv_agg's l1: 0.365529 + 0.000905745
[21]	cv_agg's l1: 0.315538 + 0.00093223
[22]	cv_agg's l1: 0.276831 + 0.000954659
[23]	cv_agg's l1: 0.246874 + 0.000908766
[24]	cv_agg's l1: 0.223981 + 0.000889578
[25]	cv_agg's l1: 0.206569 + 0.00088934
[26]	cv_agg's l1: 0

[202]	cv_agg's l1: 0.157777 + 0.000850063
[203]	cv_agg's l1: 0.157759 + 0.000848714
[204]	cv_agg's l1: 0.157748 + 0.000842008
[205]	cv_agg's l1: 0.15773 + 0.00083762
[206]	cv_agg's l1: 0.157728 + 0.00082975
[207]	cv_agg's l1: 0.157708 + 0.00082565
[208]	cv_agg's l1: 0.157693 + 0.000832772
[209]	cv_agg's l1: 0.157675 + 0.0008246
[210]	cv_agg's l1: 0.157668 + 0.000831104
[211]	cv_agg's l1: 0.157666 + 0.000829652
[212]	cv_agg's l1: 0.157657 + 0.00082821
[213]	cv_agg's l1: 0.157647 + 0.000817921
[214]	cv_agg's l1: 0.157641 + 0.000829683
[215]	cv_agg's l1: 0.157623 + 0.000838396
[216]	cv_agg's l1: 0.157609 + 0.000848265
[217]	cv_agg's l1: 0.157588 + 0.00085779
[218]	cv_agg's l1: 0.157581 + 0.000843034
[219]	cv_agg's l1: 0.157532 + 0.000846669
[220]	cv_agg's l1: 0.157504 + 0.000841337
[221]	cv_agg's l1: 0.157499 + 0.000847173
[222]	cv_agg's l1: 0.157487 + 0.000842467
[223]	cv_agg's l1: 0.157463 + 0.000845894
[224]	cv_agg's l1: 0.15744 + 0.000844983
[225]	cv_agg's l1: 0.157437 + 0.000844386
[

[399]	cv_agg's l1: 0.155529 + 0.000791463
[400]	cv_agg's l1: 0.155515 + 0.000794171
[401]	cv_agg's l1: 0.155497 + 0.00080679
[402]	cv_agg's l1: 0.15549 + 0.000805554
[403]	cv_agg's l1: 0.155484 + 0.000804494
[404]	cv_agg's l1: 0.15547 + 0.000793992
[405]	cv_agg's l1: 0.155464 + 0.000792897
[406]	cv_agg's l1: 0.155458 + 0.000789555
[407]	cv_agg's l1: 0.155454 + 0.000781834
[408]	cv_agg's l1: 0.155456 + 0.000787456
[409]	cv_agg's l1: 0.155445 + 0.000787821
[410]	cv_agg's l1: 0.155418 + 0.000786523
[411]	cv_agg's l1: 0.15541 + 0.000789453
[412]	cv_agg's l1: 0.155411 + 0.000798389
[413]	cv_agg's l1: 0.155412 + 0.000796357
[414]	cv_agg's l1: 0.155409 + 0.000793767
[415]	cv_agg's l1: 0.155402 + 0.000785123
[416]	cv_agg's l1: 0.155401 + 0.000777435
[417]	cv_agg's l1: 0.155391 + 0.0007736
[418]	cv_agg's l1: 0.155385 + 0.000770762
[419]	cv_agg's l1: 0.155374 + 0.000781204
[420]	cv_agg's l1: 0.155372 + 0.000787539
[421]	cv_agg's l1: 0.155359 + 0.00077309
[422]	cv_agg's l1: 0.155361 + 0.000777236

[596]	cv_agg's l1: 0.154318 + 0.000823817
[597]	cv_agg's l1: 0.154317 + 0.000816028
[598]	cv_agg's l1: 0.154313 + 0.000817421
[599]	cv_agg's l1: 0.154284 + 0.000801999
[600]	cv_agg's l1: 0.154281 + 0.000803799
[601]	cv_agg's l1: 0.154279 + 0.000811007
[602]	cv_agg's l1: 0.154282 + 0.000806958
[603]	cv_agg's l1: 0.154279 + 0.000803546
[604]	cv_agg's l1: 0.154274 + 0.000809716
[605]	cv_agg's l1: 0.154271 + 0.000805768
[606]	cv_agg's l1: 0.154274 + 0.000800528
[607]	cv_agg's l1: 0.154272 + 0.000795859
[608]	cv_agg's l1: 0.154261 + 0.00079773
[609]	cv_agg's l1: 0.154256 + 0.00080351
[610]	cv_agg's l1: 0.154257 + 0.000804366
[611]	cv_agg's l1: 0.154248 + 0.000795658
[612]	cv_agg's l1: 0.154243 + 0.000799849
[613]	cv_agg's l1: 0.154238 + 0.000809033
[614]	cv_agg's l1: 0.154232 + 0.000813077
[615]	cv_agg's l1: 0.154219 + 0.000809486
[616]	cv_agg's l1: 0.154188 + 0.000774675
[617]	cv_agg's l1: 0.154192 + 0.000775108
[618]	cv_agg's l1: 0.154194 + 0.000775198
[619]	cv_agg's l1: 0.154188 + 0.0007

[792]	cv_agg's l1: 0.153455 + 0.00079401
[793]	cv_agg's l1: 0.153454 + 0.000794867
[794]	cv_agg's l1: 0.153453 + 0.000794488
[795]	cv_agg's l1: 0.153451 + 0.000801647
[796]	cv_agg's l1: 0.153451 + 0.000807997
[797]	cv_agg's l1: 0.153444 + 0.000809834
[798]	cv_agg's l1: 0.153439 + 0.000810798
[799]	cv_agg's l1: 0.153431 + 0.000801683
[800]	cv_agg's l1: 0.153438 + 0.000794988
[801]	cv_agg's l1: 0.153436 + 0.000796021
[802]	cv_agg's l1: 0.153437 + 0.000790013
[803]	cv_agg's l1: 0.153433 + 0.000797499
[804]	cv_agg's l1: 0.153432 + 0.000800731
[805]	cv_agg's l1: 0.15343 + 0.000803654
[806]	cv_agg's l1: 0.153417 + 0.000802895
[807]	cv_agg's l1: 0.153421 + 0.000800949
[808]	cv_agg's l1: 0.153424 + 0.000804549
[809]	cv_agg's l1: 0.153428 + 0.000798403
[810]	cv_agg's l1: 0.153418 + 0.000811416
[811]	cv_agg's l1: 0.15342 + 0.000811869
[812]	cv_agg's l1: 0.15342 + 0.00081923
[813]	cv_agg's l1: 0.153411 + 0.00081724
[814]	cv_agg's l1: 0.153393 + 0.000827504
[815]	cv_agg's l1: 0.153391 + 0.00082517

[989]	cv_agg's l1: 0.152886 + 0.000875953
[990]	cv_agg's l1: 0.152881 + 0.000870872
[991]	cv_agg's l1: 0.152865 + 0.000867519
[992]	cv_agg's l1: 0.15287 + 0.000874236
[993]	cv_agg's l1: 0.152866 + 0.000879783
[994]	cv_agg's l1: 0.152862 + 0.000883
[995]	cv_agg's l1: 0.152864 + 0.000880635
[996]	cv_agg's l1: 0.152872 + 0.000878578
[997]	cv_agg's l1: 0.152866 + 0.0008843
[998]	cv_agg's l1: 0.15286 + 0.000885856
[999]	cv_agg's l1: 0.152858 + 0.000884475
[1000]	cv_agg's l1: 0.152864 + 0.000876284
[1001]	cv_agg's l1: 0.152861 + 0.000873639
[1002]	cv_agg's l1: 0.152858 + 0.000874354
[1003]	cv_agg's l1: 0.152851 + 0.000867906
[1004]	cv_agg's l1: 0.152853 + 0.000864054
[1005]	cv_agg's l1: 0.152848 + 0.000869376
[1006]	cv_agg's l1: 0.152841 + 0.000866973
[1007]	cv_agg's l1: 0.152839 + 0.000857491
[1008]	cv_agg's l1: 0.152832 + 0.000855295
[1009]	cv_agg's l1: 0.152836 + 0.000844539
[1010]	cv_agg's l1: 0.152831 + 0.000845876
[1011]	cv_agg's l1: 0.152829 + 0.000850131
[1012]	cv_agg's l1: 0.152836 

[1181]	cv_agg's l1: 0.152489 + 0.000890565
[1182]	cv_agg's l1: 0.152486 + 0.00089673
[1183]	cv_agg's l1: 0.152487 + 0.000891097
[1184]	cv_agg's l1: 0.152482 + 0.000890031
[1185]	cv_agg's l1: 0.152485 + 0.00088044
[1186]	cv_agg's l1: 0.152482 + 0.00087995
[1187]	cv_agg's l1: 0.152478 + 0.000889152
[1188]	cv_agg's l1: 0.152478 + 0.00087987
[1189]	cv_agg's l1: 0.152474 + 0.000884254
[1190]	cv_agg's l1: 0.152468 + 0.000880728
[1191]	cv_agg's l1: 0.152461 + 0.0008742
[1192]	cv_agg's l1: 0.15246 + 0.000857896
[1193]	cv_agg's l1: 0.152458 + 0.000859762
[1194]	cv_agg's l1: 0.152453 + 0.000863356
[1195]	cv_agg's l1: 0.152451 + 0.000862599
[1196]	cv_agg's l1: 0.152457 + 0.000867829
[1197]	cv_agg's l1: 0.152452 + 0.000869703
[1198]	cv_agg's l1: 0.152446 + 0.000869377
[1199]	cv_agg's l1: 0.15244 + 0.000866674
[1200]	cv_agg's l1: 0.152444 + 0.000870614
[1201]	cv_agg's l1: 0.152447 + 0.000872809
[1202]	cv_agg's l1: 0.152439 + 0.000873242
[1203]	cv_agg's l1: 0.152432 + 0.00086971
[1204]	cv_agg's l1: 

In [30]:
best_round=[i for i, e in enumerate(res_lgbm['l1-mean']) if e == min(res_lgbm['l1-mean'])][0]

In [31]:
best_round

1205

In [33]:
res_lgbm['l1-mean'][best_round]

0.15242093370090068

In [34]:
model = lgb.train(params, dtrain, num_boost_round = best_round)

In [35]:
ypred = model.predict(test)

In [38]:
# Predictions --> That is: Scalar Coupling Constants
ypred

array([ 4.73407349,  2.89259693,  6.13466528, ...,  5.03466216,
        1.9927508 , -0.02857073])