# LGBM_V2

Version 2 for LGBM modeling. <br>
Basic Feature Engineering.

## Imports

In [1]:
# Imports
import numpy as np
import pandas as pd
import datetime as dt
from sklearn import tree
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Graphics
import matplotlib.pyplot as plt
import seaborn as sns
# Map
import importlib
mpl_toolkits = importlib.import_module('mpl_toolkits')
import mpl_toolkits
#from mpl_toolkits.basemap import Basemap
# Figures inline and set visualization style
%matplotlib inline
sns.set() #Different type of visualization

# Show multiple statements at once
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Gradient Boosting
import lightgbm as lgb

# Scikit-learn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Preprocessing
from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer,LabelEncoder,PolynomialFeatures

## EDA

In [6]:
# Train Data
train = pd.read_csv('input/train.csv')

# Test Data
test = pd.read_csv('input/test.csv')

# Submission Data
sub = pd.read_csv('input/sample_submission.csv')

structures = pd.read_csv('input/structures.csv')

In [3]:
# We take a first look at the dataset
train.info()
print ('#################################################')
print ('#################################################')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4658147 entries, 0 to 4658146
Data columns (total 6 columns):
id                          int64
molecule_name               object
atom_index_0                int64
atom_index_1                int64
type                        object
scalar_coupling_constant    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 213.2+ MB
#################################################
#################################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2505542 entries, 0 to 2505541
Data columns (total 5 columns):
id               int64
molecule_name    object
atom_index_0     int64
atom_index_1     int64
type             object
dtypes: int64(3), object(2)
memory usage: 95.6+ MB


### Feature Engineering

#### Map Atoms

In [4]:
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

In [7]:
train = map_atom_info(train, 0)
train = map_atom_info(train, 1)

test = map_atom_info(test, 0)
test = map_atom_info(test, 1)

#### Distance Calculation

In [8]:
train_p_0 = train[['x_0', 'y_0', 'z_0']].values
train_p_1 = train[['x_1', 'y_1', 'z_1']].values
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values

train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)

train['dist_x'] = (train['x_0'] - train['x_1']) ** 2
test['dist_x'] = (test['x_0'] - test['x_1']) ** 2
train['dist_y'] = (train['y_0'] - train['y_1']) ** 2
test['dist_y'] = (test['y_0'] - test['y_1']) ** 2
train['dist_z'] = (train['z_0'] - train['z_1']) ** 2
test['dist_z'] = (test['z_0'] - test['z_1']) ** 2

This will create 2 features: 
- 1) Will show the first letter of the `type`
- 2) Will show the other characters

In [9]:
train['type_0'] = train['type'].apply(lambda x: x[0])
test['type_0'] = test['type'].apply(lambda x: x[0])
train['type_1'] = train['type'].apply(lambda x: x[1:])
test['type_1'] = test['type'].apply(lambda x: x[1:])

In [10]:
train['dist_to_type_mean'] = train['dist'] / train.groupby('type')['dist'].transform('mean')
test['dist_to_type_mean'] = test['dist'] / test.groupby('type')['dist'].transform('mean')

train['dist_to_type_0_mean'] = train['dist'] / train.groupby('type_0')['dist'].transform('mean')
test['dist_to_type_0_mean'] = test['dist'] / test.groupby('type_0')['dist'].transform('mean')

train['dist_to_type_1_mean'] = train['dist'] / train.groupby('type_1')['dist'].transform('mean')
test['dist_to_type_1_mean'] = test['dist'] / test.groupby('type_1')['dist'].transform('mean')

In [11]:
train[f'molecule_type_dist_mean'] = train.groupby(['molecule_name', 'type'])['dist'].transform('mean')
test[f'molecule_type_dist_mean'] = test.groupby(['molecule_name', 'type'])['dist'].transform('mean')

### Label Encoding

In [12]:
categoricals = train.select_dtypes(include='object').columns
categoricals = test.select_dtypes(include='object').columns

In [13]:
for c in categoricals:
    lbl = LabelEncoder()
    lbl.fit(list(train[c].values))
    train[c] = lbl.transform(list(train[c].values))

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

In [14]:
for c in categoricals:
    lbl = LabelEncoder()
    lbl.fit(list(test[c].values))
    test[c] = lbl.transform(list(test[c].values))

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

## Modeling

In [15]:
# We define de label
y = train['scalar_coupling_constant']

In [16]:
# We save the molecule names
molecules = train.pop('molecule_name')

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4658147 entries, 0 to 4658146
Data columns (total 23 columns):
id                          int64
atom_index_0                int64
atom_index_1                int64
type                        int64
scalar_coupling_constant    float64
atom_0                      int64
x_0                         float64
y_0                         float64
z_0                         float64
atom_1                      int64
x_1                         float64
y_1                         float64
z_1                         float64
dist                        float64
dist_x                      float64
dist_y                      float64
dist_z                      float64
type_0                      int64
type_1                      int64
dist_to_type_mean           float64
dist_to_type_0_mean         float64
dist_to_type_1_mean         float64
molecule_type_dist_mean     float64
dtypes: float64(15), int64(8)
memory usage: 852.9 MB


We drop the useless columns for training

In [18]:
X = train.drop(['id', 'scalar_coupling_constant'], axis=1)
X_test = test.drop(['id', 'molecule_name'], axis=1)

### Set K Folds

In [19]:
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = KFold(n_splits=5, shuffle=True, random_state=8)

In [20]:
params = {'num_leaves': 128,
          'min_child_samples': 79,
          'objective': 'regression',
          'max_depth': 13,
          'learning_rate': 0.2,
          "boosting_type": "gbdt",
          "subsample_freq": 1,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 0.1,
          'reg_lambda': 0.3,
          'colsample_bytree': 1.0
         }

In [21]:
dtrain = lgb.Dataset(train, label=y)

In [22]:
res_lgbm = lgb.cv(params,
             dtrain,
             num_boost_round = 4000,
             folds=skf,
             seed=2019,
             early_stopping_rounds = 10,
             verbose_eval = 1)

[1]	cv_agg's l1: 19.7861 + 0.0116297
[2]	cv_agg's l1: 15.8294 + 0.00927899
[3]	cv_agg's l1: 12.6641 + 0.00744381
[4]	cv_agg's l1: 10.1322 + 0.00594062
[5]	cv_agg's l1: 8.10687 + 0.0047383
[6]	cv_agg's l1: 6.48674 + 0.00374958
[7]	cv_agg's l1: 5.19092 + 0.00297878
[8]	cv_agg's l1: 4.15452 + 0.00238398
[9]	cv_agg's l1: 3.32575 + 0.00191153
[10]	cv_agg's l1: 2.66313 + 0.00153723
[11]	cv_agg's l1: 2.13351 + 0.00124461
[12]	cv_agg's l1: 1.71034 + 0.00104618
[13]	cv_agg's l1: 1.37228 + 0.000868615
[14]	cv_agg's l1: 1.10232 + 0.000749184
[15]	cv_agg's l1: 0.887004 + 0.000635641
[16]	cv_agg's l1: 0.71554 + 0.000572448
[17]	cv_agg's l1: 0.579407 + 0.000467606
[18]	cv_agg's l1: 0.471883 + 0.000370411
[19]	cv_agg's l1: 0.387428 + 0.000298111
[20]	cv_agg's l1: 0.322135 + 0.000250458
[21]	cv_agg's l1: 0.271786 + 0.000235223
[22]	cv_agg's l1: 0.232909 + 0.000282479
[23]	cv_agg's l1: 0.202969 + 0.00027418
[24]	cv_agg's l1: 0.180039 + 0.000273583
[25]	cv_agg's l1: 0.162751 + 0.000248834
[26]	cv_agg's 

[201]	cv_agg's l1: 0.115788 + 0.000286915
[202]	cv_agg's l1: 0.115775 + 0.00029041
[203]	cv_agg's l1: 0.115766 + 0.000290216
[204]	cv_agg's l1: 0.11576 + 0.000289734
[205]	cv_agg's l1: 0.115759 + 0.000289223
[206]	cv_agg's l1: 0.115743 + 0.000299563
[207]	cv_agg's l1: 0.115736 + 0.000303275
[208]	cv_agg's l1: 0.11572 + 0.000301299
[209]	cv_agg's l1: 0.115705 + 0.000289231
[210]	cv_agg's l1: 0.115704 + 0.000289706
[211]	cv_agg's l1: 0.115694 + 0.000292921
[212]	cv_agg's l1: 0.115664 + 0.000297414
[213]	cv_agg's l1: 0.115652 + 0.000296832
[214]	cv_agg's l1: 0.115647 + 0.000295631
[215]	cv_agg's l1: 0.115635 + 0.000299663
[216]	cv_agg's l1: 0.115629 + 0.000298651
[217]	cv_agg's l1: 0.115617 + 0.000295106
[218]	cv_agg's l1: 0.115609 + 0.000300801
[219]	cv_agg's l1: 0.1156 + 0.000302117
[220]	cv_agg's l1: 0.115591 + 0.000297998
[221]	cv_agg's l1: 0.11557 + 0.000303068
[222]	cv_agg's l1: 0.115566 + 0.000299877
[223]	cv_agg's l1: 0.115561 + 0.000297973
[224]	cv_agg's l1: 0.115548 + 0.00029514

[398]	cv_agg's l1: 0.114524 + 0.000298386
[399]	cv_agg's l1: 0.114521 + 0.000299997
[400]	cv_agg's l1: 0.114517 + 0.000299516
[401]	cv_agg's l1: 0.114511 + 0.000298127
[402]	cv_agg's l1: 0.114508 + 0.000298153
[403]	cv_agg's l1: 0.114504 + 0.000297536
[404]	cv_agg's l1: 0.114499 + 0.000298086
[405]	cv_agg's l1: 0.114494 + 0.00030039
[406]	cv_agg's l1: 0.114484 + 0.000306048
[407]	cv_agg's l1: 0.114479 + 0.000300957
[408]	cv_agg's l1: 0.114479 + 0.000299991
[409]	cv_agg's l1: 0.114478 + 0.000303922
[410]	cv_agg's l1: 0.114475 + 0.000303875
[411]	cv_agg's l1: 0.114472 + 0.000303857
[412]	cv_agg's l1: 0.114467 + 0.000302832
[413]	cv_agg's l1: 0.114457 + 0.000309379
[414]	cv_agg's l1: 0.114454 + 0.000308862
[415]	cv_agg's l1: 0.11445 + 0.000308378
[416]	cv_agg's l1: 0.114449 + 0.000307399
[417]	cv_agg's l1: 0.114446 + 0.000310448
[418]	cv_agg's l1: 0.114442 + 0.000309403
[419]	cv_agg's l1: 0.114441 + 0.00030586
[420]	cv_agg's l1: 0.114439 + 0.000304376
[421]	cv_agg's l1: 0.114436 + 0.00030

[595]	cv_agg's l1: 0.113979 + 0.000338365
[596]	cv_agg's l1: 0.113973 + 0.000338621
[597]	cv_agg's l1: 0.11397 + 0.000340188
[598]	cv_agg's l1: 0.113967 + 0.000341923
[599]	cv_agg's l1: 0.113965 + 0.000339843
[600]	cv_agg's l1: 0.113964 + 0.000341853
[601]	cv_agg's l1: 0.11396 + 0.000339763
[602]	cv_agg's l1: 0.113957 + 0.000338042
[603]	cv_agg's l1: 0.113956 + 0.000336708
[604]	cv_agg's l1: 0.113952 + 0.000336577
[605]	cv_agg's l1: 0.113952 + 0.000335615
[606]	cv_agg's l1: 0.11395 + 0.000333501
[607]	cv_agg's l1: 0.113947 + 0.000335625
[608]	cv_agg's l1: 0.113943 + 0.000325226
[609]	cv_agg's l1: 0.113937 + 0.000324568
[610]	cv_agg's l1: 0.113937 + 0.000324628
[611]	cv_agg's l1: 0.113932 + 0.00032196
[612]	cv_agg's l1: 0.113932 + 0.000322029
[613]	cv_agg's l1: 0.113931 + 0.000322528
[614]	cv_agg's l1: 0.113924 + 0.000324743
[615]	cv_agg's l1: 0.113925 + 0.000324947
[616]	cv_agg's l1: 0.113924 + 0.00032455
[617]	cv_agg's l1: 0.113924 + 0.000323681
[618]	cv_agg's l1: 0.113921 + 0.0003250

[792]	cv_agg's l1: 0.113616 + 0.000300357
[793]	cv_agg's l1: 0.113618 + 0.000299935
[794]	cv_agg's l1: 0.11361 + 0.000302259
[795]	cv_agg's l1: 0.113609 + 0.000301616
[796]	cv_agg's l1: 0.113607 + 0.000300678
[797]	cv_agg's l1: 0.113607 + 0.0003006
[798]	cv_agg's l1: 0.113607 + 0.000299821
[799]	cv_agg's l1: 0.113605 + 0.000301103
[800]	cv_agg's l1: 0.113606 + 0.000300807
[801]	cv_agg's l1: 0.113605 + 0.00029895
[802]	cv_agg's l1: 0.113603 + 0.000298809
[803]	cv_agg's l1: 0.1136 + 0.000301911
[804]	cv_agg's l1: 0.113601 + 0.000301973
[805]	cv_agg's l1: 0.113599 + 0.000301975
[806]	cv_agg's l1: 0.113598 + 0.000302309
[807]	cv_agg's l1: 0.113598 + 0.000301508
[808]	cv_agg's l1: 0.113596 + 0.000302744
[809]	cv_agg's l1: 0.113596 + 0.000303576
[810]	cv_agg's l1: 0.113596 + 0.000303856
[811]	cv_agg's l1: 0.113594 + 0.000301925
[812]	cv_agg's l1: 0.113592 + 0.000298647
[813]	cv_agg's l1: 0.113591 + 0.000300191
[814]	cv_agg's l1: 0.113588 + 0.000300825
[815]	cv_agg's l1: 0.113584 + 0.00030177

[989]	cv_agg's l1: 0.113392 + 0.000309267
[990]	cv_agg's l1: 0.11339 + 0.00030987
[991]	cv_agg's l1: 0.113391 + 0.000309246
[992]	cv_agg's l1: 0.113392 + 0.000308874
[993]	cv_agg's l1: 0.113391 + 0.000308746
[994]	cv_agg's l1: 0.11339 + 0.000308726
[995]	cv_agg's l1: 0.113389 + 0.000307815
[996]	cv_agg's l1: 0.11339 + 0.000306335
[997]	cv_agg's l1: 0.113387 + 0.00030705
[998]	cv_agg's l1: 0.113387 + 0.000309649
[999]	cv_agg's l1: 0.113386 + 0.000310394
[1000]	cv_agg's l1: 0.113388 + 0.000309812
[1001]	cv_agg's l1: 0.11338 + 0.000313547
[1002]	cv_agg's l1: 0.11338 + 0.000313681
[1003]	cv_agg's l1: 0.113378 + 0.000312639
[1004]	cv_agg's l1: 0.113377 + 0.000312043
[1005]	cv_agg's l1: 0.113377 + 0.000312913
[1006]	cv_agg's l1: 0.113377 + 0.000311726
[1007]	cv_agg's l1: 0.113379 + 0.000312172
[1008]	cv_agg's l1: 0.113378 + 0.000312324
[1009]	cv_agg's l1: 0.113379 + 0.00031149
[1010]	cv_agg's l1: 0.113378 + 0.000310924
[1011]	cv_agg's l1: 0.113374 + 0.000310801
[1012]	cv_agg's l1: 0.113374 +

In [23]:
best_round=[i for i, e in enumerate(res_lgbm['l1-mean']) if e == min(res_lgbm['l1-mean'])][0]

In [24]:
best_round

1144

In [25]:
res_lgbm['l1-mean'][best_round]

0.11327660513303354

In [26]:
model = lgb.train(params, dtrain, num_boost_round = best_round)

In [41]:
ypred = model.predict(test)

In [42]:
# Predictions --> That is: Scalar Coupling Constants
ypred

array([ 0.09595575,  1.21062784, -1.1674803 , ...,  0.05924222,
        0.09375196,  0.70175996])

### Get Back id column

In [43]:
# We need to get back the id column
para_id = pd.read_csv('input/test.csv', sep = ',')

In [44]:
para_id.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type'], dtype='object')

In [45]:
para_id.drop(columns = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type']);

In [46]:
para_id.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type'], dtype='object')

### Submission

In [47]:
test['id'] = para_id['id'].astype(object)

In [48]:
test.columns

Index(['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'atom_0',
       'x_0', 'y_0', 'z_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'dist', 'dist_x',
       'dist_y', 'dist_z', 'type_0', 'type_1', 'dist_to_type_mean',
       'dist_to_type_0_mean', 'dist_to_type_1_mean', 'molecule_type_dist_mean',
       'scalar_coupling_constant', 'id'],
      dtype='object')

In [49]:
test['scalar_coupling_constant'] = ypred
sub_check = test[['id', 'scalar_coupling_constant']]

In [50]:
sub_check.to_csv('TRR_lgbm_Molecular_Properties_2.csv', index=False, header=True)

Score of XXXXXXXXXXX
Basic Feature Engineering