<a href="https://colab.research.google.com/github/tram-tr/predicting-molecular-properties/blob/main/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# need authorization code to mount drive
from google.colab import drive
drive.mount('/content/drive')
%env KAGGLE_CONFIG_DIR=/content/drive/My Drive/Colab Notebooks/Kaggle

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
env: KAGGLE_CONFIG_DIR=/content/drive/My Drive/Colab Notebooks/Kaggle


In [17]:
!pip install kaggle
!kaggle competitions download -c champs-scalar-coupling

champs-scalar-coupling.zip: Skipping, found more recently modified local copy (use --force to force download)


In [18]:
!pip install ase



In [15]:
#!unzip /content/champs-scalar-coupling.zip

In [139]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import math
from numpy.linalg import svd
import ase
from ase import Atoms
import ase.visualize
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import randint, uniform
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
import logging
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.WARNING)

In [20]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [43]:
train = pd.read_csv('train.csv', nrows=500_000)
test = pd.read_csv('test.csv')
structures = pd.read_csv('structures.csv')
contributions = pd.read_csv('scalar_coupling_contributions.csv')
mulliken = pd.read_csv('mulliken_charges.csv')
test_mulliken = pd.read_csv('mulliken_charges_test_set.csv')

In [44]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [45]:
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4659076,dsgdb9nsd_000004,2,0,2JHC
1,4659077,dsgdb9nsd_000004,2,1,1JHC
2,4659078,dsgdb9nsd_000004,2,3,3JHH
3,4659079,dsgdb9nsd_000004,3,0,1JHC
4,4659080,dsgdb9nsd_000004,3,1,2JHC


In [46]:
train = pd.merge(train, contributions, how = 'left',
                  left_on  = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
                  right_on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])
test = pd.merge(test, contributions, how = 'left',
                  left_on  = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
                  right_on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

In [47]:
atomic_radius = {'H':0.38, 'C':0.77, 'N':0.75, 'O':0.73, 'F':0.71}
fudge_factor = 0.05
atomic_radius_fudge = {k:v + fudge_factor for k,v in atomic_radius.items()}
electronegativity = {'H':2.2, 'C':2.55, 'N':3.04, 'O':3.44, 'F':3.98}
max_atoms = 28

In [48]:
atoms = structures['atom'].values
atoms_en = [electronegativity[x] for x in tqdm(atoms)]
atoms_rad = [atomic_radius[x] for x in tqdm(atoms)]
structures['e_neg'] = atoms_en
structures['rad'] = atoms_rad

  0%|          | 0/2358875 [00:00<?, ?it/s]

  0%|          | 0/2358875 [00:00<?, ?it/s]

In [49]:
atom_idx = structures['atom_index'].values
coords = structures[['x', 'y', 'z']].values
molecules = structures['molecule_name'].values
rads = structures['rad'].values
n_rows = np.arange(len(structures))

# calculate bond distances
bonds = np.zeros((len(structures) + 1, max_atoms + 1), dtype=np.int8)
bond_dists = np.zeros((len(structures) + 1, max_atoms + 1), dtype=np.float32)

coords_temp = coords
molecules_temp = molecules
rads_temp = rads

for i in tqdm(range(max_atoms - 1)):
  coords_temp = np.roll(coords_temp, -1, axis=0)
  molecules_temp = np.roll(molecules_temp, -1, axis=0)
  rads_temp = np.roll(rads_temp, -1, axis=0)

  mask = np.where(molecules==molecules_temp, 1, 0)
  dists = np.linalg.norm(coords - coords_temp, axis=1) * mask
  rad_bond = rads + rads_temp

  bond = np.where(np.logical_and(dists > 0.0001, dists < rad_bond), 1, 0)

  source_row = n_rows
  target_row = source_row + i + 1
  target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row)

  source_atom = atom_idx
  target_atom = atom_idx + i + 1
  target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom)

  bonds[(source_row, target_atom)] = bond
  bonds[(target_row, source_atom)] = bond
  bond_dists[(source_row, target_atom)] = dists
  bond_dists[(target_row, source_atom)] = dists

bonds = np.delete(bonds, axis=0, obj=-1) # remove dummy row
bonds = np.delete(bonds, axis=1, obj=-1) # remove dummy col
bond_dists = np.delete(bond_dists, axis=0, obj=-1) # remove dummy row
bond_dists = np.delete(bond_dists, axis=1, obj=-1) # remove dummy col

bonds_numeric = [[i for i,x in enumerate(row) if x] for row in tqdm(bonds)]
bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate(tqdm(bond_dists))]
bond_lengths_mean = [ np.mean(x) for x in bond_lengths]
n_bonds = [len(x) for x in bonds_numeric]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/2358875 [00:00<?, ?it/s]

  0%|          | 0/2358875 [00:00<?, ?it/s]

In [50]:
bond_data = {'n_bonds':n_bonds, 'bond_lengths_mean': bond_lengths_mean }
bond_df = pd.DataFrame(bond_data)
structures = structures.join(bond_df)

In [51]:
structures.head(10)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,e_neg,rad,n_bonds,bond_lengths_mean
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.38,1,1.091953
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.38,1,1.091952
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.38,1,1.091946
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.38,1,1.091948
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.75,3,1.017195
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.38,1,1.01719
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.38,1,1.017187
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.38,1,1.017208
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.73,2,0.962107


In [52]:
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])

    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

train = map_atom_info(train, 0)
train = map_atom_info(train, 1)
test = map_atom_info(test, 0)
test = map_atom_info(test, 1)

In [53]:
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,...,bond_lengths_mean_x,atom_index_y,atom_1,x_1,y_1,z_1,e_neg_y,rad_y,n_bonds_y,bond_lengths_mean_y
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,83.0224,0.254579,1.25862,0.27201,...,1.091953,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,-11.0347,0.352978,2.85839,-3.4336,...,1.091953,2,H,1.011731,1.463751,0.000277,2.2,0.38,1,1.091952
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,-11.0325,0.352944,2.85852,-3.43387,...,1.091953,3,H,-0.540815,1.447527,-0.876644,2.2,0.38,1,1.091946
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,-11.0319,0.352934,2.85855,-3.43393,...,1.091953,4,H,-0.523814,1.437933,0.906397,2.2,0.38,1,1.091948
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,83.0222,0.254585,1.25861,0.272013,...,1.091952,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541,-11.0317,0.352932,2.85856,-3.43395,...,1.091952,3,H,-0.540815,1.447527,-0.876644,2.2,0.38,1,1.091946
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548,-11.0324,0.352943,2.85853,-3.43387,...,1.091952,4,H,-0.523814,1.437933,0.906397,2.2,0.38,1,1.091948
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,83.0241,0.254634,1.25856,0.272012,...,1.091946,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543,-11.0319,0.352943,2.85856,-3.43393,...,1.091946,4,H,-0.523814,1.437933,0.906397,2.2,0.38,1,1.091948
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,83.0243,0.254628,1.25856,0.272012,...,1.091948,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195


In [54]:
mulliken = mulliken.rename({'atom_index': 'atom_index_0',
                          'mulliken_charge': 'mulliken_charge_0'}, axis=1)
train = train.merge(mulliken, on=['molecule_name', 'atom_index_0'])
mulliken = mulliken.rename({'atom_index_0': 'atom_index_1',
                          'mulliken_charge_0': 'mulliken_charge_1'}, axis=1)
train = train.merge(mulliken, on=['molecule_name', 'atom_index_1'])

test_mulliken = test_mulliken.rename({'atom_index': 'atom_index_0',
                          'mulliken_charge': 'mulliken_charge_0'}, axis=1)
test = test.merge(test_mulliken, on=['molecule_name', 'atom_index_0'])
test_mulliken = test_mulliken.rename({'atom_index_0': 'atom_index_1',
                          'mulliken_charge_0': 'mulliken_charge_1'}, axis=1)
test = test.merge(test_mulliken, on=['molecule_name', 'atom_index_1'])

In [55]:
train.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso', 'atom_index_x',
       'atom_0', 'x_0', 'y_0', 'z_0', 'e_neg_x', 'rad_x', 'n_bonds_x',
       'bond_lengths_mean_x', 'atom_index_y', 'atom_1', 'x_1', 'y_1', 'z_1',
       'e_neg_y', 'rad_y', 'n_bonds_y', 'bond_lengths_mean_y',
       'mulliken_charge_0', 'mulliken_charge_1'],
      dtype='object')

In [56]:
test.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'fc',
       'sd', 'pso', 'dso', 'atom_index_x', 'atom_0', 'x_0', 'y_0', 'z_0',
       'e_neg_x', 'rad_x', 'n_bonds_x', 'bond_lengths_mean_x', 'atom_index_y',
       'atom_1', 'x_1', 'y_1', 'z_1', 'e_neg_y', 'rad_y', 'n_bonds_y',
       'bond_lengths_mean_y', 'mulliken_charge_0', 'mulliken_charge_1'],
      dtype='object')

In [57]:
external_data_0 = pd.DataFrame(data={
    'atom_0': ['C', 'H', 'N', 'O', 'F'],
    'atom_0_en': [2.55, 2.20, 3.04, 3.44, 3.98],
    'atomic_mass_0': [12.0107, 1.00784, 14.0067, 15.999, 18.9984],
    'valence_electrons_0': [4, 1, 5, 6, 7]
})

external_data_1 = pd.DataFrame(data={
    'atom_1': ['C', 'H', 'N', 'O', 'F'],
    'atom_1_en': [2.55, 2.20, 3.04, 3.44, 3.98],
    'atomic_mass_1': [12.0107, 1.00784, 14.0067, 15.999, 18.9984],
    'valence_electrons_1': [4, 1, 5, 6, 7]
})

def distance_features(df):
    df['dist'] = np.linalg.norm(df[['x_0', 'y_0', 'z_0']].values - df[['x_1', 'y_1', 'z_1']].values, axis=1)
    df['j_bond'] = df['type'].str[2:]
    df['j_type'] = df['type'].str[:2]
    df['mu_0'] = np.sqrt(df['x_0'].values**2 + df['y_0'].values**2 + df['z_0'].values**2)
    df['mu_1'] = np.sqrt(df['x_1'].values**2 + df['y_1'].values**2 + df['z_1'].values**2)
    df = df.merge(external_data_0, on='atom_0', how='left')
    df = df.merge(external_data_1, on='atom_1', how='left')
    df['delta_en'] = (df['atom_0_en'] - df['atom_1_en']).abs()
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    df['molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df['molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df['molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df['molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df['molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df['molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df['molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df['molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df['molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df['molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df['molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df['molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df['molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df['molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df['molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df['molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df['molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df['molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df['molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df['molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df['molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df['molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df['molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df['molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df['molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df['molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df['molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df['molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df['molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df['molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df['molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df['molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df['molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df['molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df['molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df['molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df['molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df['molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df = reduce_mem_usage(df)
    return df

In [58]:
train = distance_features(train)
train.head(10)

Mem. usage decreased to 95.84 Mb (70.8% reduction)


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,...,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,0.0,1.0,3e-06,-1.091797
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,-11.03125,0.353027,2.857422,-3.433594,...,1.0,,,,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203
5,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203
6,5,dsgdb9nsd_000001,2,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-3.8e-05,1.0,1.4e-05,-1.783203
7,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-3.7e-05,1.0,1.4e-05,-1.783203
8,6,dsgdb9nsd_000001,2,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-2.9e-05,1.0,1.4e-05,-1.783203
9,8,dsgdb9nsd_000001,3,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203


In [59]:
test = distance_features(test)
test = test.drop(['fc', 'sd', 'pso', 'dso'], axis=1)
test.head()

Mem. usage decreased to 186.14 Mb (67.4% reduction)


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_index_x,atom_0,x_0,y_0,z_0,...,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff
0,4659076,dsgdb9nsd_000004,2,0,2JHC,2,H,-1.662109,0.0,1.0,...,0.469727,0.847656,-1.413086,0.375,1.662109,1.0625,-1.199219,0.469727,0.692383,-1.569336
1,4659079,dsgdb9nsd_000004,3,0,1JHC,3,H,1.662109,0.0,1.0,...,1.0,0.847656,-0.214233,0.79834,1.662109,1.0625,0.0,1.0,0.692383,-0.369873
2,4659077,dsgdb9nsd_000004,2,1,1JHC,2,H,-1.662109,0.0,1.0,...,1.0,0.847656,-0.214233,0.79834,1.662109,1.0625,0.0,1.0,0.692383,-0.369873
3,4659080,dsgdb9nsd_000004,3,1,2JHC,3,H,1.662109,0.0,1.0,...,0.469727,0.847656,-1.413086,0.375,1.662109,1.0625,-1.199219,0.469727,0.692383,-1.569336
4,4659078,dsgdb9nsd_000004,2,3,3JHH,2,H,-1.662109,0.0,1.0,...,1.0,,,,3.324219,3.324219,0.0,1.0,,


In [60]:
train.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso', 'atom_index_x',
       'atom_0', 'x_0', 'y_0', 'z_0', 'e_neg_x', 'rad_x', 'n_bonds_x',
       'bond_lengths_mean_x', 'atom_index_y', 'atom_1', 'x_1', 'y_1', 'z_1',
       'e_neg_y', 'rad_y', 'n_bonds_y', 'bond_lengths_mean_y',
       'mulliken_charge_0', 'mulliken_charge_1', 'dist', 'j_bond', 'j_type',
       'mu_0', 'mu_1', 'atom_0_en', 'atomic_mass_0', 'valence_electrons_0',
       'atom_1_en', 'atomic_mass_1', 'valence_electrons_1', 'delta_en',
       'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min',
       'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count',
       'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean',
       'molecule_atom_index_0_y_1_mean_diff',
       'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max',
       'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_s

In [61]:
# PCA + SVD
# get the principal eigenvector of molecule axis
def PCA_SVD(a):
  a_mean = a.mean(axis=1)
  #normalise
  a = (a.T - a_mean.T).T
  u, s, v = svd(a.T)
  return(v[0])

# get the plane with minimum sum distance from the nuclei
def get_plane(a):
  a = np.reshape(a, (a.shape[0], -1))
  centroid = a.mean(axis=1)
  #normalise
  x = a - centroid[:,np.newaxis]
  m = np.dot(x, x.T)
  return(centroid, svd(m)[0][:,-1])

In [62]:
def bond_features(df):
  size_list = []
  flatness = []
  bond_angle_plane = []
  bond_angle_axis = []
  diheral_angle = []

  molecule_names = np.unique(df.molecule_name.values.flatten())

  for i in tqdm(range(len(molecule_names))):
    molecule = molecule_names[i]

    # get structure and bond information for each molecule
    struct = structures.loc[structures.molecule_name==molecule, :]
    bonds = df.loc[df.molecule_name==molecule, :]

    # number of constituent atoms
    size = len(struct)
    size_list.extend(np.full(len(bonds), size))

    # nuclei coordinates
    coords = np.column_stack([struct.x.values, struct.y.values, struct.z.values]).T

    # principal axis of molecular alignment
    axis_vector = PCA_SVD(coords)

    # get flatness metric and plane angles for binds if nuclei > 3
    if size > 3:
      coords = coords - coords.mean()
      # get best fitting 2D plane
      ctr, norm = get_plane(coords)

      # calculate distances of each atom from plane
      dists = np.zeros(size)
      for j in range(size):
        dists[j] = np.dot(norm, coords[:, j] - ctr)

      dist = abs(dists).sum()/len(dists)
      flatness.extend(np.full(len(bonds), dist))

      # get angle subtented by each atomic bind and plane
      for j in range(len(bonds)):
          # get atom index numbers for this bond
          atom1 = bonds.atom_index_0.values[j]
          atom2 = bonds.atom_index_1.values[j]

          # get 3D coords
          atom1_coords = coords[:, atom1]
          atom2_coords = coords[:, atom2]

          # bond vector
          atom_vec = np.array([atom1_coords[0] - atom2_coords[0],
                                atom1_coords[1] - atom2_coords[1],
                                atom1_coords[2] - atom2_coords[2]])

          # get bond angles
          angle = np.dot(norm, atom_vec)/(np.linalg.norm(norm)*np.linalg.norm(atom_vec))
          axis_angle = np.dot(axis_vector, atom_vec)/(np.linalg.norm(norm)*np.linalg.norm(atom_vec))

          angle = math.degrees(np.arccos(angle))
          axis_angle = math.degrees(np.arccos(axis_angle))

          if angle > 90:
              angle = 180 - angle

          if axis_angle > 90:
              axis_angle = 180 - axis_angle

          angle = 90 - angle
          axis_angle = 90 - axis_angle
          bond_angle_plane.append(angle)
          bond_angle_axis.append(axis_angle)

    else:
      flatness.extend(np.full(len(bonds), np.nan))
      bond_angle_plane.extend(np.full(len(bonds), np.nan))

      for j in range(len(bonds)):
        # get atom index numbers for this bond
        atom1 = bonds.atom_index_0.values[j]
        atom2 = bonds.atom_index_1.values[j]

        # get 3D coords
        atom1_coords = coords[:, atom1]
        atom2_coords = coords[:, atom2]

        # bond vector
        atom_vec = np.array([atom1_coords[0] - atom2_coords[0],
                              atom1_coords[1] - atom2_coords[1],
                              atom1_coords[2] - atom2_coords[2]])

        # get bond angles
        axis_angle = np.dot(axis_vector, atom_vec)/(np.linalg.norm(axis_vector)*np.linalg.norm(atom_vec))

        # standardise to degrees <= 90
        axis_angle = math.degrees(np.arccos(axis_angle))

        if axis_angle > 90:
            axis_angle = 180 - axis_angle

        axis_angle = 90 - axis_angle
        bond_angle_axis.append(axis_angle)

  df['num_atoms'] = np.asarray(size_list)
  df['flatness'] = np.asarray(flatness)
  df['bond_angle_plane'] = np.asarray(bond_angle_plane)
  df['bond_angle_axis'] = np.asarray(bond_angle_axis)

  return df

In [63]:
train = bond_features(train)
train.head(10)

  0%|          | 0/10398 [00:00<?, ?it/s]

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,...,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,num_atoms,flatness,bond_angle_plane,bond_angle_axis
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,5,0.443764,52.084398,34.460611
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,5,0.443764,50.858665,36.069177
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.091797,1.091797,0.0,1.0,3e-06,-1.091797,5,0.443764,13.909861,35.642512
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797,5,0.443764,13.124885,34.877747
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,-11.03125,0.353027,2.857422,-3.433594,...,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203,5,0.443764,73.351062,0.804265
5,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,5,0.443764,39.073948,44.696355
6,5,dsgdb9nsd_000001,2,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-3.8e-05,1.0,1.4e-05,-1.783203,5,0.443764,19.131573,45.838738
7,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-3.7e-05,1.0,1.4e-05,-1.783203,5,0.443764,20.124219,44.160617
8,6,dsgdb9nsd_000001,2,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-2.9e-05,1.0,1.4e-05,-1.783203,5,0.443764,37.879628,45.292858
9,8,dsgdb9nsd_000001,3,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,5,0.443764,16.634444,0.38238


In [64]:
test = bond_features(test)
test.head(10)

  0%|          | 0/16057 [00:00<?, ?it/s]

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_index_x,atom_0,x_0,y_0,z_0,...,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,num_atoms,flatness,bond_angle_plane,bond_angle_axis
0,4659076,dsgdb9nsd_000004,2,0,2JHC,2,H,-1.662109,0.0,1.0,...,1.662109,1.0625,-1.199219,0.469727,0.692383,-1.569336,4,0.0,0.0,90.0
1,4659079,dsgdb9nsd_000004,3,0,1JHC,3,H,1.662109,0.0,1.0,...,1.662109,1.0625,0.0,1.0,0.692383,-0.369873,4,0.0,0.0,90.0
2,4659077,dsgdb9nsd_000004,2,1,1JHC,2,H,-1.662109,0.0,1.0,...,1.662109,1.0625,0.0,1.0,0.692383,-0.369873,4,0.0,0.0,90.0
3,4659080,dsgdb9nsd_000004,3,1,2JHC,3,H,1.662109,0.0,1.0,...,1.662109,1.0625,-1.199219,0.469727,0.692383,-1.569336,4,0.0,0.0,90.0
4,4659078,dsgdb9nsd_000004,2,3,3JHH,2,H,-1.662109,0.0,1.0,...,3.324219,3.324219,0.0,1.0,,,4,0.0,0.0,90.0
5,4659105,dsgdb9nsd_000015,3,0,1JHC,3,H,1.004883,1.810547,0.004658,...,1.973633,1.092773,-0.009476,0.991211,0.938477,-0.163818,9,0.396373,53.959417,3.242966
6,4659109,dsgdb9nsd_000015,4,0,1JHC,4,H,-0.546875,1.792969,-0.872559,...,1.973633,1.092773,-0.009476,0.991211,0.938477,-0.163818,9,0.396373,53.984077,3.218983
7,4659112,dsgdb9nsd_000015,5,0,1JHC,5,H,-0.529785,1.722656,0.911133,...,1.973633,1.092773,0.0,1.0,0.938477,-0.154419,9,0.396373,0.005501,51.26138
8,4659114,dsgdb9nsd_000015,6,0,3JHC,6,H,0.139893,-0.256104,-2.050781,...,1.973633,1.092773,-1.547852,0.413818,0.938477,-1.702148,9,0.396373,19.746777,65.364682
9,4659118,dsgdb9nsd_000015,7,0,3JHC,7,H,1.692383,-0.238647,-1.174805,...,1.973633,1.092773,-1.546875,0.414062,0.938477,-1.701172,9,0.396373,19.722896,65.361512


In [65]:
train.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso', 'atom_index_x',
       'atom_0', 'x_0', 'y_0', 'z_0', 'e_neg_x', 'rad_x', 'n_bonds_x',
       'bond_lengths_mean_x', 'atom_index_y', 'atom_1', 'x_1', 'y_1', 'z_1',
       'e_neg_y', 'rad_y', 'n_bonds_y', 'bond_lengths_mean_y',
       'mulliken_charge_0', 'mulliken_charge_1', 'dist', 'j_bond', 'j_type',
       'mu_0', 'mu_1', 'atom_0_en', 'atomic_mass_0', 'valence_electrons_0',
       'atom_1_en', 'atomic_mass_1', 'valence_electrons_1', 'delta_en',
       'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min',
       'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count',
       'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean',
       'molecule_atom_index_0_y_1_mean_diff',
       'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max',
       'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_s

In [66]:
j_types = train['type'].unique()

In [67]:
def select_features_pearson(j_type):
    df = train[train['type'] == j_type]

    non_feature_columns = ['scalar_coupling_constant', 'fc', 'type', 'id', 'molecule_name']
    atom_index_columns = ['atom_index_0', 'atom_index_1', 'atom_index_x', 'atom_index_y']

    X = df.select_dtypes(include=[np.number]).drop(non_feature_columns + atom_index_columns, axis=1, errors='ignore')
    y = df['scalar_coupling_constant']

    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)
    X = pd.DataFrame(X_imputed, columns=X.columns)

    X['target'] = y
    correlation_matrix = X.corr(method='pearson')
    target_correlation = correlation_matrix['target'].drop('target').sort_values(ascending=False)

    X.drop(columns='target', inplace=True)

    features = [(feature, corr) for feature, corr in target_correlation.items()]

    return features

In [68]:
selected_features = {}
for j_type in tqdm(j_types):
  selected_features[j_type] = select_features_pearson(j_type)

selected_features

  0%|          | 0/8 [00:00<?, ?it/s]

{'1JHC': [('molecule_atom_1_dist_std_diff', 0.035653514386119474),
  ('molecule_atom_1_dist_std', 0.03242600041970101),
  ('molecule_atom_index_0_y_1_max', 0.027359339307941703),
  ('y_0', 0.024714616598493937),
  ('molecule_atom_index_0_y_1_mean', 0.022921478717468878),
  ('y_1', 0.02162033769084715),
  ('molecule_atom_index_0_y_1_std', 0.018998752049341257),
  ('z_0', 0.016626233545529952),
  ('sd', 0.015971271982688107),
  ('z_1', 0.014393984480519581),
  ('bond_lengths_mean_y', 0.014106651864461405),
  ('mulliken_charge_1', 0.013115222628711246),
  ('mu_0', 0.011996392655290294),
  ('molecule_atom_index_0_dist_max_div', 0.011565815632044852),
  ('molecule_atom_index_0_dist_max_diff', 0.010240029453531444),
  ('bond_angle_axis', 0.00997315064575951),
  ('molecule_atom_index_0_dist_max', 0.009496989301081714),
  ('molecule_atom_index_0_dist_mean_div', 0.008327222204831381),
  ('mu_1', 0.00809173482518032),
  ('molecule_atom_index_0_dist_mean_diff', 0.006866255208771129),
  ('atom_0_c

In [77]:
threshold = 0.05
correlation_values = [abs(corr) for _, corr in selected_features['1JHC']]
median_corr = np.median(correlation_values)
std_corr = np.std(correlation_values)
if median_corr - std_corr > 0:
    threshold = median_corr - std_corr

threshold

0.05

In [113]:
drop_features = {}

for j_type, features in selected_features.items():
  '''correlation_values = [abs(corr) for _, corr in features if not pd.isnull(corr)]
  if correlation_values:
      median_corr = np.median(correlation_values)
      std_corr = np.std(correlation_values)
      threshold = max(0.01, median_corr - (0.5 * std_corr))
  else:
      continue'''

  drop_features[j_type] = set()
  for feature, correlation in features:
    if pd.isnull(correlation):
        drop_features[j_type].add(feature)

drop_features

{'1JHC': {'atom_0_en',
  'atom_1_en',
  'atomic_mass_0',
  'atomic_mass_1',
  'delta_en',
  'e_neg_x',
  'e_neg_y',
  'molecule_atom_index_0_dist_min_diff',
  'molecule_atom_index_0_dist_min_div',
  'n_bonds_x',
  'rad_x',
  'rad_y',
  'valence_electrons_0',
  'valence_electrons_1'},
 '2JHH': {'atom_0_en',
  'atom_1_en',
  'atomic_mass_0',
  'atomic_mass_1',
  'delta_en',
  'e_neg_x',
  'e_neg_y',
  'n_bonds_x',
  'n_bonds_y',
  'rad_x',
  'rad_y',
  'valence_electrons_0',
  'valence_electrons_1'},
 '1JHN': {'atom_0_en',
  'atom_1_en',
  'atomic_mass_0',
  'atomic_mass_1',
  'delta_en',
  'e_neg_x',
  'e_neg_y',
  'molecule_atom_index_0_dist_min_diff',
  'molecule_atom_index_0_dist_min_div',
  'n_bonds_x',
  'rad_x',
  'rad_y',
  'valence_electrons_0',
  'valence_electrons_1'},
 '2JHN': {'atom_0_en',
  'atom_1_en',
  'atomic_mass_0',
  'atomic_mass_1',
  'delta_en',
  'e_neg_x',
  'e_neg_y',
  'n_bonds_x',
  'rad_x',
  'rad_y',
  'valence_electrons_0',
  'valence_electrons_1'},
 '2JHC'

In [114]:
for j_type in j_types:
  print(f'number of remaining features for {j_type}: {len(selected_features[j_type]) - len(drop_features[j_type])}')

number of remaining features for 1JHC: 63
number of remaining features for 2JHH: 64
number of remaining features for 1JHN: 63
number of remaining features for 2JHN: 65
number of remaining features for 2JHC: 65
number of remaining features for 3JHH: 64
number of remaining features for 3JHC: 65
number of remaining features for 3JHN: 65


In [84]:
def encode(df):
  df = OrdinalEncoder.fit_transform(OrdinalEncoder(df), df)
  return df
def imp(df):
  df = SimpleImputer.fit_transform(SimpleImputer(df), df)
  return df

In [70]:
data_1JHC = train[train['type'] == '1JHC']
data_1JHN = train[train['type'] == '1JHN']
data_2JHH = train[train['type'] == '2JHH']
data_2JHN = train[train['type'] == '2JHN']
data_2JHC = train[train['type'] == '2JHC']
data_3JHH = train[train['type'] == '3JHH']
data_3JHN = train[train['type'] == '3JHN']
data_3JHC = train[train['type'] == '3JHC']

In [73]:
train_1JHC, val_1JHC = train_test_split(data_1JHC, shuffle=False, random_state=47)

train_1JHC_molecules = train_1JHC['molecule_name'].unique()
val_1JHC_molecules = np.delete(val_1JHC['molecule_name'].unique(), 0)

train_1JHC = train_1JHC[train_1JHC['molecule_name'].isin(train_1JHC_molecules)]
val_1JHC = val_1JHC[val_1JHC['molecule_name'].isin(val_1JHC_molecules)]

train_1JHN, val_1JHN = train_test_split(data_1JHN, shuffle=False, random_state=47)

train_1JHN_molecules = train_1JHN['molecule_name'].unique()
val_1JHN_molecules = np.delete(val_1JHN['molecule_name'].unique(), 0)

train_1JHN = train_1JHN[train_1JHN['molecule_name'].isin(train_1JHN_molecules)]
val_1JHN = val_1JHN[val_1JHN['molecule_name'].isin(val_1JHN_molecules)]

train_2JHH, val_2JHH = train_test_split(data_2JHH, shuffle=False, random_state=47)

train_2JHH_molecules = train_2JHH['molecule_name'].unique()
val_2JHH_molecules = np.delete(val_2JHH['molecule_name'].unique(), 0)

train_2JHH = train_2JHH[train_2JHH['molecule_name'].isin(train_2JHH_molecules)]
val_2JHH = val_2JHH[val_2JHH['molecule_name'].isin(val_2JHH_molecules)]

train_2JHN, val_2JHN = train_test_split(data_2JHN, shuffle=False, random_state=47)

train_2JHN_molecules = train_2JHN['molecule_name'].unique()
val_2JHN_molecules = np.delete(val_2JHN['molecule_name'].unique(), 0)

train_2JHN = train_2JHN[train_2JHN['molecule_name'].isin(train_2JHN_molecules)]
val_2JHN = val_2JHN[val_2JHN['molecule_name'].isin(val_2JHN_molecules)]

train_2JHC, val_2JHC = train_test_split(data_2JHC, shuffle=False, random_state=47)

train_2JHC_molecules = train_2JHC['molecule_name'].unique()
val_2JHC_molecules = np.delete(val_2JHC['molecule_name'].unique(), 0)

train_2JHC = train_2JHC[train_2JHC['molecule_name'].isin(train_2JHC_molecules)]
val_2JHC = val_2JHC[val_2JHC['molecule_name'].isin(val_2JHC_molecules)]

train_3JHH, val_3JHH = train_test_split(data_3JHH, shuffle=False, random_state=47)

train_3JHH_molecules = train_3JHH['molecule_name'].unique()
val_3JHH_molecules = np.delete(val_3JHH['molecule_name'].unique(), 0)

train_3JHH = train_3JHH[train_3JHH['molecule_name'].isin(train_3JHH_molecules)]
val_3JHH = val_3JHH[val_3JHH['molecule_name'].isin(val_3JHH_molecules)]

train_3JHC, val_3JHC = train_test_split(data_3JHC, shuffle=False, random_state=47)

train_3JHC_molecules = train_3JHC['molecule_name'].unique()
val_3JHC_molecules = np.delete(val_3JHC['molecule_name'].unique(), 0)

train_3JHC = train_3JHC[train_3JHC['molecule_name'].isin(train_3JHC_molecules)]
val_3JHC = val_3JHC[val_3JHC['molecule_name'].isin(val_3JHC_molecules)]

train_3JHN, val_3JHN = train_test_split(data_3JHN, shuffle=False, random_state=47)

train_3JHN_molecules = train_3JHN['molecule_name'].unique()
val_3JHN_molecules = np.delete(val_3JHN['molecule_name'].unique(), 0)

train_3JHN = train_3JHN[train_3JHN['molecule_name'].isin(train_3JHN_molecules)]
val_3JHN = val_3JHN[val_3JHN['molecule_name'].isin(val_3JHN_molecules)]

In [143]:
features_1JHC = [feature for feature,_ in selected_features['1JHC'] if feature not in drop_features['1JHC']]
features_1JHN = [feature for feature,_ in selected_features['1JHN'] if feature not in drop_features['1JHN']]
features_2JHH = [feature for feature,_ in selected_features['2JHH'] if feature not in drop_features['2JHH']]
features_2JHN = [feature for feature,_ in selected_features['2JHN'] if feature not in drop_features['2JHN']]
features_2JHC = [feature for feature,_ in selected_features['2JHC'] if feature not in drop_features['2JHC']]
features_3JHH = [feature for feature,_ in selected_features['3JHH'] if feature not in drop_features['3JHH']]
features_3JHC = [feature for feature,_ in selected_features['3JHC'] if feature not in drop_features['3JHC']]
features_3JHN = [feature for feature,_ in selected_features['3JHN'] if feature not in drop_features['3JHN']]

In [144]:
X_train_1JHC = train_1JHC[features_1JHC]
X_train_1JHN = train_1JHN[features_1JHN]
X_train_2JHH = train_2JHH[features_2JHH]
X_train_2JHN = train_2JHN[features_2JHN]
X_train_2JHC = train_2JHC[features_2JHC]
X_train_3JHH = train_3JHH[features_3JHH]
X_train_3JHC = train_3JHC[features_3JHC]
X_train_3JHN = train_3JHN[features_3JHN]

'''X_train_1JHC = train_1JHC
X_train_1JHN = train_1JHN
X_train_2JHH = train_2JHH
X_train_2JHN = train_2JHN
X_train_2JHC = train_2JHC
X_train_3JHH = train_3JHH
X_train_3JHC = train_3JHC
X_train_3JHN = train_3JHN'''

y_train_1JHC = train_1JHC['scalar_coupling_constant']
y_train_1JHN = train_1JHN['scalar_coupling_constant']
y_train_2JHH = train_2JHH['scalar_coupling_constant']
y_train_2JHN = train_2JHN['scalar_coupling_constant']
y_train_2JHC = train_2JHC['scalar_coupling_constant']
y_train_3JHH = train_3JHH['scalar_coupling_constant']
y_train_3JHC = train_3JHC['scalar_coupling_constant']
y_train_3JHN = train_3JHN['scalar_coupling_constant']

X_val_1JHC = val_1JHC[features_1JHC]
X_val_1JHN = val_1JHN[features_1JHN]
X_val_2JHH = val_2JHH[features_2JHH]
X_val_2JHN = val_2JHN[features_2JHN]
X_val_2JHC = val_2JHC[features_2JHC]
X_val_3JHH = val_3JHH[features_3JHH]
X_val_3JHC = val_3JHC[features_3JHC]
X_val_3JHN = val_3JHN[features_3JHN]

'''X_val_1JHC = val_1JHC
X_val_1JHN = val_1JHN
X_val_2JHH = val_2JHH
X_val_2JHN = val_2JHN
X_val_2JHC = val_2JHC
X_val_3JHH = val_3JHH
X_val_3JHC = val_3JHC
X_val_3JHN = val_3JHN'''

y_val_1JHC = val_1JHC['scalar_coupling_constant']
y_val_1JHN = val_1JHN['scalar_coupling_constant']
y_val_2JHH = val_2JHH['scalar_coupling_constant']
y_val_2JHN = val_2JHN['scalar_coupling_constant']
y_val_2JHC = val_2JHC['scalar_coupling_constant']
y_val_3JHH = val_3JHH['scalar_coupling_constant']
y_val_3JHC = val_3JHC['scalar_coupling_constant']
y_val_3JHN = val_3JHN['scalar_coupling_constant']

In [148]:
def eval_metric(y_true, y_pred):
  metric_value = np.sqrt(np.mean(np.square(y_true - y_pred)))
  return 'rmse', metric_value, False

def score_metric(y_true, y_pred, sample_weight):
  return np.log(np.abs(y_true - y_pred).mean())

In [117]:
features_1JHC

['molecule_atom_1_dist_std_diff',
 'molecule_atom_1_dist_std',
 'molecule_atom_index_0_y_1_max',
 'y_0',
 'molecule_atom_index_0_y_1_mean',
 'y_1',
 'molecule_atom_index_0_y_1_std',
 'z_0',
 'sd',
 'z_1',
 'bond_lengths_mean_y',
 'mulliken_charge_1',
 'mu_0',
 'molecule_atom_index_0_dist_max_div',
 'molecule_atom_index_0_dist_max_diff',
 'bond_angle_axis',
 'molecule_atom_index_0_dist_max',
 'molecule_atom_index_0_dist_mean_div',
 'mu_1',
 'molecule_atom_index_0_dist_mean_diff',
 'atom_0_couples_count',
 'molecule_atom_index_0_dist_mean',
 'molecule_dist_mean',
 'n_bonds_y',
 'molecule_atom_index_0_y_1_max_diff',
 'molecule_atom_index_0_dist_std_diff',
 'molecule_atom_index_0_dist_std_div',
 'dso',
 'mulliken_charge_0',
 'molecule_atom_index_0_dist_std',
 'pso',
 'molecule_atom_index_1_dist_std_diff',
 'molecule_atom_index_1_dist_min_diff',
 'molecule_atom_index_1_dist_min_div',
 'molecule_atom_index_1_dist_std_div',
 'molecule_atom_index_1_dist_std',
 'molecule_atom_1_dist_min_diff',


In [124]:
X_val_1JHC.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,...,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,num_atoms,flatness,bond_angle_plane,bond_angle_axis
374780,374780,dsgdb9nsd_012706,8,0,1JHC,86.375,84.625,0.180908,0.745605,0.816895,...,2.261719,1.091797,-0.001246,0.999023,0.754883,-0.337646,18,0.764094,50.739832,8.279127
374781,374786,dsgdb9nsd_012706,9,0,1JHC,81.4375,79.625,0.169434,0.872559,0.73584,...,2.261719,1.091797,-0.001886,0.998047,0.754883,-0.338135,18,0.764094,19.023345,27.137499
374782,374791,dsgdb9nsd_012706,10,0,1JHC,82.875,81.125,0.1698,0.839844,0.75,...,2.261719,1.091797,-0.00137,0.998535,0.754883,-0.337646,18,0.764094,40.665098,36.801487
374788,374796,dsgdb9nsd_012706,11,1,1JHC,85.3125,84.1875,0.023697,-0.216431,1.301758,...,2.261719,1.091797,-0.010101,0.990723,0.754883,-0.346436,18,0.764094,51.203835,9.409904
374802,374825,dsgdb9nsd_012706,16,6,1JHC,86.375,85.1875,0.047638,-0.16394,1.333008,...,2.261719,1.091797,-0.004246,0.996094,0.754883,-0.340576,18,0.764094,83.006585,6.398766


In [125]:
y_val_1JHC

374780     86.3750
374781     81.4375
374782     82.8750
374788     85.3125
374802     86.3750
            ...   
499964    117.5000
499971    107.3125
499972    113.3750
499986    101.8750
499990     85.1875
Name: scalar_coupling_constant, Length: 19975, dtype: float16

In [151]:
fit_params={'eval_metric' : eval_metric,
            'eval_set' : [(X_val_1JHC, y_val_1JHC)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            'callbacks': [lgb.early_stopping(stopping_rounds=30, verbose=True)]}

param_test ={'num_leaves': [x for x in range(20, 80, 10)],
             'min_child_samples': [x for x in range(200, 400, 20)],
             'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 10],
             'subsample': uniform(loc=0.6, scale=0.4),
             'colsample_bytree': uniform(loc=0.6, scale=0.4),
             'reg_alpha': [0, 1e-1, 1, 5, 10],
             'reg_lambda': [0, 1e-1, 1, 5, 10],
             'learning_rate': uniform(loc=0.01, scale=0.2)}

clf_1JHC = lgb.LGBMRegressor(max_depth=-1, random_state=47, n_jobs=-1, n_estimators=1000, verbose=-1)

gs_1JHC = RandomizedSearchCV(
    estimator=clf_1JHC,
    param_distributions=param_test,
    n_iter=10,
    cv=3,
    refit=True,
    random_state=47,
    scoring='neg_mean_squared_error')

gs_1JHC.fit(X_train_1JHC, y_train_1JHC, **fit_params)

final_params_1JHC = gs_1JHC.best_params_
final_params_1JHC.update({
    'n_estimators': 2000,
    'max_depth': -1,
    'random_state': 47,
    'n_jobs': -1,
    'verbose': -1
})

clf_1JHC_final = lgb.LGBMRegressor(**final_params_1JHC)

clf_1JHC_final.fit(X_train_1JHC, y_train_1JHC)

y_pred_1JHC = clf_1JHC_final.predict(X_val_1JHC)

pred_vs_actual_1JHC = pd.DataFrame(data={
    'predictions': y_pred_1JHC,
    'actual': y_val_1JHC
})

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[999]	valid's l2: 6.53024	valid's rmse: 2.55543
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 5.05236	valid's rmse: 2.24774
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 5.22877	valid's rmse: 2.28665
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 6.04289	valid's rmse: 2.45823
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 4.77893	valid's rmse: 2.18608
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 4.97425	valid's rmse: 2.2303
Training until validation scores don't improve for 30 

In [150]:
fit_params={'eval_metric' : eval_metric,
            'eval_set' : [(X_val_1JHN, y_val_1JHN)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            'callbacks': [lgb.early_stopping(stopping_rounds=30, verbose=True)]}

param_test ={'num_leaves': [x for x in range(20, 80, 10)],
             'min_child_samples': [x for x in range(200, 400, 20)],
             'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 10],
             'subsample': uniform(loc=0.6, scale=0.4),
             'colsample_bytree': uniform(loc=0.6, scale=0.4),
             'reg_alpha': [0, 1e-1, 1, 5, 10],
             'reg_lambda': [0, 1e-1, 1, 5, 10],
             'learning_rate': uniform(loc=0.01, scale=0.2)}

clf_1JHN = lgb.LGBMRegressor(max_depth=-1, random_state=47, n_jobs=-1, n_estimators=1000, verbose=-1)

gs_1JHN = RandomizedSearchCV(
    estimator=clf_1JHN,
    param_distributions=param_test,
    n_iter=10,
    cv=3,
    refit=True,
    random_state=47,
    scoring='neg_mean_squared_error')

gs_1JHN.fit(X_train_1JHN, y_train_1JHN, **fit_params)

final_params_1JHN = gs_1JHN.best_params_
final_params_1JHN.update({
    'n_estimators': 2000,
    'max_depth': -1,
    'random_state': 47,
    'n_jobs': -1,
    'verbose': -1
})

clf_1JHN_final = lgb.LGBMRegressor(**final_params_1JHN)

clf_1JHN_final.fit(X_train_1JHN, y_train_1JHN)

y_pred_1JHN = clf_1JHN_final.predict(X_val_1JHN)

pred_vs_actual_1JHN = pd.DataFrame(data={
    'predictions': y_pred_1JHN,
    'actual': y_val_1JHN
})

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[702]	valid's l2: 2.27396	valid's rmse: 1.50796
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.25596	valid's rmse: 1.1207
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[590]	valid's l2: 1.93712	valid's rmse: 1.3918
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.85282	valid's rmse: 1.36118
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.18849	valid's rmse: 1.09018
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[992]	valid's l2: 1.71856	valid's rmse: 1.31094
Training until validation scores don't improve for 30 rounds
Early stopping, best i

In [152]:
fit_params={'eval_metric' : eval_metric,
            'eval_set' : [(X_val_2JHH, y_val_2JHH)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            'callbacks': [lgb.early_stopping(stopping_rounds=30, verbose=True)]}

param_test ={'num_leaves': [x for x in range(20, 80, 10)],
             'min_child_samples': [x for x in range(200, 400, 20)],
             'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 10],
             'subsample': uniform(loc=0.6, scale=0.4),
             'colsample_bytree': uniform(loc=0.6, scale=0.4),
             'reg_alpha': [0, 1e-1, 1, 5, 10],
             'reg_lambda': [0, 1e-1, 1, 5, 10],
             'learning_rate': uniform(loc=0.01, scale=0.2)}

clf_2JHH = lgb.LGBMRegressor(max_depth=-1, random_state=47, n_jobs=-1, n_estimators=1000, verbose=-1)

gs_2JHH = RandomizedSearchCV(
    estimator=clf_2JHH,
    param_distributions=param_test,
    n_iter=10,
    cv=3,
    refit=True,
    random_state=47,
    scoring='neg_mean_squared_error')

gs_2JHH.fit(X_train_2JHH, y_train_2JHH, **fit_params)

final_params_2JHH = gs_2JHH.best_params_
final_params_2JHH.update({
    'n_estimators': 2000,
    'max_depth': -1,
    'random_state': 47,
    'n_jobs': -1,
    'verbose': -1
})

clf_2JHH_final = lgb.LGBMRegressor(**final_params_2JHH)

clf_2JHH_final.fit(X_train_2JHH, y_train_2JHH)

y_pred_2JHH = clf_2JHH_final.predict(X_val_2JHH)

pred_vs_actual_2JHH = pd.DataFrame(data={
    'predictions': y_pred_2JHH,
    'actual': y_val_2JHH
})

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.566877	valid's rmse: 0.752913
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[998]	valid's l2: 0.436094	valid's rmse: 0.660374
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.451709	valid's rmse: 0.672093
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[998]	valid's l2: 0.530732	valid's rmse: 0.728513
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.40714	valid's rmse: 0.638076
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[997]	valid's l2: 0.408738	valid's rmse: 0.639326
Training until validation scores don't impro

In [153]:
fit_params={'eval_metric' : eval_metric,
            'eval_set' : [(X_val_2JHN, y_val_2JHN)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            'callbacks': [lgb.early_stopping(stopping_rounds=30, verbose=True)]}

param_test ={'num_leaves': [x for x in range(20, 80, 10)],
             'min_child_samples': [x for x in range(200, 400, 20)],
             'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 10],
             'subsample': uniform(loc=0.6, scale=0.4),
             'colsample_bytree': uniform(loc=0.6, scale=0.4),
             'reg_alpha': [0, 1e-1, 1, 5, 10],
             'reg_lambda': [0, 1e-1, 1, 5, 10],
             'learning_rate': uniform(loc=0.01, scale=0.2)}

clf_2JHN = lgb.LGBMRegressor(max_depth=-1, random_state=47, n_jobs=-1, n_estimators=1000, verbose=-1)

gs_2JHN = RandomizedSearchCV(
    estimator=clf_2JHN,
    param_distributions=param_test,
    n_iter=10,
    cv=3,
    refit=True,
    random_state=47,
    scoring='neg_mean_squared_error')

gs_2JHN.fit(X_train_2JHN, y_train_2JHN, **fit_params)

final_params_2JHN = gs_2JHN.best_params_
final_params_2JHN.update({
    'n_estimators': 2000,
    'max_depth': -1,
    'random_state': 47,
    'n_jobs': -1,
    'verbose': -1
})

clf_2JHN_final = lgb.LGBMRegressor(**final_params_2JHN)

clf_2JHN_final.fit(X_train_2JHN, y_train_2JHN)

y_pred_2JHN = clf_2JHN_final.predict(X_val_2JHN)

pred_vs_actual_2JHN = pd.DataFrame(data={
    'predictions': y_pred_2JHN,
    'actual': y_val_2JHN
})

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.21849	valid's rmse: 1.10385
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.949098	valid's rmse: 0.974217
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[998]	valid's l2: 0.966449	valid's rmse: 0.983081
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.11497	valid's rmse: 1.05592
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[999]	valid's l2: 0.832401	valid's rmse: 0.91236
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[999]	valid's l2: 0.860106	valid's rmse: 0.927419
Training until validation scores don't improve f

In [155]:
fit_params={'eval_metric' : eval_metric,
            'eval_set' : [(X_val_2JHC, y_val_2JHC)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            'callbacks': [lgb.early_stopping(stopping_rounds=30, verbose=True)]}

param_test ={'num_leaves': [x for x in range(20, 80, 10)],
             'min_child_samples': [x for x in range(200, 400, 20)],
             'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 10],
             'subsample': uniform(loc=0.6, scale=0.4),
             'colsample_bytree': uniform(loc=0.6, scale=0.4),
             'reg_alpha': [0, 1e-1, 1, 5, 10],
             'reg_lambda': [0, 1e-1, 1, 5, 10],
             'learning_rate': uniform(loc=0.01, scale=0.2)}

clf_2JHC = lgb.LGBMRegressor(max_depth=-1, random_state=47, n_jobs=-1, n_estimators=1000, verbose=-1)

gs_2JHC = RandomizedSearchCV(
    estimator=clf_2JHC,
    param_distributions=param_test,
    n_iter=10,
    cv=3,
    refit=True,
    random_state=47,
    scoring='neg_mean_squared_error')

gs_2JHC.fit(X_train_2JHC, y_train_2JHC, **fit_params)

final_params_2JHC = gs_2JHC.best_params_
final_params_2JHC.update({
    'n_estimators': 2000,
    'max_depth': -1,
    'random_state': 47,
    'n_jobs': -1,
    'verbose': -1
})

clf_2JHC_final = lgb.LGBMRegressor(**final_params_2JHC)

clf_2JHC_final.fit(X_train_2JHC, y_train_2JHC)

y_pred_2JHC = clf_2JHC_final.predict(X_val_2JHC)

pred_vs_actual_2JHC = pd.DataFrame(data={
    'predictions': y_pred_2JHC,
    'actual': y_val_2JHC
})

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.7535	valid's rmse: 1.3242
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.47581	valid's rmse: 1.21483
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.5769	valid's rmse: 1.25575
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[998]	valid's l2: 1.66873	valid's rmse: 1.29179
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.41613	valid's rmse: 1.19001
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.502	valid's rmse: 1.22556
Training until validation scores don't improve for 30 roun

In [156]:
fit_params={'eval_metric' : eval_metric,
            'eval_set' : [(X_val_3JHH, y_val_3JHH)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            'callbacks': [lgb.early_stopping(stopping_rounds=30, verbose=True)]}

param_test ={'num_leaves': [x for x in range(20, 80, 10)],
             'min_child_samples': [x for x in range(200, 400, 20)],
             'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 10],
             'subsample': uniform(loc=0.6, scale=0.4),
             'colsample_bytree': uniform(loc=0.6, scale=0.4),
             'reg_alpha': [0, 1e-1, 1, 5, 10],
             'reg_lambda': [0, 1e-1, 1, 5, 10],
             'learning_rate': uniform(loc=0.01, scale=0.2)}

clf_3JHH = lgb.LGBMRegressor(max_depth=-1, random_state=47, n_jobs=-1, n_estimators=1000, verbose=-1)

gs_3JHH = RandomizedSearchCV(
    estimator=clf_3JHH,
    param_distributions=param_test,
    n_iter=10,
    cv=3,
    refit=True,
    random_state=47,
    scoring='neg_mean_squared_error')

gs_3JHH.fit(X_train_3JHH, y_train_3JHH, **fit_params)

final_params_3JHH = gs_3JHH.best_params_
final_params_3JHH.update({
    'n_estimators': 2000,
    'max_depth': -1,
    'random_state': 47,
    'n_jobs': -1,
    'verbose': -1
})

clf_3JHH_final = lgb.LGBMRegressor(**final_params_3JHH)

clf_3JHH_final.fit(X_train_3JHH, y_train_3JHH)

y_pred_3JHH = clf_3JHH_final.predict(X_val_3JHH)

pred_vs_actual_3JHH = pd.DataFrame(data={
    'predictions': y_pred_3JHH,
    'actual': y_val_3JHH
})

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.626031	valid's rmse: 0.791221
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.486618	valid's rmse: 0.69758
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.502022	valid's rmse: 0.708535
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.588667	valid's rmse: 0.767246
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.461285	valid's rmse: 0.679179
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 0.466018	valid's rmse: 0.682655
Training until validation scores don't im

In [157]:
fit_params={'eval_metric' : eval_metric,
            'eval_set' : [(X_val_3JHC, y_val_3JHC)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            'callbacks': [lgb.early_stopping(stopping_rounds=30, verbose=True)]}

param_test ={'num_leaves': [x for x in range(20, 80, 10)],
             'min_child_samples': [x for x in range(200, 400, 20)],
             'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 10],
             'subsample': uniform(loc=0.6, scale=0.4),
             'colsample_bytree': uniform(loc=0.6, scale=0.4),
             'reg_alpha': [0, 1e-1, 1, 5, 10],
             'reg_lambda': [0, 1e-1, 1, 5, 10],
             'learning_rate': uniform(loc=0.01, scale=0.2)}

clf_3JHC = lgb.LGBMRegressor(max_depth=-1, random_state=47, n_jobs=-1, n_estimators=1000, verbose=-1)

gs_3JHC = RandomizedSearchCV(
    estimator=clf_3JHC,
    param_distributions=param_test,
    n_iter=10,
    cv=3,
    refit=True,
    random_state=47,
    scoring='neg_mean_squared_error')

gs_3JHC.fit(X_train_3JHC, y_train_3JHC, **fit_params)

final_params_3JHC = gs_3JHC.best_params_
final_params_3JHC.update({
    'n_estimators': 2000,
    'max_depth': -1,
    'random_state': 47,
    'n_jobs': -1,
    'verbose': -1
})

clf_3JHC_final = lgb.LGBMRegressor(**final_params_3JHC)

clf_3JHC_final.fit(X_train_3JHC, y_train_3JHC)

y_pred_3JHC = clf_3JHC_final.predict(X_val_3JHC)

pred_vs_actual_3JHC = pd.DataFrame(data={
    'predictions': y_pred_3JHC,
    'actual': y_val_3JHC
})

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.69435	valid's rmse: 1.30167
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[999]	valid's l2: 1.41022	valid's rmse: 1.18753
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[997]	valid's l2: 1.45088	valid's rmse: 1.20453
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.66983	valid's rmse: 1.29222
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.414	valid's rmse: 1.18912
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid's l2: 1.42332	valid's rmse: 1.19303
Training until validation scores don't improve for 30 ro

In [158]:
fit_params={'eval_metric' : eval_metric,
            'eval_set' : [(X_val_3JHN, y_val_3JHN)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            'callbacks': [lgb.early_stopping(stopping_rounds=30, verbose=True)]}

param_test ={'num_leaves': [x for x in range(20, 80, 10)],
             'min_child_samples': [x for x in range(200, 400, 20)],
             'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 10],
             'subsample': uniform(loc=0.6, scale=0.4),
             'colsample_bytree': uniform(loc=0.6, scale=0.4),
             'reg_alpha': [0, 1e-1, 1, 5, 10],
             'reg_lambda': [0, 1e-1, 1, 5, 10],
             'learning_rate': uniform(loc=0.01, scale=0.2)}

clf_3JHN = lgb.LGBMRegressor(max_depth=-1, random_state=47, n_jobs=-1, n_estimators=1000, verbose=-1)

gs_3JHN = RandomizedSearchCV(
    estimator=clf_3JHN,
    param_distributions=param_test,
    n_iter=10,
    cv=3,
    refit=True,
    random_state=47,
    scoring='neg_mean_squared_error')

gs_3JHN.fit(X_train_3JHN, y_train_3JHN, **fit_params)

final_params_3JHN = gs_3JHN.best_params_
final_params_3JHN.update({
    'n_estimators': 2000,
    'max_depth': -1,
    'random_state': 47,
    'n_jobs': -1,
    'verbose': -1
})

clf_3JHN_final = lgb.LGBMRegressor(**final_params_3JHN)

clf_3JHN_final.fit(X_train_3JHN, y_train_3JHN)

y_pred_3JHN = clf_3JHN_final.predict(X_val_3JHN)

pred_vs_actual_3JHN = pd.DataFrame(data={
    'predictions': y_pred_3JHN,
    'actual': y_val_3JHN
})

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[562]	valid's l2: 0.431437	valid's rmse: 0.656839
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[842]	valid's l2: 0.348429	valid's rmse: 0.590279
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[775]	valid's l2: 0.420516	valid's rmse: 0.648472
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[994]	valid's l2: 0.399601	valid's rmse: 0.63214
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[978]	valid's l2: 0.325362	valid's rmse: 0.570405
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[970]	valid's l2: 0.388827	valid's rmse: 0.62356
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[792