<a href="https://colab.research.google.com/github/tram-tr/predicting-molecular-properties/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# need authorization code to mount drive
from google.colab import drive
drive.mount('/content/drive')
%env KAGGLE_CONFIG_DIR=/content/drive/My Drive/Colab Notebooks/Kaggle

Mounted at /content/drive
env: KAGGLE_CONFIG_DIR=/content/drive/My Drive/Colab Notebooks/Kaggle


In [None]:
!pip install kaggle
!kaggle competitions download -c champs-scalar-coupling

Downloading champs-scalar-coupling.zip to /content
100% 376M/377M [00:03<00:00, 145MB/s]
100% 377M/377M [00:03<00:00, 110MB/s]


In [None]:
#!unzip /content/champs-scalar-coupling.zip

In [None]:
!pip install ase

Collecting ase
  Downloading ase-3.22.1-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ase
Successfully installed ase-3.22.1


In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import math
from numpy.linalg import svd
import ase
from ase import Atoms
import ase.visualize
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
warnings.filterwarnings('ignore')

In [None]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
train = pd.read_csv('train.csv', nrows=500_000)
structures = pd.read_csv('structures.csv')
contributions = pd.read_csv('scalar_coupling_contributions.csv')

In [None]:
def visualize(molecule_name):
  # select a molecule
  molecule = structures[structures['molecule_name'] == molecule_name]

  # get atomic coordinates
  xcart = molecule.iloc[:, 3:].values

  # get atomic symbols
  symbols = molecule.iloc[:, 2].values

  # display molecule
  system = Atoms(positions=xcart, symbols=symbols)

  print('Molecule Name: %s.' %molecule_name)

  return ase.visualize.view(system, viewer="x3d")

In [None]:
train.head(20)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095


In [None]:
structures.head(10)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602


In [None]:
contributions.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,fc,sd,pso,dso
0,dsgdb9nsd_000001,1,0,1JHC,83.0224,0.254579,1.25862,0.27201
1,dsgdb9nsd_000001,1,2,2JHH,-11.0347,0.352978,2.85839,-3.4336
2,dsgdb9nsd_000001,1,3,2JHH,-11.0325,0.352944,2.85852,-3.43387
3,dsgdb9nsd_000001,1,4,2JHH,-11.0319,0.352934,2.85855,-3.43393
4,dsgdb9nsd_000001,2,0,1JHC,83.0222,0.254585,1.25861,0.272013


In [None]:
train = pd.merge(train, contributions, how = 'left',
                  left_on  = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
                  right_on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

In [None]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,83.0224,0.254579,1.25862,0.27201
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,-11.0347,0.352978,2.85839,-3.4336
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,-11.0325,0.352944,2.85852,-3.43387
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,-11.0319,0.352934,2.85855,-3.43393
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,83.0222,0.254585,1.25861,0.272013


In [None]:
molecule_names = np.unique(train.molecule_name.values.flatten())
atomic_radius = {'H':0.38, 'C':0.77, 'N':0.75, 'O':0.73, 'F':0.71}
fudge_factor = 0.05
atomic_radius_fudge = {k:v + fudge_factor for k,v in atomic_radius.items()}
electronegativity = {'H':2.2, 'C':2.55, 'N':3.04, 'O':3.44, 'F':3.98}
max_atoms = 28

In [None]:
atoms = structures['atom'].values
atoms_en = [electronegativity[x] for x in tqdm(atoms)]
atoms_rad = [atomic_radius[x] for x in tqdm(atoms)]
structures['e_neg'] = atoms_en
structures['rad'] = atoms_rad

  0%|          | 0/2358875 [00:00<?, ?it/s]

  0%|          | 0/2358875 [00:00<?, ?it/s]

In [None]:
structures.head(10)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,e_neg,rad
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.77
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.38
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.38
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.38
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.38
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.75
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.38
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.38
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.38
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.73


In [None]:
atom_idx = structures['atom_index'].values
coords = structures[['x', 'y', 'z']].values
molecules = structures['molecule_name'].values
rads = structures['rad'].values
n_rows = np.arange(len(structures))

# calculate bond distances
bonds = np.zeros((len(structures) + 1, max_atoms + 1), dtype=np.int8)
bond_dists = np.zeros((len(structures) + 1, max_atoms + 1), dtype=np.float32)

coords_temp = coords
molecules_temp = molecules
rads_temp = rads

for i in tqdm(range(max_atoms - 1)):
  coords_temp = np.roll(coords_temp, -1, axis=0)
  molecules_temp = np.roll(molecules_temp, -1, axis=0)
  rads_temp = np.roll(rads_temp, -1, axis=0)

  mask = np.where(molecules==molecules_temp, 1, 0)
  dists = np.linalg.norm(coords - coords_temp, axis=1) * mask
  rad_bond = rads + rads_temp

  bond = np.where(np.logical_and(dists > 0.0001, dists < rad_bond), 1, 0)

  source_row = n_rows
  target_row = source_row + i + 1
  target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row)

  source_atom = atom_idx
  target_atom = atom_idx + i + 1
  target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom)

  bonds[(source_row, target_atom)] = bond
  bonds[(target_row, source_atom)] = bond
  bond_dists[(source_row, target_atom)] = dists
  bond_dists[(target_row, source_atom)] = dists

bonds = np.delete(bonds, axis=0, obj=-1) # remove dummy row
bonds = np.delete(bonds, axis=1, obj=-1) # remove dummy col
bond_dists = np.delete(bond_dists, axis=0, obj=-1) # remove dummy row
bond_dists = np.delete(bond_dists, axis=1, obj=-1) # remove dummy col

bonds_numeric = [[i for i,x in enumerate(row) if x] for row in tqdm(bonds)]
bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate(tqdm(bond_dists))]
bond_lengths_mean = [ np.mean(x) for x in bond_lengths]
n_bonds = [len(x) for x in bonds_numeric]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/2358875 [00:00<?, ?it/s]

  0%|          | 0/2358875 [00:00<?, ?it/s]

In [None]:
bond_data = {'n_bonds':n_bonds, 'bond_lengths_mean': bond_lengths_mean }
bond_df = pd.DataFrame(bond_data)
structures = structures.join(bond_df)

In [None]:
structures.head(10)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,e_neg,rad,n_bonds,bond_lengths_mean
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,2.2,0.38,1,1.091953
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,2.2,0.38,1,1.091952
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,2.2,0.38,1,1.091946
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,2.2,0.38,1,1.091948
5,dsgdb9nsd_000002,0,N,-0.040426,1.024108,0.062564,3.04,0.75,3,1.017195
6,dsgdb9nsd_000002,1,H,0.017257,0.012545,-0.027377,2.2,0.38,1,1.01719
7,dsgdb9nsd_000002,2,H,0.915789,1.358745,-0.028758,2.2,0.38,1,1.017187
8,dsgdb9nsd_000002,3,H,-0.520278,1.343532,-0.775543,2.2,0.38,1,1.017208
9,dsgdb9nsd_000003,0,O,-0.03436,0.97754,0.007602,3.44,0.73,2,0.962107


In [None]:
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])

    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

train = map_atom_info(train, 0)
train = map_atom_info(train, 1)

In [None]:
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,...,bond_lengths_mean_x,atom_index_y,atom_1,x_1,y_1,z_1,e_neg_y,rad_y,n_bonds_y,bond_lengths_mean_y
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,83.0224,0.254579,1.25862,0.27201,...,1.091953,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,-11.0347,0.352978,2.85839,-3.4336,...,1.091953,2,H,1.011731,1.463751,0.000277,2.2,0.38,1,1.091952
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,-11.0325,0.352944,2.85852,-3.43387,...,1.091953,3,H,-0.540815,1.447527,-0.876644,2.2,0.38,1,1.091946
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,-11.0319,0.352934,2.85855,-3.43393,...,1.091953,4,H,-0.523814,1.437933,0.906397,2.2,0.38,1,1.091948
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,83.0222,0.254585,1.25861,0.272013,...,1.091952,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541,-11.0317,0.352932,2.85856,-3.43395,...,1.091952,3,H,-0.540815,1.447527,-0.876644,2.2,0.38,1,1.091946
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548,-11.0324,0.352943,2.85853,-3.43387,...,1.091952,4,H,-0.523814,1.437933,0.906397,2.2,0.38,1,1.091948
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093,83.0241,0.254634,1.25856,0.272012,...,1.091946,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543,-11.0319,0.352943,2.85856,-3.43393,...,1.091946,4,H,-0.523814,1.437933,0.906397,2.2,0.38,1,1.091948
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095,83.0243,0.254628,1.25856,0.272012,...,1.091948,0,C,-0.012698,1.085804,0.008001,2.55,0.77,4,1.09195


In [None]:
mulliken = pd.read_csv('/content/mulliken_charges.csv')
mulliken = mulliken.rename({'atom_index': 'atom_index_0',
                          'mulliken_charge': 'mulliken_charge_0'}, axis=1)
train = train.merge(mulliken, on=['molecule_name', 'atom_index_0'])
mulliken = mulliken.rename({'atom_index_0': 'atom_index_1',
                          'mulliken_charge_0': 'mulliken_charge_1'}, axis=1)
train = train.merge(mulliken, on=['molecule_name', 'atom_index_1'])

In [None]:
train.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso', 'atom_index_x',
       'atom_0', 'x_0', 'y_0', 'z_0', 'e_neg_x', 'rad_x', 'n_bonds_x',
       'bond_lengths_mean_x', 'atom_index_y', 'atom_1', 'x_1', 'y_1', 'z_1',
       'e_neg_y', 'rad_y', 'n_bonds_y', 'bond_lengths_mean_y',
       'mulliken_charge_0', 'mulliken_charge_1'],
      dtype='object')

In [None]:
external_data_0 = pd.DataFrame(data={
    'atom_0': ['C', 'H', 'N', 'O', 'F'],
    'atom_0_en': [2.55, 2.20, 3.04, 3.44, 3.98],
    'atomic_mass_0': [12.0107, 1.00784, 14.0067, 15.999, 18.9984],
    'valence_electrons_0': [4, 1, 5, 6, 7]
})

external_data_1 = pd.DataFrame(data={
    'atom_1': ['C', 'H', 'N', 'O', 'F'],
    'atom_1_en': [2.55, 2.20, 3.04, 3.44, 3.98],
    'atomic_mass_1': [12.0107, 1.00784, 14.0067, 15.999, 18.9984],
    'valence_electrons_1': [4, 1, 5, 6, 7]
})

def distance_features(df):
    df['dist'] = np.linalg.norm(df[['x_0', 'y_0', 'z_0']].values - df[['x_1', 'y_1', 'z_1']].values, axis=1)
    df['j_bond'] = df['type'].str[2:]
    df['j_type'] = df['type'].str[:2]
    df['mu_0'] = np.sqrt(df['x_0'].values**2 + df['y_0'].values**2 + df['z_0'].values**2)
    df['mu_1'] = np.sqrt(df['x_1'].values**2 + df['y_1'].values**2 + df['z_1'].values**2)
    df = df.merge(external_data_0, on='atom_0', how='left')
    df = df.merge(external_data_1, on='atom_1', how='left')
    df['delta_en'] = (df['atom_0_en'] - df['atom_1_en']).abs()
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    df['molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df['molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df['molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df['molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df['molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df['molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df['molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df['molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df['molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df['molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df['molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df['molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df['molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df['molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df['molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df['molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df['molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df['molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df['molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df['molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df['molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df['molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df['molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df['molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df['molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df['molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df['molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df['molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df['molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df['molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df['molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df['molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df['molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df['molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df['molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df['molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df['molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df['molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df = reduce_mem_usage(df)
    return df

In [None]:
train = distance_features(train)
train.head(10)

Mem. usage decreased to 95.84 Mb (70.8% reduction)


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,...,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,0.0,1.0,3e-06,-1.091797
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.0,3e-06,-1.091797,3e-06,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,-11.03125,0.353027,2.857422,-3.433594,...,1.0,,,,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203
5,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203
6,5,dsgdb9nsd_000001,2,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,7e-06,-1.783203,4e-06,1.783203,1.783203,-3.8e-05,1.0,1.4e-05,-1.783203
7,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-3.7e-05,1.0,1.4e-05,-1.783203
8,6,dsgdb9nsd_000001,2,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-2.9e-05,1.0,1.4e-05,-1.783203
9,8,dsgdb9nsd_000001,3,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.0,5e-06,-1.783203,3e-06,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203


In [None]:
train.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso', 'atom_index_x',
       'atom_0', 'x_0', 'y_0', 'z_0', 'e_neg_x', 'rad_x', 'n_bonds_x',
       'bond_lengths_mean_x', 'atom_index_y', 'atom_1', 'x_1', 'y_1', 'z_1',
       'e_neg_y', 'rad_y', 'n_bonds_y', 'bond_lengths_mean_y',
       'mulliken_charge_0', 'mulliken_charge_1', 'dist', 'j_bond', 'j_type',
       'mu_0', 'mu_1', 'atom_0_en', 'atomic_mass_0', 'valence_electrons_0',
       'atom_1_en', 'atomic_mass_1', 'valence_electrons_1', 'delta_en',
       'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min',
       'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count',
       'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean',
       'molecule_atom_index_0_y_1_mean_diff',
       'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max',
       'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_s

In [None]:
# PCA + SVD
# get the principal eigenvector of molecule axis
def PCA_SVD(a):
  a_mean = a.mean(axis=1)
  #normalise
  a = (a.T - a_mean.T).T
  u, s, v = svd(a.T)
  return(v[0])

# get the plane with minimum sum distance from the nuclei
def get_plane(a):
  a = np.reshape(a, (a.shape[0], -1))
  centroid = a.mean(axis=1)
  #normalise
  x = a - centroid[:,np.newaxis]
  m = np.dot(x, x.T)
  return(centroid, svd(m)[0][:,-1])

In [None]:
size_list = []
flatness = []
bond_angle_plane = []
bond_angle_axis = []
diheral_angle = []

for i in tqdm(range(len(molecule_names))):
  molecule = molecule_names[i]

  # get structure and bond information for each molecule
  struct = structures.loc[structures.molecule_name==molecule, :]
  bonds = train.loc[train.molecule_name==molecule, :]

  # number of constituent atoms
  size = len(struct)
  size_list.extend(np.full(len(bonds), size))

  # nuclei coordinates
  coords = np.column_stack([struct.x.values, struct.y.values, struct.z.values]).T

  # principal axis of molecular alignment
  axis_vector = PCA_SVD(coords)

  # get flatness metric and plane angles for binds if nuclei > 3
  if size > 3:
    coords = coords - coords.mean()
    # get best fitting 2D plane
    ctr, norm = get_plane(coords)

    # calculate distances of each atom from plane
    dists = np.zeros(size)
    for j in range(size):
      dists[j] = np.dot(norm, coords[:, j] - ctr)

    dist = abs(dists).sum()/len(dists)
    flatness.extend(np.full(len(bonds), dist))

    # get angle subtented by each atomic bind and plane
    for j in range(len(bonds)):
        # get atom index numbers for this bond
        atom1 = bonds.atom_index_0.values[j]
        atom2 = bonds.atom_index_1.values[j]

        # get 3D coords
        atom1_coords = coords[:, atom1]
        atom2_coords = coords[:, atom2]

        # bond vector
        atom_vec = np.array([atom1_coords[0] - atom2_coords[0],
                              atom1_coords[1] - atom2_coords[1],
                              atom1_coords[2] - atom2_coords[2]])

        # get bond angles
        angle = np.dot(norm, atom_vec)/(np.linalg.norm(norm)*np.linalg.norm(atom_vec))
        axis_angle = np.dot(axis_vector, atom_vec)/(np.linalg.norm(norm)*np.linalg.norm(atom_vec))

        angle = math.degrees(np.arccos(angle))
        axis_angle = math.degrees(np.arccos(axis_angle))

        if angle > 90:
            angle = 180 - angle

        if axis_angle > 90:
            axis_angle = 180 - axis_angle

        angle = 90 - angle
        axis_angle = 90 - axis_angle
        bond_angle_plane.append(angle)
        bond_angle_axis.append(axis_angle)

  else:
    flatness.extend(np.full(len(bonds), np.nan))
    bond_angle_plane.extend(np.full(len(bonds), np.nan))

    for j in range(len(bonds)):
      # get atom index numbers for this bond
      atom1 = bonds.atom_index_0.values[j]
      atom2 = bonds.atom_index_1.values[j]

      # get 3D coords
      atom1_coords = coords[:, atom1]
      atom2_coords = coords[:, atom2]

      # bond vector
      atom_vec = np.array([atom1_coords[0] - atom2_coords[0],
                            atom1_coords[1] - atom2_coords[1],
                            atom1_coords[2] - atom2_coords[2]])

      # get bond angles
      axis_angle = np.dot(axis_vector, atom_vec)/(np.linalg.norm(axis_vector)*np.linalg.norm(atom_vec))

      # standardise to degrees <= 90
      axis_angle = math.degrees(np.arccos(axis_angle))

      if axis_angle > 90:
          axis_angle = 180 - axis_angle

      axis_angle = 90 - axis_angle
      bond_angle_axis.append(axis_angle)


  0%|          | 0/10398 [00:00<?, ?it/s]

In [None]:
train['num_atoms'] = np.asarray(size_list)
train['flatness'] = np.asarray(flatness)
train['bond_angle_plane'] = np.asarray(bond_angle_plane)
train['bond_angle_axis'] = np.asarray(bond_angle_axis)

In [None]:
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso,...,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,num_atoms,flatness,bond_angle_plane,bond_angle_axis
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.091797,1.091797,-7e-06,1.0,3e-06,-1.091797,5,0.443764,52.084398,34.460611
1,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.091797,1.091797,-5e-06,1.0,3e-06,-1.091797,5,0.443764,50.858665,36.069177
2,7,dsgdb9nsd_000001,3,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.091797,1.091797,0.0,1.0,3e-06,-1.091797,5,0.443764,13.909861,35.642512
3,9,dsgdb9nsd_000001,4,0,1JHC,84.8125,83.0,0.254639,1.258789,0.271973,...,1.091797,1.091797,-1e-06,1.0,3e-06,-1.091797,5,0.443764,13.124885,34.877747
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257812,-11.03125,0.353027,2.857422,-3.433594,...,1.783203,1.783203,0.0,1.0,1.4e-05,-1.783203,5,0.443764,73.351062,0.804265
5,2,dsgdb9nsd_000001,1,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,5,0.443764,39.073948,44.696355
6,5,dsgdb9nsd_000001,2,3,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-3.8e-05,1.0,1.4e-05,-1.783203,5,0.443764,19.131573,45.838738
7,3,dsgdb9nsd_000001,1,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-3.7e-05,1.0,1.4e-05,-1.783203,5,0.443764,20.124219,44.160617
8,6,dsgdb9nsd_000001,2,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-2.9e-05,1.0,1.4e-05,-1.783203,5,0.443764,37.879628,45.292858
9,8,dsgdb9nsd_000001,3,4,2JHH,-11.257812,-11.03125,0.353027,2.859375,-3.433594,...,1.783203,1.783203,-2.8e-05,1.0,1.4e-05,-1.783203,5,0.443764,16.634444,0.38238


In [None]:
train.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso', 'atom_index_x',
       'atom_0', 'x_0', 'y_0', 'z_0', 'e_neg_x', 'rad_x', 'n_bonds_x',
       'bond_lengths_mean_x', 'atom_index_y', 'atom_1', 'x_1', 'y_1', 'z_1',
       'e_neg_y', 'rad_y', 'n_bonds_y', 'bond_lengths_mean_y',
       'mulliken_charge_0', 'mulliken_charge_1', 'dist', 'j_bond', 'j_type',
       'mu_0', 'mu_1', 'atom_0_en', 'atomic_mass_0', 'valence_electrons_0',
       'atom_1_en', 'atomic_mass_1', 'valence_electrons_1', 'delta_en',
       'molecule_couples', 'molecule_dist_mean', 'molecule_dist_min',
       'molecule_dist_max', 'atom_0_couples_count', 'atom_1_couples_count',
       'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean',
       'molecule_atom_index_0_y_1_mean_diff',
       'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max',
       'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_s

In [None]:
j_types = train['type'].unique()

In [None]:
def select_features_spearman(j_type):
    df = train[train['type'] == j_type]

    non_feature_columns = ['scalar_coupling_constant', 'fc', 'type', 'id', 'molecule_name', 'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1']
    atom_index_columns = ['atom_index_0', 'atom_index_1', 'atom_index_x', 'atom_index_y']

    X = df.select_dtypes(include=[np.number]).drop(non_feature_columns + atom_index_columns, axis=1, errors='ignore')
    y = df['scalar_coupling_constant']

    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)
    X = pd.DataFrame(X_imputed, columns=X.columns)

    X['target'] = y
    correlation_matrix = X.corr(method='spearman')
    target_correlation = correlation_matrix['target'].drop('target').sort_values(ascending=False)

    X.drop(columns='target', inplace=True)

    features = [(feature, corr) for feature, corr in target_correlation.items()]

    return features

In [44]:
selected_features = {}
for j_type in tqdm(j_types):
  selected_features[j_type] = select_features_spearman(j_type)

selected_features

  0%|          | 0/8 [00:00<?, ?it/s]

{'1JHC': [('mulliken_charge_1', 0.04429623931467475),
  ('molecule_atom_index_0_y_1_mean_div', 0.02736426158821692),
  ('sd', 0.02613395520935285),
  ('molecule_atom_index_0_dist_mean_div', 0.025345470134838486),
  ('molecule_atom_index_0_dist_mean_diff', 0.024139211617807767),
  ('dso', 0.023924489133243357),
  ('molecule_dist_mean', 0.02325444449508447),
  ('molecule_atom_index_0_dist_mean', 0.02279051855581614),
  ('molecule_atom_index_1_dist_min_diff', 0.02204928877173002),
  ('mu_0', 0.0216950123699737),
  ('bond_lengths_mean_y', 0.020292307147353993),
  ('atom_0_couples_count', 0.01957787286740278),
  ('molecule_atom_1_dist_std_diff', 0.01939537085614888),
  ('molecule_atom_index_1_dist_min_div', 0.01826004879926508),
  ('mu_1', 0.015154741036441705),
  ('molecule_atom_1_dist_std', 0.014606471884146287),
  ('molecule_atom_index_0_y_1_max', 0.01301631931136256),
  ('molecule_atom_index_0_y_1_mean', 0.010742164069401422),
  ('bond_angle_axis', 0.009411644492052312),
  ('molecule_at

In [45]:
threshold = 0.05
correlation_values = [abs(corr) for _, corr in selected_features['1JHC']]
median_corr = np.median(correlation_values)
std_corr = np.std(correlation_values)
if median_corr - std_corr > 0:
    threshold = median_corr - std_corr

threshold

0.05

In [53]:
drop_features = {}

for j_type, features in selected_features.items():
  correlation_values = [abs(corr) for _, corr in features if not pd.isnull(corr)]
  if correlation_values:
      median_corr = np.median(correlation_values)
      std_corr = np.std(correlation_values)
      threshold = max(0.01, median_corr - (0.5 * std_corr))
  else:
      continue

  drop_features[j_type] = set()
  for feature, correlation in features:
    if abs(correlation) < threshold or pd.isnull(correlation):
        drop_features[j_type].add(feature)

drop_features

{'1JHC': {'atom_0_en',
  'atom_1_en',
  'atomic_mass_0',
  'atomic_mass_1',
  'bond_angle_axis',
  'delta_en',
  'e_neg_x',
  'e_neg_y',
  'molecule_atom_1_dist_min_diff',
  'molecule_atom_1_dist_min_div',
  'molecule_atom_index_0_dist_max',
  'molecule_atom_index_0_dist_max_diff',
  'molecule_atom_index_0_dist_max_div',
  'molecule_atom_index_0_dist_min_diff',
  'molecule_atom_index_0_dist_min_div',
  'molecule_atom_index_0_dist_std',
  'molecule_atom_index_0_dist_std_diff',
  'molecule_atom_index_0_dist_std_div',
  'molecule_atom_index_0_x_1_std',
  'molecule_atom_index_0_y_1_max_diff',
  'molecule_atom_index_0_y_1_mean_diff',
  'molecule_atom_index_0_y_1_std',
  'molecule_atom_index_0_z_1_std',
  'molecule_atom_index_1_dist_mean',
  'molecule_atom_index_1_dist_mean_diff',
  'molecule_atom_index_1_dist_mean_div',
  'molecule_atom_index_1_dist_std',
  'molecule_atom_index_1_dist_std_diff',
  'molecule_atom_index_1_dist_std_div',
  'mulliken_charge_0',
  'n_bonds_x',
  'rad_x',
  'rad_

In [57]:
for j_type in j_types:
  print(f'number of remaining features for {j_type}: {len(selected_features[j_type]) - len(drop_features[j_type])}')

number of remaining features for 1JHC: 36
number of remaining features for 2JHH: 29
number of remaining features for 1JHN: 42
number of remaining features for 2JHN: 43
number of remaining features for 2JHC: 37
number of remaining features for 3JHH: 18
number of remaining features for 3JHC: 14
number of remaining features for 3JHN: 51
