In [2]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline  
from pandas import DataFrame


In [3]:
from lightgbm import LGBMRegressor

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [4]:
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
import math
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [5]:
scalar_coupling_contributions = pd.read_csv("champs-scalar-coupling/scalar_coupling_contributions.csv")
sample_submission = pd.read_csv("champs-scalar-coupling/sample_submission.csv")
potential_energy = pd.read_csv("champs-scalar-coupling/potential_energy.csv")
mulliken_charges = pd.read_csv("champs-scalar-coupling/mulliken_charges.csv")
magnetic_shielding_tensors = pd.read_csv("champs-scalar-coupling/magnetic_shielding_tensors.csv")
mulliken_charges_test_set = pd.read_csv("champs-scalar-coupling/mulliken_charges_test_set.csv")


In [6]:
train_dtypes = {
    'molecule_name': 'category',
    'atom_index_0': 'int8',
    'atom_index_1': 'int8',
    'type': 'category',
    'scalar_coupling_constant': 'float32'
}
train = pd.read_csv("champs-scalar-coupling/train.csv", index_col='id', dtype=train_dtypes)
test = pd.read_csv("champs-scalar-coupling/test.csv", index_col='id', dtype=train_dtypes)


  mask |= (ar1 == a)


In [7]:
structures = pd.read_csv("champs-scalar-coupling/structures.csv")

In [8]:
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [9]:
train = pd.merge(train, structures, how = 'left', left_on = ['molecule_name', 'atom_index_0'], right_on = ['molecule_name', 'atom_index'])
test = pd.merge(test, structures, how = 'left', left_on = ['molecule_name', 'atom_index_0'], right_on = ['molecule_name', 'atom_index'])
train = pd.merge(train, structures, how = 'left', left_on = ['molecule_name', 'atom_index_1'], right_on = ['molecule_name', 'atom_index'])
test = pd.merge(test, structures, how = 'left', left_on = ['molecule_name', 'atom_index_1'], right_on = ['molecule_name', 'atom_index'])

In [10]:
structure_details = pd.pivot_table(structures, index='molecule_name',  columns=['atom_index'], values=["x", "y", "z"], aggfunc= np.sum, fill_value=np.nan)
structure_df = DataFrame(structure_details.to_records())


In [11]:
atomic_numbers = DataFrame({'atom': ['H', 'C', 'N', 'O', 'F'], 'atom_number': [1,6,7,8,9]})
structures = pd.merge(structures, atomic_numbers, on ='atom', how = 'left')

In [12]:
structure_index_order = pd.pivot_table(structures, index='molecule_name',  columns=['atom_index'], values=["atom_number"], aggfunc= np.sum, fill_value=np.nan)
structure_index_order = DataFrame(structure_index_order.to_records())


In [13]:
structure_coordinates = pd.merge(structure_df, structure_index_order, on = 'molecule_name', how = 'left')
structure_coordinates.head()

Unnamed: 0,molecule_name,"('x', 0)","('x', 1)","('x', 2)","('x', 3)","('x', 4)","('x', 5)","('x', 6)","('x', 7)","('x', 8)",...,"('atom_number', 19)","('atom_number', 20)","('atom_number', 21)","('atom_number', 22)","('atom_number', 23)","('atom_number', 24)","('atom_number', 25)","('atom_number', 26)","('atom_number', 27)","('atom_number', 28)"
0,dsgdb9nsd_000001,-0.012698,0.00215,1.011731,-0.540815,-0.523814,,,,,...,,,,,,,,,,
1,dsgdb9nsd_000002,-0.040426,0.017257,0.915789,-0.520278,,,,,,...,,,,,,,,,,
2,dsgdb9nsd_000003,-0.03436,0.064766,0.87179,,,,,,,...,,,,,,,,,,
3,dsgdb9nsd_000004,0.599539,-0.599539,-1.661639,1.661639,,,,,,...,,,,,,,,,,
4,dsgdb9nsd_000005,-0.013324,0.002311,-0.027803,,,,,,,...,,,,,,,,,,


In [14]:
train.columns

Index(['molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'atom_index_x', 'atom_x', 'x_x', 'y_x',
       'z_x', 'atom_index_y', 'atom_y', 'x_y', 'y_y', 'z_y'],
      dtype='object')

In [15]:
train['x_c'] = (train['x_x'] + train['x_y'])/2
train['y_c'] = (train['y_x'] + train['y_y'])/2
train['z_c'] = (train['z_x'] + train['z_y'])/2

In [16]:
test['x_c'] = (test['x_x'] + test['x_y'])/2
test['y_c'] = (test['y_x'] + test['y_y'])/2
test['z_c'] = (test['z_x'] + test['z_y'])/2

In [17]:
test.columns

Index(['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'atom_index_x',
       'atom_x', 'x_x', 'y_x', 'z_x', 'atom_index_y', 'atom_y', 'x_y', 'y_y',
       'z_y', 'x_c', 'y_c', 'z_c'],
      dtype='object')

In [18]:
atomic_radius = DataFrame({'atom': ['H', 'C', 'N', 'O', 'F'], 'radius': [0.38, 0.77, 0.75, 0.73, 0.71]})
elect_neg = DataFrame({'atom': ['H', 'C', 'N', 'O', 'F'], 'en': [2.2, 2.2, 0.75, 3.04, 3.98]})
mag_ratio = DataFrame({'atom': ['H', 'C', 'N', 'O', 'F'], 'mag_ratio': [42.577, 10.708, 3.077, -5.772, 40.052]})
structures = pd.merge(structures, atomic_radius, on ='atom', how = 'left')
structures = pd.merge(structures, elect_neg, on ='atom', how = 'left')
structures = pd.merge(structures, mag_ratio, on ='atom', how = 'left')

In [19]:
train.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_index_x,atom_x,x_x,y_x,z_x,atom_index_y,atom_y,x_y,y_y,z_y,x_c,y_c,z_c
0,dsgdb9nsd_000001,1,0,1JHC,84.807602,1,H,0.00215,-0.006031,0.001976,0,C,-0.012698,1.085804,0.008001,-0.005274,0.539886,0.004989
1,dsgdb9nsd_000001,1,2,2JHH,-11.257,1,H,0.00215,-0.006031,0.001976,2,H,1.011731,1.463751,0.000277,0.506941,0.72886,0.001126
2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,1,H,0.00215,-0.006031,0.001976,3,H,-0.540815,1.447527,-0.876644,-0.269332,0.720748,-0.437334
3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,1,H,0.00215,-0.006031,0.001976,4,H,-0.523814,1.437933,0.906397,-0.260832,0.715951,0.454187
4,dsgdb9nsd_000001,2,0,1JHC,84.807404,2,H,1.011731,1.463751,0.000277,0,C,-0.012698,1.085804,0.008001,0.499516,1.274778,0.004139


In [20]:
train = pd.merge(train, structure_coordinates, on = "molecule_name", how = "left")

In [21]:
test = pd.merge(test, structure_coordinates, on = "molecule_name", how = "left")

In [22]:
train = train.drop(['atom_index_x', 'atom_index_y', 'atom_x', 'atom_y'], axis = 1)

In [23]:
test = test.drop(['atom_index_x', 'atom_index_y', 'atom_x', 'atom_y'], axis = 1)

In [24]:
def calc_distance_to_center(train):
    columns_x = ["('x', 0)", "('x', 1)", "('x', 2)", "('x', 3)", "('x', 4)", "('x', 5)", "('x', 6)", "('x', 7)", "('x', 8)", "('x', 9)", "('x', 10)", "('x', 11)", "('x', 12)", "('x', 13)", "('x', 14)", "('x', 15)", "('x', 16)", "('x', 17)", "('x', 18)", "('x', 19)", "('x', 20)", "('x', 21)", "('x', 22)", "('x', 23)", "('x', 24)", "('x', 25)", "('x', 26)", "('x', 27)", "('x', 28)"]
    columns_y = ["('y', 0)", "('y', 1)", "('y', 2)", "('y', 3)", "('y', 4)", "('y', 5)", "('y', 6)", "('y', 7)", "('y', 8)", "('y', 9)", "('y', 10)", "('y', 11)", "('y', 12)", "('y', 13)", "('y', 14)", "('y', 15)", "('y', 16)", "('y', 17)", "('y', 18)", "('y', 19)", "('y', 20)", "('y', 21)", "('y', 22)", "('y', 23)", "('y', 24)", "('y', 25)", "('y', 26)", "('y', 27)", "('y', 28)"]
    columns_z = ["('z', 0)", "('z', 1)", "('z', 2)", "('z', 3)", "('z', 4)", "('z', 5)", "('z', 6)", "('z', 7)", "('z', 8)", "('z', 9)", "('z', 10)", "('z', 11)", "('z', 12)", "('z', 13)", "('z', 14)", "('z', 15)", "('z', 16)", "('z', 17)", "('z', 18)", "('z', 19)", "('z', 20)", "('z', 21)", "('z', 22)", "('z', 23)", "('z', 24)", "('z', 25)", "('z', 26)", "('z', 27)", "('z', 28)"]

    
    dist_center_list = []

    for i in range(0, len(columns_x)):
        dist_center_list.append('dist_center_'+str(i)) 
        train['dist_center_'+str(i)] = 0

    for i in range(0, len(columns_x)):
        train['dist_center_'+str(i)] = ((train[columns_x[i]] - train['x_c'])**2 + (train[columns_y[i]] - train['y_c'])**2 + (train[columns_z[i]] - train['z_c'])**2)**0.5
    return train

In [25]:
train = calc_distance_to_center(train)

In [26]:
test = calc_distance_to_center(test)

In [27]:

train['dist'] = ((train['x_x'] - train['x_y'])**2 + (train['y_x'] - train['y_y'])**2 + (train['z_x'] - train['z_y'])**2)**0.5

In [28]:
train.head()


Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,x_x,y_x,z_x,x_y,y_y,...,dist_center_20,dist_center_21,dist_center_22,dist_center_23,dist_center_24,dist_center_25,dist_center_26,dist_center_27,dist_center_28,dist
0,dsgdb9nsd_000001,1,0,1JHC,84.807602,0.00215,-0.006031,0.001976,-0.012698,1.085804,...,,,,,,,,,,1.091953
1,dsgdb9nsd_000001,1,2,2JHH,-11.257,0.00215,-0.006031,0.001976,1.011731,1.463751,...,,,,,,,,,,1.78312
2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,0.00215,-0.006031,0.001976,-0.540815,1.447527,...,,,,,,,,,,1.783147
3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,0.00215,-0.006031,0.001976,-0.523814,1.437933,...,,,,,,,,,,1.783157
4,dsgdb9nsd_000001,2,0,1JHC,84.807404,1.011731,1.463751,0.000277,-0.012698,1.085804,...,,,,,,,,,,1.091952


In [29]:
test['dist'] = ((test['x_x'] - test['x_y'])**2 + (test['y_x'] - test['y_y'])**2 + (test['z_x'] - test['z_y'])**2)**0.5

In [30]:
train = train.replace(np.nan, 0)
test = test.replace(np.nan, 0)

In [31]:
#Count the number of H and C in the molecule
train_strucutre = pd.merge(structures, DataFrame({'molecule_name':train['molecule_name'].unique()}), on = ['molecule_name'], how = 'left')
test_strucutre = pd.merge(structures, DataFrame({'molecule_name':test['molecule_name'].unique()}), on = ['molecule_name'], how = 'left')




In [32]:
train_strucutre['count'] = 1
test_strucutre['count'] = 1
train_strucutre_pt = pd.pivot_table(train_strucutre, index = 'molecule_name', columns = 'atom', values = 'count', aggfunc = np.sum, fill_value = 0)
test_strucutre_pt = pd.pivot_table(test_strucutre, index = 'molecule_name', columns = 'atom', values = 'count', aggfunc = np.sum, fill_value = 0)

train_strucutre_df = DataFrame(train_strucutre_pt.to_records())
test_strucutre_df= DataFrame(test_strucutre_pt.to_records())

train['count'] = 1
test['count'] = 1
train_bond_pt = pd.pivot_table(train, index = 'molecule_name', columns = 'type', values = 'count', aggfunc = np.sum, fill_value = 0)
test_bond_pt = pd.pivot_table(test, index = 'molecule_name', columns = 'type', values = 'count', aggfunc = np.sum, fill_value = 0)

train_bond_df = DataFrame(train_bond_pt.to_records())
test_bond_df= DataFrame(test_bond_pt.to_records())



In [33]:
dipole_moments = pd.read_csv("champs-scalar-coupling/dipole_moments.csv")



In [34]:
def dataMerge(train, structures, scalar_coupling_contributions, potential_energy,magnetic_shielding_tensors,dipole_moments, train_strucutre_df, train_bond_df, mulliken_charges):
    print(len(train))
    #train = pd.merge(train, structures, left_on = ['molecule_name', 'atom_index_0'], right_on = ['molecule_name', 'atom_index'])
    train = pd.merge(train, train_strucutre_df, on = 'molecule_name', how = "left")
    train = pd.merge(train, train_bond_df, on = 'molecule_name', how = "left")
    
    print(len(train))
    #train = pd.merge(train, structures, left_on = ['molecule_name', 'atom_index_1'], right_on = ['molecule_name', 'atom_index'])
    print(len(train))
    #train = pd.merge(train, scalar_coupling_contributions, on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])
    #print(len(train))
    #train = pd.merge(train, potential_energy, on = 'molecule_name')
    #print(len(train))
    train = pd.merge(train, mulliken_charges,  left_on = ['molecule_name', 'atom_index_0'], right_on = ['molecule_name', 'atom_index'], how = "left")
    print(len(train))
    train = pd.merge(train, mulliken_charges,  left_on = ['molecule_name', 'atom_index_1'], right_on = ['molecule_name', 'atom_index'],how = "left")
    print(len(train))
    #train = pd.merge(train, magnetic_shielding_tensors,  left_on = ['molecule_name', 'atom_index_0'], right_on = ['molecule_name', 'atom_index'])
    print(len(train))
    #train = pd.merge(train, magnetic_shielding_tensors,  left_on = ['molecule_name', 'atom_index_1'], right_on = ['molecule_name', 'atom_index'])
    #train = pd.merge(train, dipole_moments, on = 'molecule_name')
    print(len(train))
    return train

In [35]:
train = dataMerge(train, structures, scalar_coupling_contributions, potential_energy,magnetic_shielding_tensors,dipole_moments, train_strucutre_df, train_bond_df, mulliken_charges)
test = dataMerge(test, structures, scalar_coupling_contributions, potential_energy,magnetic_shielding_tensors,dipole_moments, test_strucutre_df, test_bond_df, mulliken_charges_test_set)

4658147
4658147
4658147
4658147
4658147
4658147
4658147
2505542
2505542
2505542
2505542
2505542
2505542
2505542


In [36]:
train['coupling_type'] = train['type'].str[1:]
test['coupling_type'] = test['type'].str[1:]
type_dummy = pd.get_dummies(train['coupling_type'])
train = pd.concat([train, type_dummy], axis = 1)
type_dummy = pd.get_dummies(test['coupling_type'])
test = pd.concat([test, type_dummy], axis = 1)




In [31]:
distance_mean = []
distance_min = []
distance_max = []
bond_types = train['type'].unique()
for bond in bond_types:
    distance_mean.append(train[train['type'] == bond]['dist'].mean())
    distance_min.append(train[train['type'] == bond]['dist'].min())
    distance_max.append(train[train['type'] == bond]['dist'].max())
distance_mean_df = DataFrame({'type': bond_types, 'distance_mean': distance_mean, 'distance_min': distance_min, 'distance_max': distance_max })
train = pd.merge(train, distance_mean_df, on = 'type', how = 'left')
test = pd.merge(test, distance_mean_df, on = 'type', how = 'left')

In [None]:
train['n_bonds'] = train['1JHC'] + train['1JHN'] 
test['n_bonds'] = test['1JHC'] + test['1JHN'] 

In [33]:
train['n_bonds_2'] = train['2JHC'] + train['2JHN'] + train['2JHH'] 
test['n_bonds_2'] = test['2JHC'] + test['2JHN'] + test['2JHH'] 
train['n_bonds_3'] = train['3JHC'] + train['3JHN'] + train['3JHH'] 
test['n_bonds_3'] = test['3JHC'] + test['3JHN'] + test['3JHH'] 

In [34]:
train['total_atom'] = train['C'] + train['F'] + train['H'] + train['N']+ train['O']
test['total_atom'] = test['C'] + test['F'] + test['H'] + test['N']+ test['O']

In [37]:
list(train.columns)

['molecule_name',
 'atom_index_0',
 'atom_index_1',
 'type',
 'scalar_coupling_constant',
 'x_x',
 'y_x',
 'z_x',
 'x_y',
 'y_y',
 'z_y',
 'x_c',
 'y_c',
 'z_c',
 "('x', 0)",
 "('x', 1)",
 "('x', 2)",
 "('x', 3)",
 "('x', 4)",
 "('x', 5)",
 "('x', 6)",
 "('x', 7)",
 "('x', 8)",
 "('x', 9)",
 "('x', 10)",
 "('x', 11)",
 "('x', 12)",
 "('x', 13)",
 "('x', 14)",
 "('x', 15)",
 "('x', 16)",
 "('x', 17)",
 "('x', 18)",
 "('x', 19)",
 "('x', 20)",
 "('x', 21)",
 "('x', 22)",
 "('x', 23)",
 "('x', 24)",
 "('x', 25)",
 "('x', 26)",
 "('x', 27)",
 "('x', 28)",
 "('y', 0)",
 "('y', 1)",
 "('y', 2)",
 "('y', 3)",
 "('y', 4)",
 "('y', 5)",
 "('y', 6)",
 "('y', 7)",
 "('y', 8)",
 "('y', 9)",
 "('y', 10)",
 "('y', 11)",
 "('y', 12)",
 "('y', 13)",
 "('y', 14)",
 "('y', 15)",
 "('y', 16)",
 "('y', 17)",
 "('y', 18)",
 "('y', 19)",
 "('y', 20)",
 "('y', 21)",
 "('y', 22)",
 "('y', 23)",
 "('y', 24)",
 "('y', 25)",
 "('y', 26)",
 "('y', 27)",
 "('y', 28)",
 "('z', 0)",
 "('z', 1)",
 "('z', 2)",
 "('z',

In [69]:
train[[ 'dist_atom0_0',
 'dist_atom1_0',
 'dist_atom0_1',
 'dist_atom1_1',
 'dist_atom0_2',
 'dist_atom1_2',
 'dist_atom0_3',
 'dist_atom1_3',
 'dist_atom0_4',
 'dist_atom1_4',
 'dist_atom0_5',
 'dist_atom1_5',
 'dist_atom0_6',
 'dist_atom1_6',
 'dist_atom0_7',
 'dist_atom1_7',
 'dist_atom0_8',
 'dist_atom1_8',
 'dist_atom0_9',
 'dist_atom1_9',
 'dist_atom0_10',
 'dist_atom1_10',
 'dist_atom0_11',
 'dist_atom1_11',
 'dist_atom0_12',
 'dist_atom1_12',
 'dist_atom0_13',
 'dist_atom1_13',
 'dist_atom0_14',
 'dist_atom1_14',
 'dist_atom0_15',
 'dist_atom1_15',
 'dist_atom0_16',
 'dist_atom1_16',
 'dist_atom0_17',
 'dist_atom1_17',
 'dist_atom0_18',
 'dist_atom1_18',
 'dist_atom0_19',
 'dist_atom1_19',
 'dist_atom0_20',
 'dist_atom1_20',
 'dist_atom0_21',
 'dist_atom1_21',
 'dist_atom0_22',
 'dist_atom1_22',
 'dist_atom0_23',
 'dist_atom1_23',
 'dist_atom0_24',
 'dist_atom1_24',
 'dist_atom0_25',
 'dist_atom1_25',
 'dist_atom0_26',
 'dist_atom1_26',
 'dist_atom0_27',
 'dist_atom1_27',
 'dist_atom0_28',
 'dist_atom1_28']]

Unnamed: 0,dist_atom0_0,dist_atom1_0,dist_atom0_1,dist_atom1_1,dist_atom0_2,dist_atom1_2,dist_atom0_3,dist_atom1_3,dist_atom0_4,dist_atom1_4,...,dist_atom0_24,dist_atom1_24,dist_atom0_25,dist_atom1_25,dist_atom0_26,dist_atom1_26,dist_atom0_27,dist_atom1_27,dist_atom0_28,dist_atom1_28
0,1.091953,0.000000,0.000000,1.091953,1.783120,1.091952,1.783147,1.091946,1.783157,1.091948,...,0.006701,1.085908,0.006701,1.085908,0.006701,1.085908,0.006701,1.085908,0.006701,1.085908
1,1.091952,0.000000,1.783120,1.091953,0.000000,1.091952,1.783158,1.091946,1.783148,1.091948,...,1.779373,1.085908,1.779373,1.085908,1.779373,1.085908,1.779373,1.085908,1.779373,1.085908
2,1.091946,0.000000,1.783147,1.091953,1.783158,1.091952,0.000000,1.091946,1.783148,1.091948,...,1.776603,1.085908,1.776603,1.085908,1.776603,1.085908,1.776603,1.085908,1.776603,1.085908
3,1.091948,0.000000,1.783157,1.091953,1.783148,1.091952,1.783148,1.091946,0.000000,1.091948,...,1.778648,1.085908,1.778648,1.085908,1.778648,1.085908,1.778648,1.085908,1.778648,1.085908
4,1.091953,1.091952,0.000000,1.783120,1.783120,0.000000,1.783147,1.783158,1.783157,1.783148,...,0.006701,1.779373,0.006701,1.779373,0.006701,1.779373,0.006701,1.779373,0.006701,1.779373
5,1.091953,1.091946,0.000000,1.783147,1.783120,1.783158,1.783147,0.000000,1.783157,1.783148,...,0.006701,1.776603,0.006701,1.776603,0.006701,1.776603,0.006701,1.776603,0.006701,1.776603
6,1.091952,1.091946,1.783120,1.783147,0.000000,1.783158,1.783158,0.000000,1.783148,1.783148,...,1.779373,1.776603,1.779373,1.776603,1.779373,1.776603,1.779373,1.776603,1.779373,1.776603
7,1.091953,1.091948,0.000000,1.783157,1.783120,1.783148,1.783147,1.783148,1.783157,0.000000,...,0.006701,1.778648,0.006701,1.778648,0.006701,1.778648,0.006701,1.778648,0.006701,1.778648
8,1.091952,1.091948,1.783120,1.783157,0.000000,1.783148,1.783158,1.783148,1.783148,0.000000,...,1.779373,1.778648,1.779373,1.778648,1.779373,1.778648,1.779373,1.778648,1.779373,1.778648
9,1.091946,1.091948,1.783147,1.783157,1.783158,1.783148,0.000000,1.783148,1.783148,0.000000,...,1.776603,1.778648,1.776603,1.778648,1.776603,1.778648,1.776603,1.778648,1.776603,1.778648


In [74]:
train_dummy = train[['dist_atom0_0',
 'dist_atom1_0',
 'dist_atom0_1',
 'dist_atom1_1',
 'dist_atom0_2',
 'dist_atom1_2',
 'dist_atom0_3',
 'dist_atom1_3',
 'dist_atom0_4',
 'dist_atom1_4',
 'dist_atom0_5',
 'dist_atom1_5',
 'dist_atom0_6',
 'dist_atom1_6',
 'dist_atom0_7',
 'dist_atom1_7',
 'dist_atom0_8',
 'dist_atom1_8',
 'dist_atom0_9',
 'dist_atom1_9',
 'dist_atom0_10',
 'dist_atom1_10',
 'dist_atom0_11',
 'dist_atom1_11',
 'dist_atom0_12',
 'dist_atom1_12',
 'dist_atom0_13',
 'dist_atom1_13',
 'dist_atom0_14',
 'dist_atom1_14',
 'dist_atom0_15',
 'dist_atom1_15',
 'dist_atom0_16',
 'dist_atom1_16',
 'dist_atom0_17',
 'dist_atom1_17',
 'dist_atom0_18',
 'dist_atom1_18',
 'dist_atom0_19',
 'dist_atom1_19',
 'dist_atom0_20',
 'dist_atom1_20',
 'dist_atom0_21',
 'dist_atom1_21',
 'dist_atom0_22',
 'dist_atom1_22',
 'dist_atom0_23',
 'dist_atom1_23',
 'dist_atom0_24',
 'dist_atom1_24',
 'dist_atom0_25',
 'dist_atom1_25',
 'dist_atom0_26',
 'dist_atom1_26',
 'dist_atom0_27',
 'dist_atom1_27',
 'dist_atom0_28',
 'dist_atom1_28']]

In [92]:
test_dummy = test[['dist_atom0_0',
 'dist_atom1_0',
 'dist_atom0_1',
 'dist_atom1_1',
 'dist_atom0_2',
 'dist_atom1_2',
 'dist_atom0_3',
 'dist_atom1_3',
 'dist_atom0_4',
 'dist_atom1_4',
 'dist_atom0_5',
 'dist_atom1_5',
 'dist_atom0_6',
 'dist_atom1_6',
 'dist_atom0_7',
 'dist_atom1_7',
 'dist_atom0_8',
 'dist_atom1_8',
 'dist_atom0_9',
 'dist_atom1_9',
 'dist_atom0_10',
 'dist_atom1_10',
 'dist_atom0_11',
 'dist_atom1_11',
 'dist_atom0_12',
 'dist_atom1_12',
 'dist_atom0_13',
 'dist_atom1_13',
 'dist_atom0_14',
 'dist_atom1_14',
 'dist_atom0_15',
 'dist_atom1_15',
 'dist_atom0_16',
 'dist_atom1_16',
 'dist_atom0_17',
 'dist_atom1_17',
 'dist_atom0_18',
 'dist_atom1_18',
 'dist_atom0_19',
 'dist_atom1_19',
 'dist_atom0_20',
 'dist_atom1_20',
 'dist_atom0_21',
 'dist_atom1_21',
 'dist_atom0_22',
 'dist_atom1_22',
 'dist_atom0_23',
 'dist_atom1_23',
 'dist_atom0_24',
 'dist_atom1_24',
 'dist_atom0_25',
 'dist_atom1_25',
 'dist_atom0_26',
 'dist_atom1_26',
 'dist_atom0_27',
 'dist_atom1_27',
 'dist_atom0_28',
 'dist_atom1_28']]

In [82]:
train_dummy[0:200].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
dist_atom0_0,1.091953,1.091952,1.091946,1.091948,1.091953,1.091953,1.091952,1.091953,1.091952,1.091946,...,2.842457,3.503196,2.842803,1.095999,1.090458,1.096008,2.842457,3.503196,2.842803,1.095999
dist_atom1_0,0.0,0.0,0.0,0.0,1.091952,1.091946,1.091946,1.091948,1.091948,1.091948,...,0.0,0.0,0.0,1.519954,1.519954,1.519954,1.519954,1.519954,1.519954,2.580272
dist_atom0_1,0.0,1.78312,1.783147,1.783157,0.0,0.0,1.78312,0.0,1.78312,1.783147,...,2.161349,2.15163,2.161303,2.161361,2.151629,2.161302,2.161349,2.15163,2.161303,2.161361
dist_atom1_1,1.091953,1.091953,1.091953,1.091953,1.78312,1.783147,1.783147,1.783157,1.783157,1.783157,...,1.519954,1.519954,1.519954,0.0,0.0,0.0,0.0,0.0,0.0,1.519952
dist_atom0_2,1.78312,0.0,1.783158,1.783148,1.78312,1.78312,0.0,1.78312,0.0,1.783158,...,1.096,1.090458,1.096006,2.842457,3.503194,2.842817,1.096,1.090458,1.096006,2.842457
dist_atom1_2,1.091952,1.091952,1.091952,1.091952,0.0,1.783158,1.783158,1.783148,1.783148,1.783148,...,2.580272,2.580272,2.580272,1.519952,1.519952,1.519952,1.519952,1.519952,1.519952,0.0
dist_atom0_3,1.783147,1.783158,0.0,1.783148,1.783147,1.783147,1.783158,1.783147,1.783158,0.0,...,3.107027,2.530764,3.10672,3.107061,2.530762,3.106696,3.107027,2.530764,3.10672,3.107061
dist_atom1_3,1.091946,1.091946,1.091946,1.091946,1.783158,0.0,0.0,1.783148,1.783148,1.783148,...,2.390344,2.390344,2.390344,1.208663,1.208663,1.208663,1.208663,1.208663,1.208663,2.390341
dist_atom0_4,1.783157,1.783148,1.783148,0.0,1.783157,1.783157,1.783148,1.783157,1.783148,1.783148,...,3.193355,3.827278,2.666122,0.0,1.787755,1.758779,3.193355,3.827278,2.666122,0.0
dist_atom1_4,1.091948,1.091948,1.091948,1.091948,1.783148,1.783148,1.783148,0.0,0.0,0.0,...,1.095999,1.095999,1.095999,2.161361,2.161361,2.161361,2.161361,2.161361,2.161361,2.842457


In [89]:
train_dummy_t = train_dummy.T
#train_sorted = train_dummy_t.sort_values(list(train_dummy_t.columns),axis = 0)
train_sorted = DataFrame(np.sort(train_dummy_t.values, axis=0), index=train_dummy_t.index, columns=train_dummy_t.columns)
train_sorted

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4658137,4658138,4658139,4658140,4658141,4658142,4658143,4658144,4658145,4658146
dist_atom0_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dist_atom1_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dist_atom0_1,0.006701,1.085908,1.085908,1.085908,0.006701,0.006701,1.091946,0.006701,1.091948,1.091946,...,1.080997,1.080997,1.080997,1.080997,1.080997,1.080997,1.080997,1.083421,1.083421,1.094292
dist_atom1_1,0.006701,1.085908,1.085908,1.085908,0.006701,0.006701,1.091952,0.006701,1.091952,1.091948,...,1.09048,1.080997,1.09048,1.083421,1.080997,1.083421,1.09048,1.094292,1.094293,1.094293
dist_atom0_2,0.006701,1.085908,1.085908,1.085908,0.006701,0.006701,1.776603,0.006701,1.778648,1.776603,...,1.509711,1.509711,2.209032,2.276213,2.293248,2.276213,2.114075,1.784424,1.784424,1.784424
dist_atom1_2,0.006701,1.085908,1.085908,1.085908,0.006701,0.006701,1.776603,0.006701,1.778648,1.776603,...,1.50993,1.50993,2.209032,2.276215,2.293248,2.276215,2.114075,2.20949,2.209489,1.784424
dist_atom0_3,0.006701,1.085908,1.085908,1.085908,0.006701,0.006701,1.776603,0.006701,1.778648,1.776603,...,1.532828,1.532828,2.209032,2.293248,2.306535,2.293248,2.114075,2.21175,2.211755,2.209489
dist_atom1_3,0.006701,1.085908,1.085908,1.085908,0.006701,0.006701,1.776603,0.006701,1.778648,1.776603,...,1.553213,1.553213,2.209032,2.305492,2.306538,2.305492,2.114075,2.276213,2.276213,2.20949
dist_atom0_4,0.006701,1.085908,1.085908,1.085908,0.006701,0.006701,1.776603,0.006701,1.778648,1.776603,...,1.553213,1.553213,2.209032,2.306535,2.312202,2.306538,2.114075,2.276215,2.276215,2.21175
dist_atom1_4,0.006701,1.085908,1.085908,1.085908,0.006701,0.006701,1.776603,0.006701,1.778648,1.776603,...,1.553213,1.553213,2.209032,2.312206,2.312206,2.312202,2.114075,2.305492,2.305492,2.211755


In [91]:
train = pd.concat([train[['dist','C',
 'F',
 'H',
 'N',
 'O',
 '1JHC',
 '1JHN',
 '2JHC',
 '2JHH',
 '2JHN',
 '3JHC',
 '3JHH',
 '3JHN',
 'mulliken_charge_x',
 'mulliken_charge_y',
 'x_min',
 'y_min',
 'z_min',
 'x_max',
 'y_max',
 'z_max',
 'max_x_distance',
 'max_y_distance',
 'max_z_distance',
 'distance_mean',
 'distance_min',
 'distance_max',
 'n_bonds',
 'n_bonds_2',
 'n_bonds_3',
 'total_atom','scalar_coupling_constant']], train_sorted.T], axis = 1)

In [30]:
train = get_max_distance(train)
test = get_max_distance(test)

In [93]:
test_dummy_t = test_dummy.T
test_sorted = DataFrame(np.sort(test_dummy_t.values, axis=0), index=test_dummy_t.index, columns=test_dummy_t.columns)

test = pd.concat([test[['dist','C',
 'F',
 'H',
 'N',
 'O',
 '1JHC',
 '1JHN',
 '2JHC',
 '2JHH',
 '2JHN',
 '3JHC',
 '3JHH',
 '3JHN',
 'mulliken_charge_x',
 'mulliken_charge_y',
 'x_min',
 'y_min',
 'z_min',
 'x_max',
 'y_max',
 'z_max',
 'max_x_distance',
 'max_y_distance',
 'max_z_distance',
 'distance_mean',
 'distance_min',
 'distance_max',
 'n_bonds',
 'n_bonds_2',
 'n_bonds_3',
 'total_atom', 'id']], test_sorted.T], axis = 1)

In [140]:
test.head()


Unnamed: 0,dist,C,F,H,N,O,1JHC,1JHN,2JHC,2JHH,...,dist_atom0_24,dist_atom1_24,dist_atom0_25,dist_atom1_25,dist_atom0_26,dist_atom1_26,dist_atom0_27,dist_atom1_27,dist_atom0_28,dist_atom1_28
0,2.261178,2,0,2,0,0,2,0,2,0,...,1.939341,1.939341,1.939341,1.939341,1.939341,1.939341,1.939341,2.261178,2.261178,3.323277
1,1.062099,2,0,2,0,0,2,0,2,0,...,1.939341,1.939341,1.939341,1.939341,1.939341,1.939341,1.939341,2.261178,2.261178,3.323277
2,1.062099,2,0,2,0,0,2,0,2,0,...,1.939341,1.939341,1.939341,1.939341,1.939341,1.939341,1.939341,2.261178,2.261178,3.323277
3,2.261178,2,0,2,0,0,2,0,2,0,...,1.939341,1.939341,1.939341,1.939341,1.939341,1.939341,1.939341,2.261178,2.261178,3.323277
4,3.323277,2,0,2,0,0,2,0,2,0,...,1.939341,1.939341,1.939341,1.939341,1.939341,1.939341,2.261178,2.261178,3.323277,3.323277


In [53]:
feature_list = ["('atom_number', 0)",
 "('atom_number', 1)",
 "('atom_number', 2)",
 "('atom_number', 3)",
 "('atom_number', 4)",
 "('atom_number', 5)",
 "('atom_number', 6)",
 "('atom_number', 7)",
 "('atom_number', 8)",
 "('atom_number', 9)",
 "('atom_number', 10)",
 "('atom_number', 11)",
 "('atom_number', 12)",
 "('atom_number', 13)",
 "('atom_number', 14)",
 "('atom_number', 15)",
 "('atom_number', 16)",
 "('atom_number', 17)",
 "('atom_number', 18)",
 "('atom_number', 19)",
 "('atom_number', 20)",
 "('atom_number', 21)",
 "('atom_number', 22)",
 "('atom_number', 23)",
 "('atom_number', 24)",
 "('atom_number', 25)",
 "('atom_number', 26)",
 "('atom_number', 27)",
 "('atom_number', 28)",
 'dist_center_0',
 'dist_center_1',
 'dist_center_2',
 'dist_center_3',
 'dist_center_4',
 'dist_center_5',
 'dist_center_6',
 'dist_center_7',
 'dist_center_8',
 'dist_center_9',
 'dist_center_10',
 'dist_center_11',
 'dist_center_12',
 'dist_center_13',
 'dist_center_14',
 'dist_center_15',
 'dist_center_16',
 'dist_center_17',
 'dist_center_18',
 'dist_center_19',
 'dist_center_20',
 'dist_center_21',
 'dist_center_22',
 'dist_center_23',
 'dist_center_24',
 'dist_center_25',
 'dist_center_26',
 'dist_center_27',
 'dist_center_28',
 'dist',
 'C',
 'F',
 'H',
 'N',
 'O',
 '1JHC',
 '1JHN',
 '2JHC',
 '2JHH',
 '2JHN',
 '3JHC',
 '3JHH',
 '3JHN',
 'mulliken_charge_x',
 'mulliken_charge_y',
 'JHC',
 'JHH',
 'JHN']
#test = test.sample(n = int(len(test)))

In [54]:
LGB_PARAMS = {
    'objective': 'regression',
    'metric': 'mae',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.2,
    'num_leaves': 128,
    'min_child_samples': 79,
    'max_depth': 15,
    'subsample_freq': 1,
    'subsample': 0.9,
    'bagging_seed': 11,
    'reg_alpha': 0.1,
    'reg_lambda': 0.3,
    'colsample_bytree': 1.0
}

In [55]:
def train_model(train, y_feature):
    n = len(train)
    print(n)
    train_start = 0
    ratio = 1
    train_end = int(np.floor(0.8*n)*ratio)
    test_start = int(np.floor(0.8*n)*ratio)
    test_end = int(n*ratio)
    data_train = train[train_start:train_end]
    data_test = train[test_start:test_end]

    train_data = data_train[feature_list]
    test_data = data_test[feature_list]
    pred_data = test[feature_list]
    #Build X and y
    X_train = train_data
    y_train = data_train[feature]

    X_test = test_data
    y_test = data_test[y_feature]

    X_pred = pred_data
    
    # Fit regression model
    model = LGBMRegressor(**LGB_PARAMS, n_estimators=1500, n_jobs = -1)

    #model = RandomForestRegressor(max_depth = 15)
    #model = xgb.XGBRegressor(max_depth = 10)
    #optimization_dict = {'max_depth': [10,12],
                         #'n_estimators': [100]}
    #model = GridSearchCV(xgb_model, optimization_dict)
    model.fit(X_train, y_train)
    print("Model training complete")
    #print(model.best_score_)
    #print(model.best_params_)
    regr_score = model.score(X_test, y_test)
    print(regr_score)
    y_predict = model.predict(X_test)
    #print(regr_1.__class__.__name__, accuracy_score(y_test, y_predict))
    # Predict
    
    feature_importances = pd.DataFrame(model.feature_importances_,
                                       index=X_train.columns,
                                       columns=['importance']).sort_values('importance', ascending=False)
    print(feature_importances)
    
    test_results = DataFrame({"real":y_test, "predict":y_predict})
    mse = sum((test_results['real'] -  test_results['predict'])**2)/len(test_results)
    print(mse)
    #predict = model.predict(X_pred)
    #results = DataFrame({"id": test_id, feature: predict})
    #results = DataFrame({feature: predict})
    #results = results.sort_values("id")
    return test_results

In [56]:

#combined_results = test[['id']]
#y_feature = ['fc', 'sd', 'pso', 'dso']
y_feature = ['scalar_coupling_constant']
for feature in y_feature:
    model_results = train_model(train, feature)
    #test_results = model_results[0]
    #results = model_results[1]
    #combined_results = pd.merge(combined_results, results, on = ['id'])
#combined_results['scalar_coupling_constant'] = combined_results['fc'] + combined_results['sd'] + combined_results['pso']  + combined_results['dso'] 

combined_results[['id', 'scalar_coupling_constant']].to_csv("./submission_results_rf.csv", index=False)

4658147
Model training complete
0.9983951836197491
                     importance
dist                      19371
mulliken_charge_y         12625
dist_center_3             10692
dist_center_4             10154
dist_center_2             10107
dist_center_5             10072
dist_center_7              9813
mulliken_charge_x          9790
dist_center_6              9777
dist_center_1              9729
dist_center_8              8779
dist_center_0              6662
dist_center_12             4924
dist_center_11             4373
dist_center_9              4277
dist_center_10             4232
dist_center_13             4090
dist_center_14             3608
dist_center_15             2919
dist_center_16             2349
3JHC                       2310
3JHH                       2287
2JHC                       1998
2JHH                       1948
('atom_number', 8)         1765
2JHN                       1754
dist_center_17             1695
dist_center_18             1188
JHC                  

NameError: name 'combined_results' is not defined

In [112]:
test[['dist','dist_atom0_0',
 'dist_atom1_0',
 'dist_atom0_1',
 'dist_atom1_1']].head()

Unnamed: 0,dist,dist_atom0_0,dist_atom1_0,dist_atom0_1,dist_atom1_1
0,2.261178,0.0,0.0,1.062099,1.062099
1,1.062099,0.0,0.0,1.062099,1.062099
2,1.062099,0.0,0.0,1.062099,1.062099
3,2.261178,0.0,0.0,1.062099,1.062099
4,3.323277,0.0,0.0,1.062099,1.062099


In [136]:
combined_results[['id', 'scalar_coupling_constant']].to_csv("./submission_results_rf.csv", index=False)

In [141]:
combined_results.head()

Unnamed: 0,id,scalar_coupling_constant
0,4658147,16.928673
1,4658148,186.086577
2,4658149,186.086577
3,4658150,16.928673
4,4658151,2.958189


In [105]:
test_results

Unnamed: 0,real,predict
372,2.353450,2.285658
373,2.350540,2.285658
374,10.988900,10.278970
375,-7.336370,-5.538503
376,-7.301610,-5.538503
377,-11.627100,-14.094773
378,-11.410300,-12.567588
379,-11.409100,-12.567588
380,3.439720,3.313866
381,12.524100,10.565710


In [29]:
def get_max_distance(train):
    train_min = train[['molecule_name','x_x', 'y_x', 'z_x', 'x_y', 'y_y', 'z_y']].groupby('molecule_name', as_index = False).min()
    train_max = train[['molecule_name','x_x', 'y_x', 'z_x', 'x_y', 'y_y','z_y']].groupby('molecule_name', as_index = False).max()
    train_min['x_min'] = train_min[['x_x', 'x_y']].min(axis = 1)
    train_min['y_min'] = train_min[['y_x', 'y_y']].min(axis = 1)
    train_min['z_min'] = train_min[['z_x', 'z_y']].min(axis = 1)
    train_max['x_max'] = train_max[['x_x', 'x_y']].max(axis = 1)
    train_max['y_max'] = train_max[['y_x', 'y_y']].max(axis = 1)
    train_max['z_max'] = train_max[['z_x', 'z_y']].max(axis = 1)
    train_min_max = pd.merge(train_min,train_max, on = 'molecule_name')
    train_min_max['max_x_distance'] = train_min_max['x_max'] - train_min_max['x_min']
    train_min_max['max_y_distance'] = train_min_max['y_max'] - train_min_max['y_min']
    train_min_max['max_z_distance'] = train_min_max['z_max'] - train_min_max['z_min']
    ans = pd.merge(train, train_min_max, on = 'molecule_name', how = 'left')
    return ans







In [14]:
train = pd.merge(train, scalar_coupling_contributions, on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'], how = 'left')
test = pd.merge(test, scalar_coupling_contributions, on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'], how = 'left')


In [None]:
unique_type = scalar_coupling_contributions["type"].unique()
for type in unique_type:
    train[train["type"] == type]["scalar_coupling_constant"].hist()  
for type in unique_type:
    scalar_coupling_contributions[scalar_coupling_contributions["type"] == type]["fc"].hist()   
    
for type in unique_type:
    scalar_coupling_contributions[scalar_coupling_contributions["type"] == type]["sd"].hist()  
    
for type in unique_type:
    scalar_coupling_contributions[scalar_coupling_contributions["type"] == type]["pso"].hist()   
for type in unique_type:
    scalar_coupling_contributions[scalar_coupling_contributions["type"] == type]["dso"].hist()   
            

In [None]:
def construct_distance(train, n_atom):
    dist_df = []
    for i in range(0, len(train)):
        dist_set = []
        core_atom_pair = [str(train['atom_index_0'][i]), str(train['atom_index_1'][i])]
        dist_set.append(((train["('x', "+ core_atom_pair[0] +")"][i] - train["('x', "+ core_atom_pair[1] +")"][i])**2
                                + (train["('y', "+ core_atom_pair[0] +")"][i] - train["('y', "+ core_atom_pair[1] +")"][i])**2
                                 +(train["('z', "+ core_atom_pair[0] +")"][i] - train["('z', "+ core_atom_pair[1] +")"][i])**2)**0.5)
        for res_atom in range(0, n_atom):
            res_atom = str(res_atom)
            if res_atom not in core_atom_pair:
                for core in core_atom_pair:
                    if train["('x', "+ res_atom +")"][i] == 0:
                        dist_set.append(0)
                    else:
                        dist_set.append(((train["('x', "+ core +")"][i] - train["('x', "+ res_atom +")"][i])**2
                            + (train["('y', "+ core +")"][i] - train["('y', "+ res_atom +")"][i])**2
                            + (train["('z', "+ core +")"][i] - train["('z', "+ res_atom +")"][i])**2)**0.5)
        dist_df.append(dist_set)
    dist = DataFrame(dist_df, columns = ['dist' + str(item) for item in range(0, (n_atom-2)*2+1)])
    train = pd.concat([train, dist], axis = 1)
    return train
