In [1]:
import os
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import math
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from tqdm import tqdm_notebook as tqdm
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(suppress=True)

In [None]:
gc.collect()

33

In [None]:
# Read Files 
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
structures = pd.read_csv('structures.csv')
contribut = pd.read_csv('scalar_coupling_contributions.csv')
molecfeat = pd.read_csv('molec_features_pywindow.csv')
obtrain = pd.read_csv('df_train_openbabel.csv')
obtest = pd.read_csv('df_test_openbabel.csv')
ob_mol_train = pd.read_csv('df_train_molec_ob.csv')
ob_mol_test = pd.read_csv('df_test_molec_ob.csv')
ob_charge_train = pd.read_csv('train_ob_charges.csv')
ob_charge_test = pd.read_csv('test_ob_charges.csv')
df_dipole_train = pd.read_csv('df_dipole_train.csv')
df_dipole_test = pd.read_csv('df_dipole_test.csv')

In [None]:
structures.head()

In [None]:
#contribut['sd_pso_dso'] =  contribut.apply(lambda x: x['sd'] + x['dso'] + x['pso'], axis=1)

In [None]:
#df = df.merge(contribut[['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'dso']], how = 'left', on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])
#df = df.merge(contribut[['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'sd_pso_dso']], how = 'left', on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

In [None]:
#df[['scalar_coupling_constant','dso']].corr(method='pearson')
#df[['scalar_coupling_constant','sd_pso_dso']].corr(method='pearson')

In [None]:
df = df.merge(molecfeat, on = 'molecule_name')
df = df.merge(obtrain, on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])
df = df.merge(ob_mol_train, on = 'molecule_name')
df = df.merge(df_dipole_train, on = 'molecule_name')
df_test = df_test.merge(molecfeat, on = 'molecule_name')
df_test = df_test.merge(obtest, on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])
df_test = df_test.merge(ob_mol_test, on = 'molecule_name')
df_test = df_test.merge(df_dipole_test, on = 'molecule_name')

In [None]:
df_test.head()

In [None]:
df.head()

In [None]:
def prepare_structures(structures):

        # Caluculate electronegativity and radius for atoms in structures
        atomic_radius = {'H':0.38, 'C':0.77, 'N':0.75, 'O':0.73, 'F':0.71} # Without fudge factor

        fudge_factor = 0.05
        atomic_radius = {k:v + fudge_factor for k,v in atomic_radius.items()}
        print(atomic_radius)

        electronegativity = {'H':2.2, 'C':2.55, 'N':3.04, 'O':3.44, 'F':3.98}

        #structures = pd.read_csv(structures, dtype={'atom_index':np.int8})

        atoms = structures['atom'].values
        atoms_en = [electronegativity[x] for x in tqdm(atoms)]
        atoms_rad = [atomic_radius[x] for x in tqdm(atoms)]

        structures['EN'] = atoms_en
        structures['rad'] = atoms_rad

        display(structures.head())

        #Chemical Bond Calculation
        i_atom = structures['atom_index'].values
        p = structures[['x', 'y', 'z']].values
        p_compare = p
        m = structures['molecule_name'].values
        m_compare = m
        r = structures['rad'].values
        r_compare = r

        source_row = np.arange(len(structures))
        max_atoms = 28

        bonds = np.zeros((len(structures)+1, max_atoms+1), dtype=np.int8)
        bond_dists = np.zeros((len(structures)+1, max_atoms+1), dtype=np.float32)

        print('Calculating bonds')

        for i in tqdm(range(max_atoms-1)):
            p_compare = np.roll(p_compare, -1, axis=0)
            m_compare = np.roll(m_compare, -1, axis=0)
            r_compare = np.roll(r_compare, -1, axis=0)

            mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
            dists = np.linalg.norm(p - p_compare, axis=1) * mask
            r_bond = r + r_compare

            bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)

            source_row = source_row
            target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
            target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row) #If invalid target, write to dummy row

            source_atom = i_atom
            target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
            target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col

            bonds[(source_row, target_atom)] = bond
            bonds[(target_row, source_atom)] = bond
            bond_dists[(source_row, target_atom)] = dists
            bond_dists[(target_row, source_atom)] = dists

        bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
        bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
        bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
        bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col

        print('Counting and condensing bonds')

        bonds_numeric = [[i for i,x in enumerate(row) if x] for row in tqdm(bonds)]
        bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate(tqdm(bond_dists))]
        bond_lengths_mean = [ np.mean(x) for x in bond_lengths]
        bond_lengths_std = [ np.std(x) for x in bond_lengths]
        n_bonds = [len(x) for x in bonds_numeric]

        #bond_data = {'bond_' + str(i):col for i, col in enumerate(np.transpose(bonds))}
        #bond_data.update({'bonds_numeric':bonds_numeric, 'n_bonds':n_bonds})

        bond_data = {'n_bonds':n_bonds, 'bond_lengths_mean': bond_lengths_mean,'bond_lengths_std':bond_lengths_std }
        bond_df = pd.DataFrame(bond_data)
        structures = structures.join(bond_df)
        
        # Calculate Gradient 
        coord = structures[['x', 'y', 'z']].values
        structures['Gradient'] = [sum(np.gradient(v)) for v in coord]
        
        display(structures.head(20))
        
        return structures

# Map atom info from structures
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}',
                            'EN': f'EN_{atom_idx}',
                            'rad': f'rad_{atom_idx}',
                            'n_bonds':f'n_bonds_{atom_idx}',
                            'bond_lengths_mean': f'bond_lengths_mean_{atom_idx}',
                            'bond_lengths_std': f'bond_lengths_std_{atom_idx}',
                            'Gradient' : f'Gradient_{atom_idx}'})
    return df

def map_atom_charges(df, df_charges, atom_idx):
    df = pd.merge(df, df_charges, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'eem': f'eem_charge_{atom_idx}', 
                            'mmff94':f'mmff94_charge_{atom_idx}',
                            'gasteiger':f'gasteiger_charge_{atom_idx}', 
                            'qeq':f'qeq_charge_{atom_idx}', 
                            'qtpie':f'qtpie_charge_{atom_idx}', 
                            'eem2015ha':f'eem2015ha_charge_{atom_idx}', 
                            'eem2015hm':f'eem2015hm_charge_{atom_idx}', 
                            'eem2015hn':f'eem2015hn_charge_{atom_idx}', 
                            'eem2015ba':f'eem2015ba_charge_{atom_idx}', 
                            'eem2015bm':f'eem2015bm_charge_{atom_idx}', 
                            'eem2015bn':f'eem2015bn_charge_{atom_idx}' 
                             })
    return df


def create_features(df):   
    # Calculate distance between atoms
    t_p_0 = df[['x_0', 'y_0', 'z_0']].values
    t_p_1 = df[['x_1', 'y_1', 'z_1']].values

    df['dist'] = np.linalg.norm(t_p_0 - t_p_1, axis=1)
    df['dist_x'] = (df['x_0'] - df['x_1']) ** 2
    df['dist_y'] = (df['y_0'] - df['y_1']) ** 2
    df['dist_z'] = (df['z_0'] - df['z_1']) ** 2
    
    # First letter of type
    df['type_0'] = df['type'].apply(lambda x: x[0])
    
    # Calculate angle between atoms
    df['Angle']  = [np.arccos(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))) for v1,v2 in zip(t_p_0, t_p_1)]
    # Calculate dot product of 2 sets of coordinates
    df['dot_product_coordinates']  = [np.dot(v1, v2) for v1,v2 in zip(t_p_0, t_p_1)]
    # Calculate Euclidean length
    df['euclidean_length']  = [math.sqrt(sum(np.power(list(set(v1)-set(v2)), 2))) for v1,v2 in zip(t_p_0, t_p_1)] 
    
    
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['molecule_dist_std'] = df.groupby('molecule_name')['dist'].transform('std')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    
    df['molecule_angle_mean'] = df.groupby('molecule_name')['Angle'].transform('mean')
    df['molecule_angle_min'] = df.groupby('molecule_name')['Angle'].transform('min')
    df['molecule_angle_max'] = df.groupby('molecule_name')['Angle'].transform('max')
    df['molecule_dist_std'] = df.groupby('molecule_name')['Angle'].transform('std')
    
    # stats about chemical bonds bond 
    df['sum_bond_length_by_id'] = df['bond_lengths_mean_1'] + df['bond_lengths_mean_0']
    df['EN_sum_by_id'] = df['EN_0'] + df['EN_1']  
    
    df[f'molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df[f'molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df[f'molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df[f'molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df[f'molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df[f'molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df[f'molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df[f'molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df[f'molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df[f'molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df[f'molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df[f'molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df[f'molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df[f'molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df[f'molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df[f'molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df[f'molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df[f'molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    df[f'molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df[f'molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df[f'molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df[f'molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df[f'molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df[f'molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df[f'molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df[f'molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df[f'molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df[f'molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df[f'molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df[f'molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    df[f'molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df[f'molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df[f'molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df[f'molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    df[f'molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df[f'molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df[f'molecule_type_0_dist_std'] = df.groupby(['molecule_name', 'type_0'])['dist'].transform('std')
    df[f'molecule_type_0_dist_std_diff'] = df[f'molecule_type_0_dist_std'] - df['dist']
    df[f'molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
    df[f'molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
    df[f'molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
    df[f'molecule_type_dist_max'] = df.groupby(['molecule_name', 'type'])['dist'].transform('max')
    df[f'molecule_type_dist_min'] = df.groupby(['molecule_name', 'type'])['dist'].transform('min')
    df[f'molecule_type_dist_std'] = df.groupby(['molecule_name', 'type'])['dist'].transform('std')
    df[f'molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']
    
    # Number of bonds in one molecule type
    df[f'number_of_bonds_molecule_type'] = df.groupby(['molecule_name', 'type'])['id'].transform('count')
    df[f'atom_1_couples_type_count'] = df.groupby(['molecule_name', 'type','atom_index_1'])['id'].transform('count')
    df[f'atom_0_couples_type_count'] = df.groupby(['molecule_name', 'type','atom_index_0'])['id'].transform('count')
    
    # Count number of different atoms in the molecule by type
    y = df[['molecule_name', 'type', 'atom_index_0', 'atom_0']].rename(columns={'atom_index_0': 'atom_index', 'atom_0' : 'atom'})
    x = df[['molecule_name', 'type', 'atom_index_1', 'atom_1']].rename(columns={'atom_index_1': 'atom_index', 'atom_1' : 'atom'})
    xy = pd.concat([y,x])
    xy_by_atom = xy.groupby(['molecule_name', 'type', 'atom_index','atom']).count().reset_index()
    xy_by_atom = xy_by_atom.groupby(['molecule_name', 'type', 'atom'])['atom_index'].count().reset_index()
    xy_by_atom = pd.pivot_table(xy_by_atom, values='atom_index', index=['molecule_name', 'type'], columns=['atom'])
    df = df.merge(xy_by_atom, on = ['molecule_name', 'type'])
    # Total number of atoms in molecule by type
    xy_agg = xy.groupby(['molecule_name', 'type', 'atom_index','atom']).count().reset_index()
    xy_agg = xy_agg.groupby(['molecule_name', 'type'])['atom_index'].count().reset_index().rename(columns = {'atom_index':'atom_count'})
    df = df.merge(xy_agg, on = ['molecule_name', 'type'])
    
    # Sum up electronegativity, bond length
    df['C'].fillna(0, inplace=True)
    df['H'].fillna(0, inplace=True)
    df['N'].fillna(0, inplace=True)
    df['sum_electronegat_by_molecule'] = df['C'] * 2.55 + df['H'] * 2.2 + df['N'] * 3.04
    
    # Ratios with molecule features
    #df['en_0_over_poreVol'] = df['EN_0']/df['poreVol']
    #df['en_1_over_poreVol'] = df['EN_1']/df['poreVol']
    #df['rad_0_over_poreVol'] = df['rad_0']/df['poreVol']
    #df['rad_1_over_poreVol'] = df['rad_1']/df['poreVol']
    
    #df['dist_over_poreVol'] = df['dist']/df['poreVol']
    #df['dist_x_over_poreVol'] = df['dist_x']/df['poreVol']
    #df['dist_y_over_poreVol'] = df['dist_y']/df['poreVol']
    #df['dist_z_over_poreVol'] = df['dist_z']/df['poreVol']
    
    #df['dot_prod_coord_over_poreVol'] = df['dot_product_coordinates']/df['poreVol']
    #df['bond_lengths_mean_0_over_poreVol'] = df['bond_lengths_mean_0']/df['poreVol']
    #df['bond_lengths_mean_1_over_poreVol'] = df['bond_lengths_mean_1']/df['poreVol']
    #df['bond_lengths_std_0_over_poreVol'] = df['bond_lengths_std_0']/df['poreVol']
    #df['bond_lengths_std_1_over_poreVol'] = df['bond_lengths_std_1']/df['poreVol']
    #df['Gradient_0_over_poreVol'] = df['Gradient_0']/df['poreVol']
    #df['Gradient_1_over_poreVol'] = df['Gradient_1']/df['poreVol']
    
    
    
    #df = reduce_mem_usage(df)
    return df

In [None]:
def map_closest_atom_info(df_1,df_2, atom_idx):
    df = pd.merge(df_1, df_2, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)

    return df



def create_closest(df):
    df_temp=df.loc[:,["molecule_name","atom_index_0","atom_index_1","dist","x_0","y_0","z_0","x_1","y_1","z_1"]].copy()
    df_temp_=df_temp.copy()
    df_temp_= df_temp_.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})
    df_temp=pd.concat(objs=[df_temp,df_temp_],axis=0)

    df_temp["min_distance"]=df_temp.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df_temp= df_temp[df_temp["min_distance"]==df_temp["dist"]]

    df_temp=df_temp.drop(['x_0','y_0','z_0','min_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                     'atom_index_1': 'atom_index_closest',
                                     'distance': 'distance_closest',
                                     'x_1': 'x_closest',
                                     'y_1': 'y_closest',
                                     'z_1': 'z_closest'})
     
    for atom_idx in [0,1]:
        df = map_closest_atom_info(df,df_temp, atom_idx)
        df = df.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                            'distance_closest': f'distance_closest_{atom_idx}',
                                            'x_closest': f'x_closest_{atom_idx}',
                                            'y_closest': f'y_closest_{atom_idx}',
                                            'z_closest': f'z_closest_{atom_idx}'})
    return df

def add_cos_features(df):
    
    df["distance_0"]=((df['x_0']-df['x_closest_0'])**2+(df['y_0']-df['y_closest_0'])**2+(df['z_0']-df['z_closest_0'])**2)**(1/2)
    df["distance_1"]=((df['x_1']-df['x_closest_1'])**2+(df['y_1']-df['y_closest_1'])**2+(df['z_1']-df['z_closest_1'])**2)**(1/2)
    df["vec_0_x"]=(df['x_0']-df['x_closest_0'])/df["distance_0"]
    df["vec_0_y"]=(df['y_0']-df['y_closest_0'])/df["distance_0"]
    df["vec_0_z"]=(df['z_0']-df['z_closest_0'])/df["distance_0"]
    df["vec_1_x"]=(df['x_1']-df['x_closest_1'])/df["distance_1"]
    df["vec_1_y"]=(df['y_1']-df['y_closest_1'])/df["distance_1"]
    df["vec_1_z"]=(df['z_1']-df['z_closest_1'])/df["distance_1"]
    df["vec_x"]=(df['x_1']-df['x_0'])/df["dist"]
    df["vec_y"]=(df['y_1']-df['y_0'])/df["dist"]
    df["vec_z"]=(df['z_1']-df['z_0'])/df["dist"]
    df["cos_0_1"]=df["vec_0_x"]*df["vec_1_x"]+df["vec_0_y"]*df["vec_1_y"]+df["vec_0_z"]*df["vec_1_z"]
    df["cos_0"]=df["vec_0_x"]*df["vec_x"]+df["vec_0_y"]*df["vec_y"]+df["vec_0_z"]*df["vec_z"]
    df["cos_1"]=df["vec_1_x"]*df["vec_x"]+df["vec_1_y"]*df["vec_y"]+df["vec_1_z"]*df["vec_z"]
    df=df.drop(['vec_0_x','vec_0_y','vec_0_z','vec_1_x','vec_1_y','vec_1_z','vec_x','vec_y','vec_z'], axis=1)
    
    return df

In [None]:
# Prepare structures
structures = prepare_structures(structures)

#Map structures data
df_train = map_atom_info(df, 0)
df_train = map_atom_info(df_train, 1)
df_test = map_atom_info(df_test, 0)
df_test = map_atom_info(df_test, 1)

#Map charges
df_train = map_atom_charges(df_train, ob_charge_train, 0)
df_train = map_atom_charges(df_train, ob_charge_train, 1)
df_test = map_atom_charges(df_test, ob_charge_test, 0)
df_test = map_atom_charges(df_test, ob_charge_test, 1)

# # Calculate distance between atoms
# train_p_0 = df_train[['x_0', 'y_0', 'z_0']].values
# train_p_1 = df_train[['x_1', 'y_1', 'z_1']].values
# test_p_0 = df_test[['x_0', 'y_0', 'z_0']].values
# test_p_1 = df_test[['x_1', 'y_1', 'z_1']].values

    
# df_train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
# df_test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
# df_train['dist_x'] = (df_train['x_0'] - df_train['x_1']) ** 2
# df_test['dist_x'] = (df_test['x_0'] - df_test['x_1']) ** 2
# df_train['dist_y'] = (df_train['y_0'] - df_train['y_1']) ** 2
# df_test['dist_y'] = (df_test['y_0'] - df_test['y_1']) ** 2
# df_train['dist_z'] = (df_train['z_0'] - df_train['z_1']) ** 2
# df_test['dist_z'] = (df_test['z_0'] - df_test['z_1']) ** 2

# df_train['type_0'] = df_train['type'].apply(lambda x: x[0])
# df_test['type_0'] = df_test['type'].apply(lambda x: x[0])

# df_train['Angle']  = [np.arccos(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))) for v1,v2 in zip(train_p_0, train_p_1)]
# df_test['Angle']  = [np.arccos(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))) for v1,v2 in zip(test_p_0, test_p_1)]

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
df_train = create_features(df_train)
df_test = create_features(df_test)

df_train = create_closest(df_train)
df_test = create_closest(df_test)

df_train = add_cos_features(df_train)
df_test = add_cos_features(df_test)

In [None]:
#df_train = df_train.merge(contribut[['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'fc']], how = 'left', on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

In [None]:
# Encode Categorical Variables
for f in ['type_0', 'atom_0', 'atom_1', 'bond_atom', 'tertiary_atom_0', 
         'tertiary_atom_1', 'tertiary_atom_2', 'tertiary_atom_3', 'tertiary_atom_4', 'tertiary_atom_5', 'tertiary_atom_6', 'tertiary_atom_7',
         'tertiary_atom_8', 'tertiary_atom_9', 'tertiary_atom_10', 'tertiary_atom_11', 'tertiary_atom_12', 'tertiary_atom_13', 'tertiary_atom_14', 
         'tertiary_atom_15', 'tertiary_atom_16', 'tertiary_atom_17', 'tertiary_atom_18', 'tertiary_atom_19', 'tertiary_atom_20', 'tertiary_atom_21', 
         'tertiary_atom_22', 'tertiary_atom_23', 'tertiary_atom_24', 'tertiary_atom_25', 'tertiary_atom_26']:
    lbl = LabelEncoder()
    lbl.fit(list(df_train[f].values) + list(df_train[f].values))
    df_train[f] = lbl.transform(list(df_train[f].values))
    df_test[f] = lbl.transform(list(df_test[f].values))

In [None]:
#X = df_train.drop(['id', 'molecule_name', 'scalar_coupling_constant'], axis=1)
#Y = df_train['scalar_coupling_constant']
#X_test = df_test.drop(['id', 'molecule_name'], axis=1)

In [None]:
# def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
#     maes = (y_true-y_pred).abs().groupby(types).mean()
#     totlog = np.log(maes)
#     return totlog.mean()

In [None]:
# Set up folds
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)

In [67]:
# Model Params
params = {'num_leaves': 128,
          'min_child_samples': 50,
          'min_data_in_leaf' : 20,  
          'objective': 'huber',
          'n_estimators': 5000,
          'max_depth': 9,
          'learning_rate': 0.1,
          "boosting_type": "gbdt",
          "subsample_freq": 1,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 0.1,
          'reg_lambda': 0.3,
          'colsample_bytree': 1.0
         }

In [23]:
# Parameters from Kernel for 0.92
# params = {'num_leaves': 50,
#           'min_child_samples': 79,
#           'min_data_in_leaf': 100,
#           'objective': 'regression',
#           'max_depth': 9,
#           'learning_rate': 0.2,
#           "boosting_type": "gbdt",
#           "subsample_freq": 1,
#           "subsample": 0.9,
#           "bagging_seed": 11,
#           "metric": 'mae',
#           "verbosity": -1,
#           'reg_alpha': 0.1,
#           'reg_lambda': 0.3,
#           'colsample_bytree': 1.0
#          }

In [24]:
#dict_score = dict()
#submission_list = list()

In [25]:
gc.collect()

142

In [26]:
#df_test['Angle'].describe().apply(lambda x: format(x, 'f'))
#df_train = df_train.merge(contribut[['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'sd_pso_dso']], how = 'left', on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

In [27]:
# Predict SD & PSO & DSO - a contributor to the coupling scalar 

# for moltype in df_train['type'].unique():
#     df_train_type = df_train[df_train.type == moltype].reset_index(drop = True)
#     df_test_type = df_test[df_test.type == moltype].reset_index(drop = True)
#     #df_train_type = df_train_type.drop(['type'],  axis=1)
#     #df_test_type = df_test_type.drop(['type'],  axis=1)
#     X = df_train_type.drop(['id', 'molecule_name', 'type','scalar_coupling_constant', 'sd_pso_dso'], axis=1)
#     Y = df_train_type['sd_pso_dso']
#     X_test = df_test_type.drop(['id', 'molecule_name', 'type'], axis=1)
    
#     print(moltype)
    
#     scores = []
#     for i, (train_index, test_index) in enumerate(kf.split(df_train_type)):

#         # Create data for this fold
#         Y_train, Y_valid = Y.iloc[train_index].copy(), Y.iloc[test_index].copy()
#         X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
        
#         print( f'\nFold {moltype}: {i}')

#         fit_model = lgb.LGBMRegressor(**params)
#         fit_model.fit(X_train, Y_train)
#         pred = fit_model.predict(X_valid)
#         # Save validation predictions for this fold
#         print( "Group Log MAE: ", math.log(mean_absolute_error(Y_valid, pred)))
#         scores.append(math.log(mean_absolute_error(Y_valid, pred)))

#     print(f'The mean score of a model for {moltype} is: {np.mean(scores)}')
#     dict_score[moltype] = np.mean(scores)
    
#     submit_pred = fit_model.predict(X_test)
#     submit_pred= pd.DataFrame(submit_pred)
#     submit_pred.columns = ['sd_pso_dso']
#     sub = pd.concat([df_test_type[['molecule_name', 'atom_index_0', 'atom_index_1', 'type']], submit_pred], axis = 1)
#     submission_list.append(sub)
    
# print("Overall mean is ", np.array(list(dict_score.values())).mean())
    

In [28]:
#test_pred_dso = pd.concat(submission_list)
#df_test = df_test.merge(test_pred_dso, on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

In [29]:
df_train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,centrMass1,centrMass2,centrMass3,maxDiam,aveDiam,poreDiam,poreVol,poreDiamOpt,poreVolOpt,atom_0_hybrid,atom_0_mass,atom_0_partcharge,atom_0_valence,atom_1_hybrid,atom_1_mass,atom_1_partcharge,atom_1_valence,bond_atom,bond_distance,tertiary_angle_0,tertiary_angle_1,tertiary_angle_10,tertiary_angle_11,tertiary_angle_12,tertiary_angle_13,tertiary_angle_14,tertiary_angle_15,tertiary_angle_16,tertiary_angle_17,tertiary_angle_18,tertiary_angle_19,tertiary_angle_2,tertiary_angle_20,tertiary_angle_21,tertiary_angle_22,tertiary_angle_23,tertiary_angle_24,tertiary_angle_25,tertiary_angle_26,tertiary_angle_3,tertiary_angle_4,tertiary_angle_5,tertiary_angle_6,tertiary_angle_7,tertiary_angle_8,tertiary_angle_9,tertiary_atom_0,tertiary_atom_1,tertiary_atom_10,tertiary_atom_11,tertiary_atom_12,tertiary_atom_13,tertiary_atom_14,tertiary_atom_15,tertiary_atom_16,tertiary_atom_17,tertiary_atom_18,tertiary_atom_19,tertiary_atom_2,tertiary_atom_20,tertiary_atom_21,tertiary_atom_22,tertiary_atom_23,tertiary_atom_24,tertiary_atom_25,tertiary_atom_26,tertiary_atom_3,tertiary_atom_4,tertiary_atom_5,tertiary_atom_6,tertiary_atom_7,tertiary_atom_8,tertiary_atom_9,tertiary_distance_0,tertiary_distance_1,tertiary_distance_10,tertiary_distance_11,tertiary_distance_12,tertiary_distance_13,tertiary_distance_14,tertiary_distance_15,tertiary_distance_16,tertiary_distance_17,tertiary_distance_18,tertiary_distance_19,tertiary_distance_2,tertiary_distance_20,tertiary_distance_21,tertiary_distance_22,tertiary_distance_23,tertiary_distance_24,tertiary_distance_25,tertiary_distance_26,tertiary_distance_3,tertiary_distance_4,tertiary_distance_5,tertiary_distance_6,tertiary_distance_7,tertiary_distance_8,tertiary_distance_9,totalatoms,molecule_dimension,molecule_energy,molecule_exact_mass,molecule_total_charge,molecule_total_spin_mult,molecule_wt,dipole_eem2015ba_x,dipole_eem2015ba_y,dipole_eem2015ba_z,dipole_eem2015bm_x,dipole_eem2015bm_y,dipole_eem2015bm_z,dipole_eem2015bn_x,dipole_eem2015bn_y,dipole_eem2015bn_z,dipole_eem2015ha_x,dipole_eem2015ha_y,dipole_eem2015ha_z,dipole_eem2015hm_x,dipole_eem2015hm_y,dipole_eem2015hm_z,dipole_eem2015hn_x,dipole_eem2015hn_y,dipole_eem2015hn_z,dipole_eem_x,dipole_eem_y,dipole_eem_z,dipole_gasteiger_x,dipole_gasteiger_y,dipole_gasteiger_z,dipole_mmff94_x,dipole_mmff94_y,dipole_mmff94_z,dipole_qeq_x,dipole_qeq_y,dipole_qeq_z,dipole_qtpie_x,dipole_qtpie_y,dipole_qtpie_z,moment_eem,moment_eem2015ba,moment_eem2015bm,moment_eem2015bn,moment_eem2015ha,moment_eem2015hm,moment_eem2015hn,moment_gasteiger,moment_mmff94,moment_qeq,moment_qtpie,atom_0,x_0,y_0,z_0,EN_0,rad_0,n_bonds_0,bond_lengths_mean_0,bond_lengths_std_0,Gradient_0,atom_1,x_1,y_1,z_1,EN_1,rad_1,n_bonds_1,bond_lengths_mean_1,bond_lengths_std_1,Gradient_1,eem_charge_0,mmff94_charge_0,gasteiger_charge_0,qeq_charge_0,qtpie_charge_0,eem2015ha_charge_0,eem2015hm_charge_0,eem2015hn_charge_0,eem2015ba_charge_0,eem2015bm_charge_0,eem2015bn_charge_0,eem_charge_1,mmff94_charge_1,gasteiger_charge_1,qeq_charge_1,qtpie_charge_1,eem2015ha_charge_1,eem2015hm_charge_1,eem2015hn_charge_1,eem2015ba_charge_1,eem2015bm_charge_1,eem2015bn_charge_1,dist_x,dist_x.1,dist_y,dist_z,type_0,Angle,dot_product_coordinates,euclidean_length,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,molecule_dist_std,atom_0_couples_count,atom_1_couples_count,molecule_angle_mean,molecule_angle_min,molecule_angle_max,sum_bond_length_by_id,EN_sum_by_id,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff,number_of_bonds_molecule_type,atom_1_couples_type_count,atom_0_couples_type_count,C,H,N,atom_count,sum_electronegat_by_molecule,atom_index_closest_0,dist_y.1,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,dist,x_closest_1,y_closest_1,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.808,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,3,12.011,-0.078,4,3,1.092,1.911,1.911,,,,,,,,,,,1.911,,,,,,,,,,,,,,,6,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.783,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.002,-0.006,0.002,2.2,0.43,1,1.092,0.0,-0.0,0,-0.013,1.086,0.008,2.55,0.82,4,1.092,0.0,0.031,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,-0.645,0.0,-0.078,3.251,-3.094,0.015,-0.813,-0.785,-0.067,-0.806,-0.851,1.092,0.0,1.192,0.0,0,2.694,-0.007,0.007,10,1.507,1.092,1.783,0.912,4,4,1.509,0.617,2.924,2.184,4.75,0.728,1.359,0.273,1.251,1.464,0.378,0.182,0.728,1.61,0.518,1.475,1.783,0.691,1.633,1.092,0.0,1.0,0.346,-0.746,0.316,1.092,-0.0,1.0,1.092,0.0,1.0,1.092,-0.0,1.0,0.0,-1.092,0.0,1.092,1.092,-0.0,1.0,0.0,-1.092,0.0,-1.092,1.092,-0.0,1.0,1.092,1.092,0.0,-1.092,4,4,1,1.0,4.0,0.0,5,11.35,0,1.092,-0.013,1.086,0.008,3,1.092,-0.541,1.448,-0.877,1.092,1.092,0.333,-1.0,-0.333
1,4,dsgdb9nsd_000001,2,0,1JHC,84.807,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,3,12.011,-0.078,4,3,1.092,1.911,1.911,,,,,,,,,,,1.911,,,,,,,,,,,,,,,6,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.783,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.012,1.464,0.0,2.2,0.43,1,1.092,0.0,-1.517,0,-0.013,1.086,0.008,2.55,0.82,4,1.092,0.0,0.031,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,-0.645,0.0,-0.078,3.251,-3.094,0.015,-0.813,-0.785,-0.067,-0.806,-0.851,1.092,1.049,0.143,0.0,0,0.617,1.577,1.779,10,1.507,1.092,1.783,0.912,3,4,1.509,0.617,2.924,2.184,4.75,0.3,1.324,0.238,1.219,1.448,0.362,0.206,0.892,1.553,0.461,1.422,1.783,0.691,1.633,1.092,0.0,1.0,0.399,-0.693,0.365,1.092,-0.0,1.0,1.092,0.0,1.0,1.092,-0.0,1.0,0.0,-1.092,0.0,1.092,1.092,-0.0,1.0,0.0,-1.092,0.0,-1.092,1.092,-0.0,1.0,1.092,1.092,0.0,-1.092,4,4,1,1.0,4.0,0.0,5,11.35,0,1.092,-0.013,1.086,0.008,3,1.092,-0.541,1.448,-0.877,1.092,1.092,0.333,-1.0,-0.333
2,7,dsgdb9nsd_000001,3,0,1JHC,84.809,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,3,12.011,-0.078,4,3,1.092,1.911,1.911,,,,,,,,,,,1.911,,,,,,,,,,,,,,,6,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.783,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-0.541,1.448,-0.877,2.2,0.43,1,1.092,0.0,-0.504,0,-0.013,1.086,0.008,2.55,0.82,4,1.092,0.0,0.031,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,-0.645,0.0,-0.078,3.251,-3.094,0.015,-0.813,-0.785,-0.067,-0.806,-0.851,1.092,0.279,0.131,0.783,0,0.619,1.572,1.777,10,1.507,1.092,1.783,0.912,2,4,1.509,0.617,2.924,2.184,4.75,0.361,1.262,0.176,1.162,1.438,0.352,0.249,0.635,1.438,0.346,1.316,1.783,0.691,1.633,1.092,0.0,1.0,0.489,-0.603,0.448,1.092,0.0,1.0,1.092,0.0,1.0,1.092,0.0,1.0,0.0,-1.092,0.0,1.092,1.092,0.0,1.0,0.0,-1.092,0.0,-1.092,1.092,0.0,1.0,1.092,1.092,0.0,-1.092,4,4,1,1.0,4.0,0.0,5,11.35,0,1.092,-0.013,1.086,0.008,3,1.092,-0.541,1.448,-0.877,1.092,1.092,-1.0,-1.0,1.0
3,9,dsgdb9nsd_000001,4,0,1JHC,84.809,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,3,12.011,-0.078,4,3,1.092,1.911,1.911,,,,,,,,,,,1.911,,,,,,,,,,,,,,,6,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.783,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-0.524,1.438,0.906,2.2,0.43,1,1.092,0.0,2.145,0,-0.013,1.086,0.008,2.55,0.82,4,1.092,0.0,0.031,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,-0.645,0.0,-0.078,3.251,-3.094,0.015,-0.813,-0.785,-0.067,-0.806,-0.851,1.092,0.261,0.124,0.807,0,0.617,1.575,1.779,10,1.507,1.092,1.783,0.912,1,4,1.509,0.617,2.924,2.184,4.75,,1.086,0.0,1.0,1.086,0.0,,,1.092,0.0,1.0,1.092,0.0,1.0,1.092,0.0,1.0,,,,1.092,0.0,1.0,1.092,0.0,1.0,1.092,-0.0,1.0,0.0,-1.092,0.0,1.092,1.092,-0.0,1.0,0.0,-1.092,0.0,-1.092,1.092,0.0,1.0,1.092,1.092,0.0,-1.092,4,4,1,1.0,4.0,0.0,5,11.35,0,1.092,-0.013,1.086,0.008,3,1.092,-0.541,1.448,-0.877,1.092,1.092,0.333,-1.0,-0.333
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,0,1.008,0.019,1,6,1.783,0.616,1.047,,,,,,,,,,,1.047,,,,,,,,,,,,,,,3,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.092,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.002,-0.006,0.002,2.2,0.43,1,1.092,0.0,-0.0,1,1.012,1.464,0.0,2.2,0.43,1,1.092,0.0,-1.517,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,1.783,1.019,2.16,0.0,1,2.163,-0.007,0.007,10,1.507,1.092,1.783,0.912,4,1,1.509,0.617,2.924,2.184,4.4,0.728,1.359,-0.105,0.928,1.464,0.0,0.182,0.728,1.61,-0.173,0.903,1.783,0.0,1.0,1.092,-0.691,0.612,0.346,-1.438,0.194,1.783,0.0,1.0,1.783,0.0,1.0,1.783,0.0,1.0,,,,1.783,1.783,0.0,1.0,0.0,-1.783,0.0,-1.783,1.783,0.0,1.0,1.783,1.783,0.0,-1.783,6,1,3,0.0,4.0,0.0,4,8.8,0,1.092,-0.013,1.086,0.008,0,1.092,-0.013,1.086,0.008,1.092,1.092,-0.333,-1.333,1.333


In [30]:
#test_pred_dso.to_csv('df_test_sd_pso_dso.csv', index = False)

In [31]:
# Predict FC - a contributor to the coupling scalar 

# dict_score = dict()
# submission_list = list()

# for moltype in df_train['type'].unique():
#     df_train_type = df_train[df_train.type == moltype].reset_index(drop = True)
#     df_test_type = df_test[df_test.type == moltype].reset_index(drop = True)
#     #df_train_type = df_train_type.drop(['type'],  axis=1)
#     #df_test_type = df_test_type.drop(['type'],  axis=1)
#     X = df_train_type.drop(['id', 'molecule_name', 'type','scalar_coupling_constant', 'fc'], axis=1)
#     Y = df_train_type['fc']
#     X_test = df_test_type.drop(['id', 'molecule_name', 'type'], axis=1)
    
#     print(moltype)
    
#     scores = []
#     for i, (train_index, test_index) in enumerate(kf.split(df_train_type)):

#         # Create data for this fold
#         Y_train, Y_valid = Y.iloc[train_index].copy(), Y.iloc[test_index].copy()
#         X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
        
#         print( f'\nFold {moltype}: {i}')

#         fit_model = lgb.LGBMRegressor(**params)
#         fit_model.fit(X_train, Y_train)
#         pred = fit_model.predict(X_valid)
#         # Save validation predictions for this fold
#         print( "Group Log MAE: ", math.log(mean_absolute_error(Y_valid, pred)))
#         scores.append(math.log(mean_absolute_error(Y_valid, pred)))

#     print(f'The mean score of a model for {moltype} is: {np.mean(scores)}')
#     dict_score[moltype] = np.mean(scores)
    
#     submit_pred = fit_model.predict(X_test)
#     submit_pred= pd.DataFrame(submit_pred)
#     submit_pred.columns = ['fc']
#     sub = pd.concat([df_test_type[['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type']], submit_pred], axis = 1)
#     submission_list.append(sub)
    
# print("Overall mean is ", np.array(list(dict_score.values())).mean())


In [32]:
# Join the predicted DSOs to the test data set
#test_pred_dso = pd.read_csv('df_test_sd_pso_dso.csv')
#df_test = df_test.merge(test_pred_dso, on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])
#test_pred_dso = pd.concat(submission_list)

In [33]:
#fc_predicted = pd.concat(submission_list)

In [34]:
#fc_predicted.to_csv('fc_predicted.csv', index = False)
#fc_predicted = pd.read_csv('fc_predicted.csv')

In [None]:
#df_test = df_test.merge(fc_predicted, on = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

In [None]:
#df_test[['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'sd_pso_dso']].to_csv('df_test_sd_pso_dso.csv', index = False)
df_test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,centrMass1,centrMass2,centrMass3,maxDiam,aveDiam,poreDiam,poreVol,poreDiamOpt,poreVolOpt,atom_0_hybrid,atom_0_mass,atom_0_partcharge,atom_0_valence,atom_1_hybrid,atom_1_mass,atom_1_partcharge,atom_1_valence,bond_atom,bond_distance,tertiary_angle_0,tertiary_angle_1,tertiary_angle_10,tertiary_angle_11,tertiary_angle_12,tertiary_angle_13,tertiary_angle_14,tertiary_angle_15,tertiary_angle_16,tertiary_angle_17,tertiary_angle_18,tertiary_angle_19,tertiary_angle_2,tertiary_angle_20,tertiary_angle_21,tertiary_angle_22,tertiary_angle_23,tertiary_angle_24,tertiary_angle_25,tertiary_angle_26,tertiary_angle_3,tertiary_angle_4,tertiary_angle_5,tertiary_angle_6,tertiary_angle_7,tertiary_angle_8,tertiary_angle_9,tertiary_atom_0,tertiary_atom_1,tertiary_atom_10,tertiary_atom_11,tertiary_atom_12,tertiary_atom_13,tertiary_atom_14,tertiary_atom_15,tertiary_atom_16,tertiary_atom_17,tertiary_atom_18,tertiary_atom_19,tertiary_atom_2,tertiary_atom_20,tertiary_atom_21,tertiary_atom_22,tertiary_atom_23,tertiary_atom_24,tertiary_atom_25,tertiary_atom_26,tertiary_atom_3,tertiary_atom_4,tertiary_atom_5,tertiary_atom_6,tertiary_atom_7,tertiary_atom_8,tertiary_atom_9,tertiary_distance_0,tertiary_distance_1,tertiary_distance_10,tertiary_distance_11,tertiary_distance_12,tertiary_distance_13,tertiary_distance_14,tertiary_distance_15,tertiary_distance_16,tertiary_distance_17,tertiary_distance_18,tertiary_distance_19,tertiary_distance_2,tertiary_distance_20,tertiary_distance_21,tertiary_distance_22,tertiary_distance_23,tertiary_distance_24,tertiary_distance_25,tertiary_distance_26,tertiary_distance_3,tertiary_distance_4,tertiary_distance_5,tertiary_distance_6,tertiary_distance_7,tertiary_distance_8,tertiary_distance_9,totalatoms,molecule_dimension,molecule_energy,molecule_exact_mass,molecule_total_charge,molecule_total_spin_mult,molecule_wt,dipole_eem2015ba_x,dipole_eem2015ba_y,dipole_eem2015ba_z,dipole_eem2015bm_x,dipole_eem2015bm_y,dipole_eem2015bm_z,dipole_eem2015bn_x,dipole_eem2015bn_y,dipole_eem2015bn_z,dipole_eem2015ha_x,dipole_eem2015ha_y,dipole_eem2015ha_z,dipole_eem2015hm_x,dipole_eem2015hm_y,dipole_eem2015hm_z,dipole_eem2015hn_x,dipole_eem2015hn_y,dipole_eem2015hn_z,dipole_eem_x,dipole_eem_y,dipole_eem_z,dipole_gasteiger_x,dipole_gasteiger_y,dipole_gasteiger_z,dipole_mmff94_x,dipole_mmff94_y,dipole_mmff94_z,dipole_qeq_x,dipole_qeq_y,dipole_qeq_z,dipole_qtpie_x,dipole_qtpie_y,dipole_qtpie_z,moment_eem,moment_eem2015ba,moment_eem2015bm,moment_eem2015bn,moment_eem2015ha,moment_eem2015hm,moment_eem2015hn,moment_gasteiger,moment_mmff94,moment_qeq,moment_qtpie,atom_0,x_0,y_0,z_0,EN_0,rad_0,n_bonds_0,bond_lengths_mean_0,bond_lengths_std_0,Gradient_0,atom_1,x_1,y_1,z_1,EN_1,rad_1,n_bonds_1,bond_lengths_mean_1,bond_lengths_std_1,Gradient_1,eem_charge_0,mmff94_charge_0,gasteiger_charge_0,qeq_charge_0,qtpie_charge_0,eem2015ha_charge_0,eem2015hm_charge_0,eem2015hn_charge_0,eem2015ba_charge_0,eem2015bm_charge_0,eem2015bn_charge_0,eem_charge_1,mmff94_charge_1,gasteiger_charge_1,qeq_charge_1,qtpie_charge_1,eem2015ha_charge_1,eem2015hm_charge_1,eem2015hn_charge_1,eem2015ba_charge_1,eem2015bm_charge_1,eem2015bn_charge_1,dist_x,dist_x.1,dist_y,dist_z,type_0,Angle,dot_product_coordinates,euclidean_length,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,molecule_dist_std,atom_0_couples_count,atom_1_couples_count,molecule_angle_mean,molecule_angle_min,molecule_angle_max,sum_bond_length_by_id,EN_sum_by_id,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff,number_of_bonds_molecule_type,atom_1_couples_type_count,atom_0_couples_type_count,C,H,N,atom_count,sum_electronegat_by_molecule,atom_index_closest_0,dist_y.1,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,dist,x_closest_1,y_closest_1,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1
0,4658147,dsgdb9nsd_000004,2,0,2JHC,0.0,0.0,1.0,5.503,3.912,-2.201,-5.582,-2.201,-5.582,0,1.008,0.123,1,1,12.011,-0.123,2,1,2.261,0.0,3.141,,,,,,,,,,,,,,,,,,,,,,,,,,1,7,22,21,21,21,19,19,19,18,14,12,21,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.062,3.323,,,,,,,,,,,,,,,,,,,,,,,,,,4,3,0.0,26.016,0,1,26.037,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-1.662,0.0,1.0,2.2,0.43,1,1.062,0.0,3.992,0,0.6,0.0,1.0,2.55,0.82,2,1.131,0.068,0.601,0.14,0.177,0.195,-0.162,0.105,-0.187,0.245,0.126,-0.188,0.233,0.126,-0.14,-0.177,-0.195,0.162,-0.105,0.187,-0.245,-0.126,0.188,-0.233,-0.126,2.261,5.113,0.0,0.0,1,1.569,0.004,1.662,5,1.994,1.062,3.323,0.71,3,2,1.235,0.489,2.058,2.193,4.75,1.131,0.0,0.0,,0.0,0.0,0.0,0.0,2.216,-0.046,0.98,3.323,1.062,1.47,1.062,-1.199,0.47,1.131,-1.13,0.5,1.662,-0.6,0.735,2.261,0.0,1.0,1.062,-1.199,0.47,0.848,-1.413,0.375,1.662,1.062,-1.199,0.47,0.692,-1.569,0.0,-2.261,2.261,0.0,1.0,2.261,2.261,0.0,-2.261,2,1,1,2.0,2.0,0.0,4,9.5,1,1.062,-0.6,0.0,1.0,3,1.062,1.662,0.0,1.0,1.062,1.062,1.0,-2.129,-2.129
1,4658151,dsgdb9nsd_000004,3,1,2JHC,0.0,0.0,1.0,5.503,3.912,-2.201,-5.582,-2.201,-5.582,0,1.008,0.123,1,1,12.011,-0.123,2,1,2.261,0.0,3.141,,,,,,,,,,,,,,,,,,,,,,,,,,1,7,22,21,21,21,19,19,19,18,14,12,21,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.062,3.323,,,,,,,,,,,,,,,,,,,,,,,,,,4,3,0.0,26.016,0,1,26.037,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.662,0.0,1.0,2.2,0.43,1,1.062,0.0,-0.992,0,-0.6,0.0,1.0,2.55,0.82,2,1.131,0.068,2.399,0.14,0.177,0.195,-0.162,0.105,-0.187,0.245,0.126,-0.188,0.233,0.126,-0.14,-0.177,-0.195,0.162,-0.105,0.187,-0.245,-0.126,0.188,-0.233,-0.126,2.261,5.113,0.0,0.0,1,1.569,0.004,1.662,5,1.994,1.062,3.323,0.71,2,2,1.235,0.489,2.058,2.193,4.75,0.848,0.0,0.0,,0.0,0.0,0.0,0.0,1.662,-0.6,0.735,2.261,0.0,1.0,1.062,-1.199,0.47,0.848,-1.413,0.375,1.662,-0.6,0.735,2.261,0.0,1.0,1.062,-1.199,0.47,0.848,-1.413,0.375,1.662,1.062,-1.199,0.47,0.692,-1.569,0.0,-2.261,2.261,0.0,1.0,2.261,2.261,0.0,-2.261,2,1,1,2.0,2.0,0.0,4,9.5,0,1.062,0.6,0.0,1.0,2,1.062,-1.662,0.0,1.0,1.062,1.062,1.0,-2.129,-2.129
2,4658148,dsgdb9nsd_000004,2,1,1JHC,0.0,0.0,1.0,5.503,3.912,-2.201,-5.582,-2.201,-5.582,0,1.008,0.123,1,1,12.011,-0.123,2,1,1.062,3.141,3.141,,,,,,,,,,,,,,,,,,,,,,,,,,1,7,22,21,21,21,19,19,19,18,14,12,21,10,9,5,4,3,2,1,22,22,22,22,22,22,22,2.261,3.323,,,,,,,,,,,,,,,,,,,,,,,,,,4,3,0.0,26.016,0,1,26.037,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-1.662,0.0,1.0,2.2,0.43,1,1.062,0.0,3.992,0,-0.6,0.0,1.0,2.55,0.82,2,1.131,0.068,2.399,0.14,0.177,0.195,-0.162,0.105,-0.187,0.245,0.126,-0.188,0.233,0.126,-0.14,-0.177,-0.195,0.162,-0.105,0.187,-0.245,-0.126,0.188,-0.233,-0.126,1.062,1.128,0.0,0.0,0,0.489,1.996,1.662,5,1.994,1.062,3.323,0.71,3,2,1.235,0.489,2.058,2.193,4.75,1.131,0.0,0.0,,0.0,0.0,0.0,0.0,2.216,1.153,2.086,3.323,2.261,3.129,1.062,0.0,1.0,1.131,0.069,1.065,1.662,0.6,1.564,2.261,1.199,2.129,1.062,0.0,1.0,0.848,-0.214,0.798,1.662,1.062,0.0,1.0,0.692,-0.37,0.0,-1.062,1.062,0.0,1.0,1.062,1.062,0.0,-1.062,2,1,1,2.0,2.0,0.0,4,9.5,1,1.062,-0.6,0.0,1.0,2,1.062,-1.662,0.0,1.0,1.062,1.062,-1.0,-1.0,1.0
3,4658150,dsgdb9nsd_000004,3,0,1JHC,0.0,0.0,1.0,5.503,3.912,-2.201,-5.582,-2.201,-5.582,0,1.008,0.123,1,1,12.011,-0.123,2,1,1.062,3.141,3.141,,,,,,,,,,,,,,,,,,,,,,,,,,1,7,22,21,21,21,19,19,19,18,14,12,21,10,9,5,4,3,2,1,22,22,22,22,22,22,22,2.261,3.323,,,,,,,,,,,,,,,,,,,,,,,,,,4,3,0.0,26.016,0,1,26.037,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.662,0.0,1.0,2.2,0.43,1,1.062,0.0,-0.992,0,0.6,0.0,1.0,2.55,0.82,2,1.131,0.068,0.601,0.14,0.177,0.195,-0.162,0.105,-0.187,0.245,0.126,-0.188,0.233,0.126,-0.14,-0.177,-0.195,0.162,-0.105,0.187,-0.245,-0.126,0.188,-0.233,-0.126,1.062,1.128,0.0,0.0,0,0.489,1.996,1.662,5,1.994,1.062,3.323,0.71,2,2,1.235,0.489,2.058,2.193,4.75,0.848,0.0,0.0,,0.0,0.0,0.0,0.0,1.662,0.6,1.564,2.261,1.199,2.129,1.062,0.0,1.0,0.848,-0.214,0.798,1.662,0.6,1.564,2.261,1.199,2.129,1.062,0.0,1.0,0.848,-0.214,0.798,1.662,1.062,0.0,1.0,0.692,-0.37,0.0,-1.062,1.062,0.0,1.0,1.062,1.062,0.0,-1.062,2,1,1,2.0,2.0,0.0,4,9.5,0,1.062,0.6,0.0,1.0,3,1.062,1.662,0.0,1.0,1.062,1.062,-1.0,-1.0,1.0
4,4658149,dsgdb9nsd_000004,2,3,3JHH,0.0,0.0,1.0,5.503,3.912,-2.201,-5.582,-2.201,-5.582,0,1.008,0.123,1,0,1.008,0.123,1,6,3.323,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,1,1,22,21,21,21,19,19,19,18,14,12,21,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.062,2.261,,,,,,,,,,,,,,,,,,,,,,,,,,4,3,0.0,26.016,0,1,26.037,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-1.662,0.0,1.0,2.2,0.43,1,1.062,0.0,3.992,1,1.662,0.0,1.0,2.2,0.43,1,1.062,0.0,-0.992,0.14,0.177,0.195,-0.162,0.105,-0.187,0.245,0.126,-0.188,0.233,0.126,0.14,0.177,0.195,-0.162,0.105,-0.187,0.245,0.126,-0.188,0.233,0.126,3.323,11.044,0.0,0.0,2,2.058,-1.761,1.662,5,1.994,1.062,3.323,0.71,3,1,1.235,0.489,2.058,2.124,4.4,1.131,0.0,0.0,,0.0,0.0,0.0,0.0,2.216,-1.108,0.667,3.323,0.0,1.0,1.062,-2.261,0.32,1.131,-2.192,0.34,3.323,0.0,1.0,3.323,0.0,1.0,3.323,0.0,1.0,,,,3.323,3.323,0.0,1.0,,,,,3.323,0.0,1.0,3.323,3.323,,,1,1,1,0.0,2.0,0.0,2,4.4,1,1.062,-0.6,0.0,1.0,0,1.062,0.6,0.0,1.0,1.062,1.062,-1.0,-3.129,3.129


In [None]:
df_train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,centrMass1,centrMass2,centrMass3,maxDiam,aveDiam,poreDiam,poreVol,poreDiamOpt,poreVolOpt,atom_0_hybrid,atom_0_mass,atom_0_partcharge,atom_0_valence,atom_1_hybrid,atom_1_mass,atom_1_partcharge,atom_1_valence,bond_atom,bond_distance,tertiary_angle_0,tertiary_angle_1,tertiary_angle_10,tertiary_angle_11,tertiary_angle_12,tertiary_angle_13,tertiary_angle_14,tertiary_angle_15,tertiary_angle_16,tertiary_angle_17,tertiary_angle_18,tertiary_angle_19,tertiary_angle_2,tertiary_angle_20,tertiary_angle_21,tertiary_angle_22,tertiary_angle_23,tertiary_angle_24,tertiary_angle_25,tertiary_angle_26,tertiary_angle_3,tertiary_angle_4,tertiary_angle_5,tertiary_angle_6,tertiary_angle_7,tertiary_angle_8,tertiary_angle_9,tertiary_atom_0,tertiary_atom_1,tertiary_atom_10,tertiary_atom_11,tertiary_atom_12,tertiary_atom_13,tertiary_atom_14,tertiary_atom_15,tertiary_atom_16,tertiary_atom_17,tertiary_atom_18,tertiary_atom_19,tertiary_atom_2,tertiary_atom_20,tertiary_atom_21,tertiary_atom_22,tertiary_atom_23,tertiary_atom_24,tertiary_atom_25,tertiary_atom_26,tertiary_atom_3,tertiary_atom_4,tertiary_atom_5,tertiary_atom_6,tertiary_atom_7,tertiary_atom_8,tertiary_atom_9,tertiary_distance_0,tertiary_distance_1,tertiary_distance_10,tertiary_distance_11,tertiary_distance_12,tertiary_distance_13,tertiary_distance_14,tertiary_distance_15,tertiary_distance_16,tertiary_distance_17,tertiary_distance_18,tertiary_distance_19,tertiary_distance_2,tertiary_distance_20,tertiary_distance_21,tertiary_distance_22,tertiary_distance_23,tertiary_distance_24,tertiary_distance_25,tertiary_distance_26,tertiary_distance_3,tertiary_distance_4,tertiary_distance_5,tertiary_distance_6,tertiary_distance_7,tertiary_distance_8,tertiary_distance_9,totalatoms,molecule_dimension,molecule_energy,molecule_exact_mass,molecule_total_charge,molecule_total_spin_mult,molecule_wt,dipole_eem2015ba_x,dipole_eem2015ba_y,dipole_eem2015ba_z,dipole_eem2015bm_x,dipole_eem2015bm_y,dipole_eem2015bm_z,dipole_eem2015bn_x,dipole_eem2015bn_y,dipole_eem2015bn_z,dipole_eem2015ha_x,dipole_eem2015ha_y,dipole_eem2015ha_z,dipole_eem2015hm_x,dipole_eem2015hm_y,dipole_eem2015hm_z,dipole_eem2015hn_x,dipole_eem2015hn_y,dipole_eem2015hn_z,dipole_eem_x,dipole_eem_y,dipole_eem_z,dipole_gasteiger_x,dipole_gasteiger_y,dipole_gasteiger_z,dipole_mmff94_x,dipole_mmff94_y,dipole_mmff94_z,dipole_qeq_x,dipole_qeq_y,dipole_qeq_z,dipole_qtpie_x,dipole_qtpie_y,dipole_qtpie_z,moment_eem,moment_eem2015ba,moment_eem2015bm,moment_eem2015bn,moment_eem2015ha,moment_eem2015hm,moment_eem2015hn,moment_gasteiger,moment_mmff94,moment_qeq,moment_qtpie,atom_0,x_0,y_0,z_0,EN_0,rad_0,n_bonds_0,bond_lengths_mean_0,bond_lengths_std_0,Gradient_0,atom_1,x_1,y_1,z_1,EN_1,rad_1,n_bonds_1,bond_lengths_mean_1,bond_lengths_std_1,Gradient_1,eem_charge_0,mmff94_charge_0,gasteiger_charge_0,qeq_charge_0,qtpie_charge_0,eem2015ha_charge_0,eem2015hm_charge_0,eem2015hn_charge_0,eem2015ba_charge_0,eem2015bm_charge_0,eem2015bn_charge_0,eem_charge_1,mmff94_charge_1,gasteiger_charge_1,qeq_charge_1,qtpie_charge_1,eem2015ha_charge_1,eem2015hm_charge_1,eem2015hn_charge_1,eem2015ba_charge_1,eem2015bm_charge_1,eem2015bn_charge_1,dist_x,dist_x.1,dist_y,dist_z,type_0,Angle,dot_product_coordinates,euclidean_length,molecule_couples,molecule_dist_mean,molecule_dist_min,molecule_dist_max,molecule_dist_std,atom_0_couples_count,atom_1_couples_count,molecule_angle_mean,molecule_angle_min,molecule_angle_max,sum_bond_length_by_id,EN_sum_by_id,molecule_atom_index_0_x_1_std,molecule_atom_index_0_y_1_mean,molecule_atom_index_0_y_1_mean_diff,molecule_atom_index_0_y_1_mean_div,molecule_atom_index_0_y_1_max,molecule_atom_index_0_y_1_max_diff,molecule_atom_index_0_y_1_std,molecule_atom_index_0_z_1_std,molecule_atom_index_0_dist_mean,molecule_atom_index_0_dist_mean_diff,molecule_atom_index_0_dist_mean_div,molecule_atom_index_0_dist_max,molecule_atom_index_0_dist_max_diff,molecule_atom_index_0_dist_max_div,molecule_atom_index_0_dist_min,molecule_atom_index_0_dist_min_diff,molecule_atom_index_0_dist_min_div,molecule_atom_index_0_dist_std,molecule_atom_index_0_dist_std_diff,molecule_atom_index_0_dist_std_div,molecule_atom_index_1_dist_mean,molecule_atom_index_1_dist_mean_diff,molecule_atom_index_1_dist_mean_div,molecule_atom_index_1_dist_max,molecule_atom_index_1_dist_max_diff,molecule_atom_index_1_dist_max_div,molecule_atom_index_1_dist_min,molecule_atom_index_1_dist_min_diff,molecule_atom_index_1_dist_min_div,molecule_atom_index_1_dist_std,molecule_atom_index_1_dist_std_diff,molecule_atom_index_1_dist_std_div,molecule_atom_1_dist_mean,molecule_atom_1_dist_min,molecule_atom_1_dist_min_diff,molecule_atom_1_dist_min_div,molecule_atom_1_dist_std,molecule_atom_1_dist_std_diff,molecule_type_0_dist_std,molecule_type_0_dist_std_diff,molecule_type_dist_mean,molecule_type_dist_mean_diff,molecule_type_dist_mean_div,molecule_type_dist_max,molecule_type_dist_min,molecule_type_dist_std,molecule_type_dist_std_diff,number_of_bonds_molecule_type,atom_1_couples_type_count,atom_0_couples_type_count,C,H,N,atom_count,sum_electronegat_by_molecule,atom_index_closest_0,dist_y.1,x_closest_0,y_closest_0,z_closest_0,atom_index_closest_1,dist,x_closest_1,y_closest_1,z_closest_1,distance_0,distance_1,cos_0_1,cos_0,cos_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.808,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,3,12.011,-0.078,4,3,1.092,1.911,1.911,,,,,,,,,,,1.911,,,,,,,,,,,,,,,6,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.783,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.002,-0.006,0.002,2.2,0.43,1,1.092,0.0,-0.0,0,-0.013,1.086,0.008,2.55,0.82,4,1.092,0.0,0.031,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,-0.645,0.0,-0.078,3.251,-3.094,0.015,-0.813,-0.785,-0.067,-0.806,-0.851,1.092,0.0,1.192,0.0,0,2.694,-0.007,0.007,10,1.507,1.092,1.783,0.912,4,4,1.509,0.617,2.924,2.184,4.75,0.728,1.359,0.273,1.251,1.464,0.378,0.182,0.728,1.61,0.518,1.475,1.783,0.691,1.633,1.092,0.0,1.0,0.346,-0.746,0.316,1.092,-0.0,1.0,1.092,0.0,1.0,1.092,-0.0,1.0,0.0,-1.092,0.0,1.092,1.092,-0.0,1.0,0.0,-1.092,0.0,-1.092,1.092,-0.0,1.0,1.092,1.092,0.0,-1.092,4,4,1,1.0,4.0,0.0,5,11.35,0,1.092,-0.013,1.086,0.008,3,1.092,-0.541,1.448,-0.877,1.092,1.092,0.333,-1.0,-0.333
1,4,dsgdb9nsd_000001,2,0,1JHC,84.807,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,3,12.011,-0.078,4,3,1.092,1.911,1.911,,,,,,,,,,,1.911,,,,,,,,,,,,,,,6,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.783,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.012,1.464,0.0,2.2,0.43,1,1.092,0.0,-1.517,0,-0.013,1.086,0.008,2.55,0.82,4,1.092,0.0,0.031,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,-0.645,0.0,-0.078,3.251,-3.094,0.015,-0.813,-0.785,-0.067,-0.806,-0.851,1.092,1.049,0.143,0.0,0,0.617,1.577,1.779,10,1.507,1.092,1.783,0.912,3,4,1.509,0.617,2.924,2.184,4.75,0.3,1.324,0.238,1.219,1.448,0.362,0.206,0.892,1.553,0.461,1.422,1.783,0.691,1.633,1.092,0.0,1.0,0.399,-0.693,0.365,1.092,-0.0,1.0,1.092,0.0,1.0,1.092,-0.0,1.0,0.0,-1.092,0.0,1.092,1.092,-0.0,1.0,0.0,-1.092,0.0,-1.092,1.092,-0.0,1.0,1.092,1.092,0.0,-1.092,4,4,1,1.0,4.0,0.0,5,11.35,0,1.092,-0.013,1.086,0.008,3,1.092,-0.541,1.448,-0.877,1.092,1.092,0.333,-1.0,-0.333
2,7,dsgdb9nsd_000001,3,0,1JHC,84.809,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,3,12.011,-0.078,4,3,1.092,1.911,1.911,,,,,,,,,,,1.911,,,,,,,,,,,,,,,6,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.783,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-0.541,1.448,-0.877,2.2,0.43,1,1.092,0.0,-0.504,0,-0.013,1.086,0.008,2.55,0.82,4,1.092,0.0,0.031,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,-0.645,0.0,-0.078,3.251,-3.094,0.015,-0.813,-0.785,-0.067,-0.806,-0.851,1.092,0.279,0.131,0.783,0,0.619,1.572,1.777,10,1.507,1.092,1.783,0.912,2,4,1.509,0.617,2.924,2.184,4.75,0.361,1.262,0.176,1.162,1.438,0.352,0.249,0.635,1.438,0.346,1.316,1.783,0.691,1.633,1.092,0.0,1.0,0.489,-0.603,0.448,1.092,0.0,1.0,1.092,0.0,1.0,1.092,0.0,1.0,0.0,-1.092,0.0,1.092,1.092,0.0,1.0,0.0,-1.092,0.0,-1.092,1.092,0.0,1.0,1.092,1.092,0.0,-1.092,4,4,1,1.0,4.0,0.0,5,11.35,0,1.092,-0.013,1.086,0.008,3,1.092,-0.541,1.448,-0.877,1.092,1.092,-1.0,-1.0,1.0
3,9,dsgdb9nsd_000001,4,0,1JHC,84.809,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,3,12.011,-0.078,4,3,1.092,1.911,1.911,,,,,,,,,,,1.911,,,,,,,,,,,,,,,6,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.783,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-0.524,1.438,0.906,2.2,0.43,1,1.092,0.0,2.145,0,-0.013,1.086,0.008,2.55,0.82,4,1.092,0.0,0.031,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,-0.645,0.0,-0.078,3.251,-3.094,0.015,-0.813,-0.785,-0.067,-0.806,-0.851,1.092,0.261,0.124,0.807,0,0.617,1.575,1.779,10,1.507,1.092,1.783,0.912,1,4,1.509,0.617,2.924,2.184,4.75,,1.086,0.0,1.0,1.086,0.0,,,1.092,0.0,1.0,1.092,0.0,1.0,1.092,0.0,1.0,,,,1.092,0.0,1.0,1.092,0.0,1.0,1.092,-0.0,1.0,0.0,-1.092,0.0,1.092,1.092,-0.0,1.0,0.0,-1.092,0.0,-1.092,1.092,0.0,1.0,1.092,1.092,0.0,-1.092,4,4,1,1.0,4.0,0.0,5,11.35,0,1.092,-0.013,1.086,0.008,3,1.092,-0.541,1.448,-0.877,1.092,1.092,0.333,-1.0,-0.333
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,-0.013,1.086,0.008,3.963,3.43,-3.4,-20.579,-3.4,-20.579,0,1.008,0.019,1,0,1.008,0.019,1,6,1.783,0.616,1.047,,,,,,,,,,,1.047,,,,,,,,,,,,,,,3,7,22,21,21,21,19,19,19,18,14,12,7,10,9,5,4,3,2,1,22,22,22,22,22,22,22,1.092,1.783,,,,,,,,,,,1.783,,,,,,,,,,,,,,,5,3,0.0,16.031,0,1,16.042,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.002,-0.006,0.002,2.2,0.43,1,1.092,0.0,-0.0,1,1.012,1.464,0.0,2.2,0.43,1,1.092,0.0,-1.517,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,0.161,0.0,0.019,-0.813,0.773,-0.004,0.203,0.196,0.017,0.202,0.213,1.783,1.019,2.16,0.0,1,2.163,-0.007,0.007,10,1.507,1.092,1.783,0.912,4,1,1.509,0.617,2.924,2.184,4.4,0.728,1.359,-0.105,0.928,1.464,0.0,0.182,0.728,1.61,-0.173,0.903,1.783,0.0,1.0,1.092,-0.691,0.612,0.346,-1.438,0.194,1.783,0.0,1.0,1.783,0.0,1.0,1.783,0.0,1.0,,,,1.783,1.783,0.0,1.0,0.0,-1.783,0.0,-1.783,1.783,0.0,1.0,1.783,1.783,0.0,-1.783,6,1,3,0.0,4.0,0.0,4,8.8,0,1.092,-0.013,1.086,0.008,0,1.092,-0.013,1.086,0.008,1.092,1.092,-0.333,-1.333,1.333


In [None]:
gridParams = {
    'max_depth': [5,7,9,12,15]
    }

In [None]:
def show_varimp(fit_model, mol_type):
        feature_imp = pd.DataFrame(sorted(zip(fit_model.feature_importances_,X.columns)), columns=['Value','Feature'])
        #plt.figure(figsize=(20, 10))
        #sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
        #plt.title(f'LightGBM Features of {mol_type} Model')
        #plt.tight_layout()
        #plt.show()
        #plt.savefig(f'varImp/lgbm_importances_{mol_type}.png')
        varimps = pd.DataFrame(feature_imp.sort_values(by="Value", ascending=False))
        varimps.to_csv(f'varImp/lgbm_importances_{mol_type}.csv', index = False)

In [None]:
gc.collect()

121

In [None]:
# Rerun the whole model with DSO as an input variable and coupling scalar as target 
dict_score = dict()
submission_list = list()

def train_model(df_train, df_test)
for moltype in df_train['type'].unique():
    varImp = pd.read_csv(f'varImp/lgbm_importances_{moltype}.csv')
    varImp = varImp[varImp.Value >= 200]
    varImpCount = varImp["Feature"].nunique() 
    varImpUnique = list(varImp['Feature'].unique())
    varImpUnique.extend(['id', 'molecule_name', 'scalar_coupling_constant', 'type'])
    
    df_train_type = df_train[df_train.type == moltype].reset_index(drop = True)
    df_train_type = df_train_type[varImpUnique]
    
    varImpUnique.remove('scalar_coupling_constant')
    
    df_test_type = df_test[df_test.type == moltype].reset_index(drop = True)
    df_test_type = df_test_type[varImpUnique]
    
    print(f'{varImpCount} features have been chosen for modeling of the {moltype} type')
    
    df_train_type = df_train_type.drop(['type'],  axis=1)
    df_test_type = df_test_type.drop(['type'],  axis=1)
    X = df_train_type.drop(['id', 'molecule_name', 'scalar_coupling_constant'], axis=1)
    Y = df_train_type['scalar_coupling_constant']
    X_test = df_test_type.drop(['id', 'molecule_name'], axis=1)
    print(moltype)
    #print(f'Hyper parameter optimization for {moltype} starts...')
    
    #mdl = lgb.LGBMRegressor(**params)
    
    #grid = GridSearchCV(mdl, gridParams,
    #                    verbose=50,
    #                    cv=2,
    #                    n_jobs=20)
    
    #grid.fit(X, Y)
    
    # Print the best parameters found
    #print(f' the best max_depth is {grid.best_params_["max_depth"]}')
    #print(f'the best score is {grid.best_score_}')
    
    #params['max_depth'] = grid.best_params_['max_depth']
          
    #print(f'Hyper parameter optimization for {moltype} is finished. Model tarining starts')
    
    #gc.collect()      
    
    scores = []
    for i, (train_index, test_index) in enumerate(kf.split(df_train_type)):

        # Create data for this fold
        Y_train, Y_valid = Y.iloc[train_index].copy(), Y.iloc[test_index].copy()
        X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
        
        print( f'\nFold {moltype}: {i}')

        fit_model = lgb.LGBMRegressor(**params)
        fit_model.fit(X_train, Y_train)
        pred = fit_model.predict(X_valid)
        # Save validation predictions for this fold
        print( "Group Log MAE: ", math.log(mean_absolute_error(Y_valid, pred)))
        scores.append(math.log(mean_absolute_error(Y_valid, pred)))

    print(f'The mean score of a model for {moltype} is: {np.mean(scores)}')
    dict_score[moltype] = np.mean(scores)
    
    submit_pred = fit_model.predict(X_test)
    submit_pred= pd.DataFrame(submit_pred)
    submit_pred.columns = ['scalar_coupling_constant']
    sub = pd.concat([df_test_type['id'], submit_pred], axis = 1)
    submission_list.append(sub)
    # Show variable importance of a model
    #show_varimp(fit_model = fit_model, mol_type = moltype)
    
print("Overall mean is ", np.array(list(dict_score.values())).mean())
    

220 features have been chosen for modeling of the 1JHC type
1JHC

Fold 1JHC: 0
Group Log MAE:  -0.5130448945212333

Fold 1JHC: 1
Group Log MAE:  -0.5066473836815847

Fold 1JHC: 2
Group Log MAE:  -0.5189577702001786

Fold 1JHC: 3
Group Log MAE:  -0.5122940006756367

Fold 1JHC: 4
Group Log MAE:  -0.5113266587988433
The mean score of a model for 1JHC is: -0.5124541415754953
213 features have been chosen for modeling of the 2JHH type
2JHH

Fold 2JHH: 0
Group Log MAE:  -1.826460103532638

Fold 2JHH: 1
Group Log MAE:  -1.831132368291021

Fold 2JHH: 2
Group Log MAE:  -1.8320014159605313

Fold 2JHH: 3
Group Log MAE:  -1.8298425790623363

Fold 2JHH: 4
Group Log MAE:  -1.8249640480187286
The mean score of a model for 2JHH is: -1.828880102973051
201 features have been chosen for modeling of the 1JHN type
1JHN

Fold 1JHN: 0
Group Log MAE:  -1.0509839503378602

Fold 1JHN: 1
Group Log MAE:  -1.0518837472783866

Fold 1JHN: 2
Group Log MAE:  -1.055226030649684

Fold 1JHN: 3
Group Log MAE:  -1.03356031

In [74]:
dict_score

{'1JHC': -0.5124541415754953,
 '2JHH': -1.828880102973051,
 '1JHN': -1.0429189011136328,
 '2JHN': -1.7638013584110876,
 '2JHC': -1.3882315521234436,
 '3JHH': -1.8149820033348436,
 '3JHC': -1.3027200749253003,
 '3JHN': -2.0609208686153115}

In [42]:
dict_score

{'1JHC': -0.4309591201915368,
 '2JHH': -1.8240186557508715,
 '1JHN': -1.039254862118396,
 '2JHN': -1.7525217826891395,
 '2JHC': -1.3097040318802349,
 '3JHH': -1.7849837970793296,
 '3JHC': -1.1945444313787867,
 '3JHN': -2.061359694288512}

In [None]:
final_submit = pd.concat(submission_list)

In [None]:
final_submit.head()

In [75]:
print("Overall mean is ", np.array(list(dict_score.values())).mean())

Overall mean is  -1.4643636253840207


In [76]:
final_submit.to_csv('submission_model_by_type.csv', index = False)

In [130]:
!kaggle competitions submit -c champs-scalar-coupling -f submission_model_by_type.csv -m "predicted sd dso pso"

100%|██████████████████████████████████████| 64.0M/64.0M [00:05<00:00, 12.3MB/s]
Successfully submitted to Predicting Molecular Properties