In [1]:
import pandas as pd
import numpy as np
import gc

gc.enable()

DATA = '~/Data/Molecular'

data = pd.concat([
    pd.read_pickle(f"{DATA}/basic.gz"),
    pd.read_pickle(f"{DATA}/angle_feature.gz"),
    pd.read_pickle(f"{DATA}/criskiev_distance_feature.gz"),
    pd.read_pickle(f"{DATA}/qm9.gz")
], axis = 1)
data = data.iloc[:4658147, :]
data['random_feature'] = np.require(np.random.rand(data.shape[0]), 'float32')
y = pd.read_csv(f"{DATA}/train.csv", dtype={
    'molecule_name': 'category',
    'atom_index_0': 'int8',
    'atom_index_1': 'int8',
    'type': 'category',
    'scalar_coupling_constant': 'float32'
}, usecols=['scalar_coupling_constant'])

In [2]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

folds = KFold(n_splits=5, random_state=49, shuffle=True)
drop_feature_loss = dict()
for bond in pd.unique(data['type']):
    X = data[data['type'] == bond]
    yt = y[data['type'] == bond]
    print(f"For bond type {bond}:")
    for i, feature_name in enumerate(data.columns[13:]):
        feature_index = [j for j,itm in enumerate(data.columns) if j > 13 and j != 13+i]
        loss_ = []
        for it, iv in folds.split(X, y = yt['scalar_coupling_constant']):
            lr = LinearRegression()
            lr.fit(X.iloc[it, feature_index], yt.iloc[it, 0])
            yv_ = lr.predict(X.iloc[iv, feature_index])
            loss_.append(mean_absolute_error(yt.iloc[iv, 0].values, yv_))
        drop_feature_loss[feature_name] = np.log(np.mean(loss_))
        print(f"{feature_name},{drop_feature_loss[feature_name]}")


For bond type 1JHC:
dist_x,1.151504397392273
dist_y,1.096468210220337
dist_z,1.1936919689178467
dist,1.1554853916168213
dist_x_molecule_mean,1.1426970958709717
dist_x_molecule_std,1.1297916173934937
dist_x_molecule_min,1.1107902526855469
dist_x_molecule_max,1.1262530088424683
dist_x_molecule_median,1.1004325151443481
dist_x_molecule_skew,1.1268310546875
dist_y_molecule_mean,1.2002891302108765
dist_y_molecule_std,1.1287105083465576
dist_y_molecule_min,1.1299512386322021
dist_y_molecule_max,1.1140023469924927
dist_y_molecule_median,1.1365597248077393
dist_y_molecule_skew,1.1075795888900757
dist_z_molecule_mean,1.1222314834594727
dist_z_molecule_std,1.1167654991149902
dist_z_molecule_min,1.1275501251220703
dist_z_molecule_max,1.1278759241104126
dist_z_molecule_median,1.1079131364822388
dist_z_molecule_skew,1.1227000951766968
dist_molecule_mean,1.1192080974578857
dist_molecule_std,1.122534990310669
dist_molecule_min,1.142053246498108
dist_molecule_max,1.1307653188705444
dist_molecule_media

dist_n_molecule_type_max_0,1.1079304218292236
dist_n_molecule_type_median_0,1.1163333654403687
dist_n_molecule_type_skew_0,1.1133430004119873
dist_f_molecule_type_mean_0,1.107174277305603
dist_f_molecule_type_std_0,1.1234931945800781
dist_f_molecule_type_min_0,1.11277174949646
dist_f_molecule_type_max_0,1.1237924098968506
dist_f_molecule_type_median_0,1.1298761367797852
dist_f_molecule_type_skew_0,1.1057480573654175
dist_n_molecule_type_mean_1,1.141994595527649
dist_n_molecule_type_std_1,1.118597149848938
dist_n_molecule_type_min_1,1.1273425817489624
dist_n_molecule_type_max_1,1.1304187774658203
dist_n_molecule_type_median_1,1.1124695539474487
dist_n_molecule_type_skew_1,1.1388742923736572
dist_f_molecule_type_mean_1,1.1290061473846436
dist_f_molecule_type_std_1,1.1349605321884155
dist_f_molecule_type_min_1,1.14154052734375
dist_f_molecule_type_max_1,1.156298041343689
dist_f_molecule_type_median_1,1.1352907419204712
dist_f_molecule_type_skew_1,1.1252777576446533
atom_2,1.14925515651702

neighbour_1JHC_1,-0.2795136868953705
neighbour_1JHN_1,-0.2960811257362366
neighbour_2JHC_1,-0.2648429870605469
neighbour_2JHH_1,-0.3154607117176056
neighbour_2JHN_1,-0.29324257373809814
neighbour_3JHC_1,-0.3444240689277649
neighbour_3JHH_1,-0.2896197438240051
neighbour_3JHN_1,-0.2707207500934601
neighbour_C_1,-0.29431289434432983
neighbour_H_1,-0.2962050437927246
neighbour_N_1,-0.16888625919818878
dist_0_n,-0.2758389413356781
dist_1_n,-0.15080644190311432
dist_0_f,-0.30723848938941956
dist_1_f,-0.3093654215335846
dist_0_c,-0.26125919818878174
dist_1_c,-0.3244335353374481
cos_a0n0_a1n1,-0.2932579815387726
cos_a0f0_a1f1,-0.32379165291786194
cos_a0c_a1c,-0.2447207272052765
cos_a0n0_a0a1,-0.325780987739563
cos_a1n1_a0a1,-0.308448851108551
cos_a0f0_a0a1,-0.23582002520561218
cos_a1f1_a0a1,-0.3284968137741089
cos_a0_n_a1_molecule_atom_mean_0,-0.12616226077079773
cos_a0_n_a1_molecule_atom_std_0,-0.3004925549030304
cos_a0_n_a1_molecule_atom_min_0,-0.2764778137207031
cos_a0_n_a1_molecule_atom_ma

neighbour_std_dist_x_molecule_atom_0,0.03379134088754654
neighbour_min_dist_x_molecule_atom_0,0.034968484193086624
neighbour_max_dist_x_molecule_atom_0,0.03371826931834221
neighbour_median_dist_x_molecule_atom_0,0.03436499834060669
neighbour_skew_dist_x_molecule_atom_0,0.03439275547862053
neighbour_mean_dist_y_molecule_atom_0,0.03421398252248764
neighbour_std_dist_y_molecule_atom_0,0.03459429740905762
neighbour_min_dist_y_molecule_atom_0,0.03388676047325134
neighbour_max_dist_y_molecule_atom_0,0.03520236909389496
neighbour_median_dist_y_molecule_atom_0,0.03535196930170059
neighbour_skew_dist_y_molecule_atom_0,0.03428379073739052
neighbour_mean_dist_z_molecule_atom_0,0.03411306068301201
neighbour_std_dist_z_molecule_atom_0,0.035262901335954666
neighbour_min_dist_z_molecule_atom_0,0.03393469750881195
neighbour_max_dist_z_molecule_atom_0,0.03479798883199692
neighbour_median_dist_z_molecule_atom_0,0.03371838480234146
neighbour_skew_dist_z_molecule_atom_0,0.034716010093688965
neighbour_1JHC

dist_x_molecule_std,-0.32057493925094604
dist_x_molecule_min,-0.32057639956474304
dist_x_molecule_max,-0.32002702355384827
dist_x_molecule_median,-0.3203795552253723
dist_x_molecule_skew,-0.2862702012062073
dist_y_molecule_mean,-0.32009515166282654
dist_y_molecule_std,-0.32033661007881165
dist_y_molecule_min,-0.32077181339263916
dist_y_molecule_max,-0.32035163044929504
dist_y_molecule_median,-0.3206256031990051
dist_y_molecule_skew,-0.32085806131362915
dist_z_molecule_mean,-0.2923183739185333
dist_z_molecule_std,-0.32056352496147156
dist_z_molecule_min,-0.3208169937133789
dist_z_molecule_max,-0.320548415184021
dist_z_molecule_median,-0.3205786347389221
dist_z_molecule_skew,-0.3204612731933594
dist_molecule_mean,-0.31963014602661133
dist_molecule_std,-0.3211100697517395
dist_molecule_min,-0.3195952773094177
dist_molecule_max,-0.28787440061569214
dist_molecule_median,-0.3203403949737549
dist_molecule_skew,-0.3203169107437134
dist_x_molecule_type_mean,-0.32097193598747253
dist_x_molecule_

dist_n_molecule_type_mean_0,-0.32066553831100464
dist_n_molecule_type_std_0,-0.25922656059265137
dist_n_molecule_type_min_0,-0.3195846676826477
dist_n_molecule_type_max_0,-0.28609055280685425
dist_n_molecule_type_median_0,-0.29161152243614197
dist_n_molecule_type_skew_0,-0.32019349932670593
dist_f_molecule_type_mean_0,-0.3206081986427307
dist_f_molecule_type_std_0,-0.2919981777667999
dist_f_molecule_type_min_0,-0.32079949975013733
dist_f_molecule_type_max_0,-0.28818026185035706
dist_f_molecule_type_median_0,-0.2924294173717499
dist_f_molecule_type_skew_0,-0.29059088230133057
dist_n_molecule_type_mean_1,-0.2959994673728943
dist_n_molecule_type_std_1,-0.31993138790130615
dist_n_molecule_type_min_1,-0.2892538607120514
dist_n_molecule_type_max_1,-0.3207363486289978
dist_n_molecule_type_median_1,-0.2902778387069702
dist_n_molecule_type_skew_1,-0.28741198778152466
dist_f_molecule_type_mean_1,-0.26201385259628296
dist_f_molecule_type_std_1,-0.2905118763446808
dist_f_molecule_type_min_1,-0.253

neighbour_skew_dist_z_molecule_atom_1,0.5526522397994995
neighbour_1JHC_1,0.5528647899627686
neighbour_1JHN_1,0.5601580739021301
neighbour_2JHC_1,0.5497658252716064
neighbour_2JHH_1,0.5600061416625977
neighbour_2JHN_1,0.5600061416625977
neighbour_3JHC_1,0.557036280632019
neighbour_3JHH_1,0.5608689785003662
neighbour_3JHN_1,0.5608689785003662
neighbour_C_1,0.5608689785003662
neighbour_H_1,0.5608687996864319
neighbour_N_1,0.5608687400817871
dist_0_n,0.5652281045913696
dist_1_n,0.5744690299034119
dist_0_f,0.5401108264923096
dist_1_f,0.5597438812255859
dist_0_c,0.549900472164154
dist_1_c,0.5599623322486877
cos_a0n0_a1n1,0.5680156946182251
cos_a0f0_a1f1,0.5499486923217773
cos_a0c_a1c,0.5639840960502625
cos_a0n0_a0a1,0.5422800779342651
cos_a1n1_a0a1,0.5529385209083557
cos_a0f0_a0a1,0.5466441512107849
cos_a1f1_a0a1,0.5711655616760254
cos_a0_n_a1_molecule_atom_mean_0,0.5528334379196167
cos_a0_n_a1_molecule_atom_std_0,0.5731613039970398
cos_a0_n_a1_molecule_atom_min_0,0.576377272605896
cos_a0_n

neighbour_median_dist_x_molecule_atom_0,0.2489905208349228
neighbour_skew_dist_x_molecule_atom_0,0.2483595758676529
neighbour_mean_dist_y_molecule_atom_0,0.24624016880989075
neighbour_std_dist_y_molecule_atom_0,0.24790611863136292
neighbour_min_dist_y_molecule_atom_0,0.24779587984085083
neighbour_max_dist_y_molecule_atom_0,0.246006041765213
neighbour_median_dist_y_molecule_atom_0,0.24615460634231567
neighbour_skew_dist_y_molecule_atom_0,0.25116652250289917
neighbour_mean_dist_z_molecule_atom_0,0.2456182986497879
neighbour_std_dist_z_molecule_atom_0,0.251769095659256
neighbour_min_dist_z_molecule_atom_0,0.24862167239189148
neighbour_max_dist_z_molecule_atom_0,0.24912990629673004
neighbour_median_dist_z_molecule_atom_0,0.24803952872753143
neighbour_skew_dist_z_molecule_atom_0,0.24865198135375977
neighbour_1JHC_0,0.24715691804885864
neighbour_1JHN_0,0.24879616498947144
neighbour_2JHC_0,0.24789142608642578
neighbour_2JHH_0,0.24925357103347778
neighbour_2JHN_0,0.2503568232059479
neighbour_3

dist_y_molecule_min,0.5797197818756104
dist_y_molecule_max,0.5797381401062012
dist_y_molecule_median,0.579852283000946
dist_y_molecule_skew,0.580200731754303
dist_z_molecule_mean,0.5795157551765442
dist_z_molecule_std,0.5801951885223389
dist_z_molecule_min,0.5797117352485657
dist_z_molecule_max,0.5802859663963318
dist_z_molecule_median,0.5798941254615784
dist_z_molecule_skew,0.5803318619728088
dist_molecule_mean,0.5789856910705566
dist_molecule_std,0.578941822052002
dist_molecule_min,0.5798342823982239
dist_molecule_max,0.5798183083534241
dist_molecule_median,0.5797566175460815
dist_molecule_skew,0.5800346732139587
dist_x_molecule_type_mean,0.5795612931251526
dist_x_molecule_type_std,0.5797213315963745
dist_x_molecule_type_min,0.5797285437583923
dist_x_molecule_type_max,0.5798467397689819
dist_x_molecule_type_median,0.5800721645355225
dist_x_molecule_type_skew,0.5800210237503052
dist_y_molecule_type_mean,0.578660249710083
dist_y_molecule_type_std,0.5805825591087341
dist_y_molecule_type

dist_n_molecule_type_mean_1,0.58024662733078
dist_n_molecule_type_std_1,0.5800355672836304
dist_n_molecule_type_min_1,0.5801516175270081
dist_n_molecule_type_max_1,0.5796818137168884
dist_n_molecule_type_median_1,0.5798095464706421
dist_n_molecule_type_skew_1,0.5796120762825012
dist_f_molecule_type_mean_1,0.5799424648284912
dist_f_molecule_type_std_1,0.5798264145851135
dist_f_molecule_type_min_1,0.5798132419586182
dist_f_molecule_type_max_1,0.5797844529151917
dist_f_molecule_type_median_1,0.5802820920944214
dist_f_molecule_type_skew_1,0.5798501372337341
atom_2,0.5801888108253479
atom_3,0.5820159912109375
atom_4,0.5845019221305847
atom_5,0.5813266634941101
atom_6,0.5811700820922852
atom_7,0.5796552896499634
atom_8,0.5811318755149841
atom_9,0.581213116645813
d_1_0,0.5793377757072449
d_2_0,0.6201657056808472
d_2_1,0.623640775680542
d_3_0,0.5915808081626892
d_3_1,0.5917096138000488
d_3_2,0.5805969834327698
d_4_0,0.5841837525367737
d_4_1,0.5856744647026062
d_4_2,0.5805000066757202
d_4_3,0.5

dist_0_n,-0.3651091456413269
dist_1_n,-0.3651978671550751
dist_0_f,-0.36518532037734985
dist_1_f,-0.36516693234443665
dist_0_c,-0.3646996319293976
dist_1_c,-0.3625100553035736
cos_a0n0_a1n1,-0.36487269401550293
cos_a0f0_a1f1,-0.36495932936668396
cos_a0c_a1c,-0.3636888265609741
cos_a0n0_a0a1,-0.36325323581695557
cos_a1n1_a0a1,-0.35351598262786865
cos_a0f0_a0a1,-0.3647586703300476
cos_a1f1_a0a1,-0.3651666045188904
cos_a0_n_a1_molecule_atom_mean_0,-0.36444157361984253
cos_a0_n_a1_molecule_atom_std_0,-0.3641902506351471
cos_a0_n_a1_molecule_atom_min_0,-0.36419743299484253
cos_a0_n_a1_molecule_atom_max_0,-0.3651825785636902
cos_a0_n_a1_molecule_atom_median_0,-0.3655088543891907
cos_a0_n_a1_molecule_atom_skew_0,-0.3642461895942688
cos_a0_f_a1_molecule_atom_mean_0,-0.3649424910545349
cos_a0_f_a1_molecule_atom_std_0,-0.36119791865348816
cos_a0_f_a1_molecule_atom_min_0,-0.3646569848060608
cos_a0_f_a1_molecule_atom_max_0,-0.3651849627494812
cos_a0_f_a1_molecule_atom_median_0,-0.365252822637558
c