In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import pandas as pd

#
import networkx as nx
from scipy.spatial.distance import pdist

In [2]:
# List the CSV files available in the data directory.
! ls ../data/*.csv

../data/1_average_coupling.csv		   ../data/structures.csv
../data/dipole_moments.csv		   ../data/test_4_more_features.csv
../data/magnetic_shielding_tensors.csv	   ../data/test.csv
../data/mulliken_charges.csv		   ../data/test_tmp.csv
../data/potential_energy.csv		   ../data/train_4_more_features.csv
../data/sample_submission.csv		   ../data/train.csv
../data/scalar_coupling_contributions.csv  ../data/train_tmp.csv


In [3]:
from pathlib import Path
# Directory holding the CSV files; every read_csv call in this notebook is relative to it.
PATH = Path('../data')

In [4]:
# Load the auxiliary tables in one pass; the file order below matches the
# order of the names on the left-hand side.
dipole, ms_tensors, charges, energy, coupling, structures = (
    pd.read_csv(PATH / csv_name)
    for csv_name in (
        'dipole_moments.csv',
        'magnetic_shielding_tensors.csv',
        'mulliken_charges.csv',
        'potential_energy.csv',
        'scalar_coupling_contributions.csv',
        'structures.csv',
    )
)

In [5]:
# Preview the first rows of the dipole moments table.
dipole.head()

Unnamed: 0,molecule_name,X,Y,Z
0,dsgdb9nsd_000001,0.0,0.0,0.0
1,dsgdb9nsd_000002,-0.0002,0.0,1.6256
2,dsgdb9nsd_000003,0.0,0.0,-1.8511
3,dsgdb9nsd_000005,0.0,0.0,-2.8937
4,dsgdb9nsd_000007,0.0,0.0,0.0


In [6]:
# Preview the first rows of the magnetic shielding tensors table.
ms_tensors.head()

Unnamed: 0,molecule_name,atom_index,XX,YX,ZX,XY,YY,ZY,XZ,YZ,ZZ
0,dsgdb9nsd_000001,0,195.315,0.0,-0.0001,0.0,195.317,0.0007,-0.0001,0.0007,195.317
1,dsgdb9nsd_000001,1,31.341,-1.2317,4.0544,-1.2317,28.9546,-1.7173,4.0546,-1.7173,34.0861
2,dsgdb9nsd_000001,2,31.5814,1.2173,-4.1474,1.2173,28.9036,-1.6036,-4.1476,-1.6036,33.8967
3,dsgdb9nsd_000001,3,31.5172,4.1086,1.2723,4.1088,33.9068,1.695,1.2724,1.6951,28.9579
4,dsgdb9nsd_000001,4,31.4029,-4.0942,-1.1793,-4.0944,34.0776,1.6259,-1.1795,1.626,28.9013


In [7]:
# Preview the first rows of the Mulliken charges table.
charges.head()

Unnamed: 0,molecule_name,atom_index,mulliken_charge
0,dsgdb9nsd_000001,0,-0.535689
1,dsgdb9nsd_000001,1,0.133921
2,dsgdb9nsd_000001,2,0.133922
3,dsgdb9nsd_000001,3,0.133923
4,dsgdb9nsd_000001,4,0.133923


In [8]:
# Show the last rows of the scalar coupling contributions table.
coupling.tail()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,fc,sd,pso,dso
4658142,dsgdb9nsd_133884,17,4,2JHC,3.58644,0.019741,0.150477,-0.213205
4658143,dsgdb9nsd_133884,17,5,3JHC,0.674583,-0.007276,0.305078,-0.403388
4658144,dsgdb9nsd_133884,17,6,3JHC,1.33747,-0.028423,0.31224,-0.44792
4658145,dsgdb9nsd_133884,17,7,2JHC,4.80062,0.139202,-0.053102,-0.12471
4658146,dsgdb9nsd_133884,17,8,1JHC,115.975,0.378277,0.450213,1.1306


In [9]:
# Distinct values of the `type` column in the contributions table.
coupling.type.unique()

array(['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN'],
      dtype=object)

In [10]:
# Preview the first rows of the structures table.
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [11]:
# Load the labelled training pairs and the unlabelled test pairs.
train, test = (
    pd.read_csv(PATH / csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [12]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [13]:
# Show the last rows of the training set.
train.tail()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
4658142,4658142,dsgdb9nsd_133884,17,4,2JHC,3.54345
4658143,4658143,dsgdb9nsd_133884,17,5,3JHC,0.568997
4658144,4658144,dsgdb9nsd_133884,17,6,3JHC,1.17337
4658145,4658145,dsgdb9nsd_133884,17,7,2JHC,4.76201
4658146,4658146,dsgdb9nsd_133884,17,8,1JHC,117.934


In [14]:
# Preview the first rows of the test set (no scalar_coupling_constant column).
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4658147,dsgdb9nsd_000004,2,0,2JHC
1,4658148,dsgdb9nsd_000004,2,1,1JHC
2,4658149,dsgdb9nsd_000004,2,3,3JHH
3,4658150,dsgdb9nsd_000004,3,0,1JHC
4,4658151,dsgdb9nsd_000004,3,1,2JHC


In [15]:
# Row counts of the test and train sets, in that order.
len(test), len(train)

(2505542, 4658147)

In [16]:
# Number of rows in the structures table.
len(structures)

2358657

Let's start with a deliberately naive baseline and see how it scores: predict, for every atom pair, the mean scalar coupling constant of its coupling type.

In [21]:
# Distinct coupling types, taken from the contributions table.
coupling_types = coupling['type'].unique()

In [22]:
# Mean scalar coupling constant of each coupling type, computed on the training set.
avg_coupling = train.groupby('type')['scalar_coupling_constant'].mean()

In [23]:
# Display the per-type mean coupling constants.
avg_coupling

type
1JHC    94.976153
1JHN    47.479884
2JHC    -0.270624
2JHH   -10.286605
2JHN     3.124754
3JHC     3.688470
3JHH     4.771023
3JHN     0.990730
Name: scalar_coupling_constant, dtype: float64

In [24]:
# Evaluate the per-type mean baseline on the training set

In [25]:
# Work on a copy so that `train` keeps the true scalar_coupling_constant values.
merged = train.copy()

In [26]:
# Replace each true value with the mean of its coupling type (the baseline
# prediction). Series.map does the lookup vectorised instead of looping over
# every row in Python; every type is present because avg_coupling was built
# from the same training rows.
merged['scalar_coupling_constant'] = merged['type'].map(avg_coupling)

In [27]:
# Absolute error of the baseline prediction against the true training value.
merged['err'] = (merged['scalar_coupling_constant'] - train['scalar_coupling_constant']).abs()

In [28]:
# Per-type averages of the numeric columns (including the absolute error).
# numeric_only=True skips string columns such as molecule_name; without it,
# pandas >= 2.0 raises a TypeError when asked to average them.
merged.groupby('type').mean(numeric_only=True)

Unnamed: 0_level_0,id,atom_index_0,atom_index_1,scalar_coupling_constant,err
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1JHC,2346858.0,13.38873,3.355607,94.976153,12.784318
1JHN,2198234.0,12.92316,3.895948,47.479884,9.731954
2JHC,2298278.0,13.711514,3.513953,-0.270624,2.715677
2JHH,2438355.0,12.199814,13.396058,-10.286605,2.672629
2JHN,2252639.0,13.286626,4.117029,3.124754,2.964396
3JHC,2306329.0,13.68749,3.86834,3.68847,2.488767
3JHH,2392270.0,12.710813,14.721878,4.771023,3.072281
3JHN,2287087.0,12.874573,4.554409,0.99073,0.966748


In [29]:
# https://www.kaggle.com/uberkinder/efficient-metric
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """Mean over groups of the log of the per-group mean absolute error.

    Parameters
    ----------
    y_true, y_pred : pandas.Series or numpy.ndarray
        True and predicted values. Series operands are aligned on their
        index by the subtraction; arrays are paired by position.
    types : pandas.Series or numpy.ndarray
        Group label of every row; passed straight to ``Series.groupby``.
    floor : float, default 1e-9
        Lower bound applied to each group's MAE before taking the log, so a
        perfectly predicted group does not produce ``log(0)``.

    Returns
    -------
    float
        Average of ``log(max(MAE_group, floor))`` over all groups.
    """
    # Wrapping in pd.Series leaves a Series difference untouched and lets
    # plain ndarrays (which have no .abs() method) be used as well.
    abs_err = pd.Series(y_true - y_pred).abs()
    maes = abs_err.groupby(types).mean()
    return np.log(maes.clip(lower=floor)).mean()

In [30]:
# Score the baseline on the training set: true values first, then the
# per-type mean predictions held in `merged`, matching the
# (y_true, y_pred, types) signature. The metric is symmetric in its first two
# arguments, but passing them in signature order keeps the call readable.
group_mean_log_mae(train['scalar_coupling_constant'], merged['scalar_coupling_constant'], train['type'])

1.2366001783502671

In [58]:
# submission

In [None]:
# Predict the train-set mean of the matching coupling type for every test row
# (vectorised lookup instead of a Python-level loop over all rows).
test['scalar_coupling_constant'] = test['type'].map(avg_coupling)
# .map() yields NaN for a type missing from avg_coupling; fail loudly instead
# of writing an incomplete submission.
assert test['scalar_coupling_constant'].notna().all(), 'test contains a coupling type with no train average'
test[['id', 'scalar_coupling_constant']].to_csv('submission.csv', index=False) #float_format='%.9f'

In [108]:
from IPython.display import FileLink
# Link to the file written by the submission cell (to_csv('submission.csv', ...)).
FileLink('submission.csv')