In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from rdkit.Chem import MolFromMolBlock, Descriptors, Descriptors3D
import feather

In [2]:
# Read the three input tables (coupling pairs for train/test and the
# per-atom coordinates) from the shared input directory in one pass.
train, test, structures = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/input/train.csv",
        "../data/input/test.csv",
        "../data/input/structures.csv",
    )
)

In [3]:
# Preview the first five training pairs.
train.head(n=5)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [4]:
# Preview the first five rows of the per-atom coordinate table.
structures.head(n=5)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [6]:
# Only the test set is processed here to keep the calculation fast.

def map_atom_info(df, atom_idx, structures_df=None):
    """Attach the element and coordinates of one atom of each pair to ``df``.

    Each row of ``df`` is matched against the structures table on
    ``molecule_name`` and ``atom_index_{atom_idx}``; the matched ``atom``,
    ``x``, ``y`` and ``z`` values are added as ``atom_{atom_idx}``,
    ``x_{atom_idx}``, ``y_{atom_idx}`` and ``z_{atom_idx}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table containing ``molecule_name`` and
        ``atom_index_{atom_idx}`` columns. It is not modified.
    atom_idx : int
        Which atom of the pair to look up (the notebook uses 0 and 1).
    structures_df : pandas.DataFrame, optional
        Table with ``molecule_name``, ``atom_index``, ``atom``, ``x``, ``y``
        and ``z`` columns. Defaults to the notebook-level ``structures``
        frame, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four per-atom columns appended. Rows without a
        match in the structures table get missing values (left join).
    """
    if structures_df is None:
        # Fall back to the table loaded at the top of the notebook.
        structures_df = structures

    renamed = {'atom': f'atom_{atom_idx}',
               'x': f'x_{atom_idx}',
               'y': f'y_{atom_idx}',
               'z': f'z_{atom_idx}'}

    # Re-running the calling cell passes in a frame that already carries the
    # target columns; without this the rename below would create duplicate
    # column names. Dropping them first makes the call idempotent.
    stale = [col for col in renamed.values() if col in df.columns]
    if stale:
        df = df.drop(columns=stale)

    merged = pd.merge(df, structures_df, how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])
    # The right-hand join key duplicates atom_index_{atom_idx}; discard it.
    merged = merged.drop('atom_index', axis=1)
    return merged.rename(columns=renamed)
    
# Enrich every test pair with the element and coordinates of both atoms:
# first the atom at atom_index_0, then the one at atom_index_1.
for i in range(2):
    test = map_atom_info(test, i)

In [8]:
# Inspect the first ten enriched test pairs.
test.head(n=10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.661639,0.0,1.0,C,0.599539,0.0,1.0
1,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.661639,0.0,1.0,C,-0.599539,0.0,1.0
2,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.661639,0.0,1.0,H,1.661639,0.0,1.0
3,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.661639,0.0,1.0,C,0.599539,0.0,1.0
4,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.661639,0.0,1.0,C,-0.599539,0.0,1.0
5,4658152,dsgdb9nsd_000015,3,0,1JHC,H,1.005284,1.810158,0.004656,C,-0.014821,1.392412,0.005671
6,4658153,dsgdb9nsd_000015,3,2,3JHC,H,1.005284,1.810158,0.004656,C,0.637949,-0.553297,-1.113582
7,4658154,dsgdb9nsd_000015,3,4,2JHH,H,1.005284,1.810158,0.004656,H,-0.546896,1.793435,-0.872511
8,4658155,dsgdb9nsd_000015,3,5,2JHH,H,1.005284,1.810158,0.004656,H,-0.530029,1.72292,0.911017
9,4658156,dsgdb9nsd_000015,4,0,1JHC,H,-0.546896,1.793435,-0.872511,C,-0.014821,1.392412,0.005671


In [17]:
# The first character of the coupling type (e.g. '2' in '2JHC') is kept as a
# string in `type_order`. The vectorised .str accessor replaces the per-row
# Python lambda and propagates missing values instead of raising.
test['type_order'] = test['type'].str[0]

In [10]:
#when order == 1, the two atoms are directly bonded, so no angle can be defined

In [11]:
#when order == 2, the bond angle (three atoms, two bonds) can be defined
#when order == 3, the dihedral (torsion) angle (four atoms, three bonds) can be defined

In [18]:
# Split the test pairs by the leading character of their coupling type:
# rows whose type_order is '2' and rows whose type_order is '3'.
test_2 = test.loc[test['type_order'].eq('2')]
test_3 = test.loc[test['type_order'].eq('3')]

In [19]:
# Preview the first five pairs whose type_order is '2'.
test_2.head(n=5)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,type_order
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.661639,0.0,1.0,C,0.599539,0.0,1.0,2
4,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.661639,0.0,1.0,C,-0.599539,0.0,1.0,2
7,4658154,dsgdb9nsd_000015,3,4,2JHH,H,1.005284,1.810158,0.004656,H,-0.546896,1.793435,-0.872511,2
8,4658155,dsgdb9nsd_000015,3,5,2JHH,H,1.005284,1.810158,0.004656,H,-0.530029,1.72292,0.911017,2
11,4658158,dsgdb9nsd_000015,4,5,2JHH,H,-0.546896,1.793435,-0.872511,H,-0.530029,1.72292,0.911017,2


In [None]:
# Distinct molecule names in the test set, in order of first appearance.
molecules = pd.unique(test['molecule_name'])