In [1]:
import pickle

import numpy as np
import pandas as pd
import torch

In [4]:
# Read the five input CSV tables from ../Data, in the order they are unpacked below.
csv_paths = (
    '../Data/train.csv',
    '../Data/test.csv',
    '../Data/structures.csv',
    '../Data/train_bonds.csv',
    '../Data/test_bonds.csv',
)
train, test, structures, train_bonds, test_bonds = map(pd.read_csv, csv_paths)

In [14]:
# Remove the 'Unnamed: 0' column from both bond tables (presumably an index
# that was saved into the CSV files -- confirm against how they were written).
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.x no longer accepts; `errors='ignore'` keeps the cell re-runnable
# once the column has already been dropped.
test_bonds = test_bonds.drop(columns='Unnamed: 0', errors='ignore')
train_bonds = train_bonds.drop(columns='Unnamed: 0', errors='ignore')

In [12]:
# Preview the first ten rows of the training coupling table.
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095


In [7]:
# Preview the per-atom structure table (element symbol and x/y/z coordinates).
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [11]:
# Preview the bond table for the training molecules.
train_bonds.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,nbond,L2dist,error,bond_type
0,dsgdb9nsd_000001,0,1,1.0,1.091953,0,1.0CH
1,dsgdb9nsd_000001,0,2,1.0,1.091952,0,1.0CH
2,dsgdb9nsd_000001,0,3,1.0,1.091946,0,1.0CH
3,dsgdb9nsd_000001,0,4,1.0,1.091948,0,1.0CH
4,dsgdb9nsd_000002,0,1,1.0,1.01719,0,1.0HN


In [15]:
# Preview the bond table for the test molecules.
test_bonds.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,nbond,L2dist,error,bond_type
0,dsgdb9nsd_000004,1,2,1.0,1.062099,0,1.0CH
1,dsgdb9nsd_000004,0,3,1.0,1.062099,0,1.0CH
2,dsgdb9nsd_000004,0,1,3.0,1.199079,0,3.0CC
3,dsgdb9nsd_000015,0,3,1.0,1.102328,0,1.0CH
4,dsgdb9nsd_000015,0,4,1.0,1.102327,0,1.0CH


### node information

In [24]:
# One-hot encode the element symbol of every atom.
atom_types = ['C', 'F', 'H', 'N', 'O']
atom_onehot = pd.get_dummies(structures.atom)
# The labels are assigned to the dummy columns positionally, so fail loudly if
# the elements present in the data differ from the hard-coded list.
assert list(atom_onehot.columns) == atom_types, \
    'unexpected atom types: {}'.format(list(atom_onehot.columns))
structures[atom_types] = atom_onehot

# Sort so that the rows of each molecule's feature matrix follow atom_index order.
structures = structures.sort_values(by=['molecule_name', 'atom_index'])

# Map molecule name -> float32 matrix with one row per atom: x, y, z, then the
# element one-hot. Group by the bare column name rather than a one-element
# list: pandas 2.x yields 1-tuple keys for a list grouper, which would break
# the later lookups by plain molecule name.
structures_gb = {
    name: atoms[['x', 'y', 'z'] + atom_types].values.astype(np.float32)
    for name, atoms in structures.groupby('molecule_name')
}

### bond information

In [None]:
# Sanity check: the train and test bond tables contain exactly the same set of bond types.
assert set(train_bonds.bond_type) == set(test_bonds.bond_type)

In [48]:
# Stack the train and test bond tables into one frame with a fresh 0..n-1 index.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
bonds = pd.concat([train_bonds, test_bonds], ignore_index=True)

In [49]:
# Release the per-split bond frames; the cells shown after this point only use the combined `bonds` frame.
del train_bonds,test_bonds

In [52]:
# One-hot encode the bond type (bond order followed by the two element symbols).
bond_type_cols = [
    '1.0CC', '1.0CF', '1.0CH', '1.0CN', '1.0CO', '1.0HN',
    '1.0HO', '1.0NN', '1.0NO', '1.5CO', '2.0CC', '2.0CN',
    '2.0CO', '2.0NN', '2.0NO', '3.0CC', '3.0CN',
]
bond_type_onehot = pd.get_dummies(bonds.bond_type)
# The labels are assigned to the dummy columns positionally, so fail loudly if
# the bond types present in the data differ from the hard-coded list instead
# of silently mislabelling the columns.
assert list(bond_type_onehot.columns) == bond_type_cols, \
    'unexpected bond types: {}'.format(list(bond_type_onehot.columns))
bonds[bond_type_cols] = bond_type_onehot

In [55]:
# Preview the combined bond table, now including the one-hot bond-type columns.
bonds.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,nbond,L2dist,error,bond_type,1.0CC,1.0CF,1.0CH,...,1.0NN,1.0NO,1.5CO,2.0CC,2.0CN,2.0CO,2.0NN,2.0NO,3.0CC,3.0CN
0,dsgdb9nsd_000001,0,1,1.0,1.091953,0,1.0CH,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,dsgdb9nsd_000001,0,2,1.0,1.091952,0,1.0CH,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,dsgdb9nsd_000001,0,3,1.0,1.091946,0,1.0CH,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,dsgdb9nsd_000001,0,4,1.0,1.091948,0,1.0CH,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,dsgdb9nsd_000002,0,1,1.0,1.01719,0,1.0HN,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# Group the bonds per molecule. The grouper is the bare column name rather than
# a one-element list: pandas 2.x yields 1-tuple keys when iterating over a
# list grouper, which would break lookups by plain molecule name.
bonds_gb = bonds.groupby('molecule_name')

In [87]:
# Per-molecule bond graph: every bond is stored in both directions.
#   bonds_edge_index[name] -> array of shape (2, 2 * n_bonds) with atom index pairs
#   bonds_edge_attr[name]  -> float32 array of shape (2 * n_bonds, n_features)
bond_feature_cols = [
    'L2dist', 'error',
    '1.0CC', '1.0CF', '1.0CH', '1.0CN', '1.0CO', '1.0HN',
    '1.0HO', '1.0NN', '1.0NO', '1.5CO', '2.0CC', '2.0CN',
    '2.0CO', '2.0NN', '2.0NO', '3.0CC', '3.0CN',
]
bonds_edge_index = {}
bonds_edge_attr = {}
for mol_name, mol_bonds in bonds_gb:
    forward = mol_bonds[['atom_index_0', 'atom_index_1']].values
    backward = mol_bonds[['atom_index_1', 'atom_index_0']].values
    bonds_edge_index[mol_name] = np.concatenate([forward, backward]).T
    # Repeat the feature rows once so they line up with the reversed edges.
    bond_features = mol_bonds[bond_feature_cols].values.astype(np.float32)
    bonds_edge_attr[mol_name] = np.tile(bond_features, (2, 1))

### coupling information

In [92]:
# Sanity check: train and test contain exactly the same set of coupling types.
assert set(train.type) == set(test.type)

In [93]:
# Give the test table a `scalar_coupling_constant` column filled with NaN; the
# NaN values are what later distinguishes molecules without usable targets.
test['scalar_coupling_constant'] = np.nan

In [95]:
# Stack train and test couplings into one frame with a fresh 0..n-1 index.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
coupling = pd.concat([train, test], ignore_index=True)

In [98]:
# Preview the combined train + test coupling table.
coupling.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [100]:
# One-hot encode the coupling type.
coupling_type_cols = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
coupling_type_onehot = pd.get_dummies(coupling.type)
# The labels are assigned to the dummy columns positionally, so fail loudly if
# the coupling types present in the data differ from the hard-coded list.
assert list(coupling_type_onehot.columns) == coupling_type_cols, \
    'unexpected coupling types: {}'.format(list(coupling_type_onehot.columns))
coupling[coupling_type_cols] = coupling_type_onehot

In [102]:
# Group the couplings per molecule. The grouper is the bare column name rather
# than a one-element list: pandas 2.x yields 1-tuple keys when iterating over
# a list grouper, which would break lookups by plain molecule name.
# NOTE: this rebinds `coupling` from a DataFrame to a GroupBy object, so the
# cell cannot be re-run without first rebuilding the frame.
coupling = coupling.groupby('molecule_name')

In [116]:
# Per-molecule coupling graph: every atom pair is stored in both directions.
#   coupling_edge_index[name] -> array of shape (2, 2 * n_pairs)
#   coupling_edge_attr[name]  -> float32 one-hot coupling type, shape (2 * n_pairs, 8)
#   coupling_y[name]          -> float32 targets, one per pair (first direction only);
#                                only filled for molecules without any NaN target
coupling_type_names = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
coupling_edge_index, coupling_edge_attr, coupling_y = {}, {}, {}
for mol_name, mol_pairs in coupling:
    pairs = mol_pairs[['atom_index_0', 'atom_index_1']].values
    # Reversing the two columns gives the (atom_index_1, atom_index_0) edges.
    coupling_edge_index[mol_name] = np.concatenate([pairs, pairs[:, ::-1]]).T

    type_onehot = mol_pairs[coupling_type_names].values.astype(np.float32)
    coupling_edge_attr[mol_name] = np.tile(type_onehot, (2, 1))

    targets = mol_pairs.scalar_coupling_constant.values
    if not np.isnan(targets).any():
        coupling_y[mol_name] = targets.astype(np.float32)

In [127]:
# Sorted arrays of the distinct molecule names in each split.
train_mol, test_mol = (np.unique(frame['molecule_name']) for frame in (train, test))

In [131]:
# Shuffle the training molecules before splitting off a validation set.
# A dedicated, seeded generator makes the split reproducible under
# Restart & Run All (the unseeded global RNG gave a different split every run).
train_mol = np.random.RandomState(42).permutation(train_mol)

In [133]:
# Hold out every molecule after the first `n_train_mol` shuffled names for validation.
n_train_mol = 70000
# Guard against running this cell twice: `train_mol` is rebound to the training
# part below, so a second run would silently produce an empty validation set.
if len(train_mol) <= n_train_mol:
    raise ValueError(
        'expected more than {} molecules to split, got {}; '
        'was this cell already executed?'.format(n_train_mol, len(train_mol))
    )
train_mol2 = train_mol[:n_train_mol]
val_mol = train_mol[n_train_mol:]
train_mol = train_mol2

In [138]:
# One sample dict per training molecule: atom features (x), bond graph
# (edge_index / edge_attr), targets (y) and coupling graph
# (edge_index2 / edge_attr2).
train_data = [
    dict(
        x=structures_gb[mol_name],
        edge_index=bonds_edge_index[mol_name],
        edge_attr=bonds_edge_attr[mol_name],
        y=coupling_y[mol_name],
        edge_index2=coupling_edge_index[mol_name],
        edge_attr2=coupling_edge_attr[mol_name],
    )
    for mol_name in train_mol
]

In [142]:
# One sample dict per validation molecule, with the same keys as the training samples.
val_data = [
    dict(
        x=structures_gb[mol_name],
        edge_index=bonds_edge_index[mol_name],
        edge_attr=bonds_edge_attr[mol_name],
        y=coupling_y[mol_name],
        edge_index2=coupling_edge_index[mol_name],
        edge_attr2=coupling_edge_attr[mol_name],
    )
    for mol_name in val_mol
]

In [143]:
# One sample dict per test molecule: no target `y`; the molecule name is kept
# under 'mol' instead.
test_data = [
    dict(
        x=structures_gb[mol_name],
        edge_index=bonds_edge_index[mol_name],
        edge_attr=bonds_edge_attr[mol_name],
        mol=mol_name,
        edge_index2=coupling_edge_index[mol_name],
        edge_attr2=coupling_edge_attr[mol_name],
    )
    for mol_name in test_mol
]

In [None]:
# Convert every numpy array in the samples to a torch tensor.
# Requires `import torch` in the notebook's import cell; the original notebook
# never imported it, so this cell raised NameError on a fresh kernel.
train_data = [{key: torch.tensor(value) for key, value in sample.items()}
              for sample in train_data]
val_data = [{key: torch.tensor(value) for key, value in sample.items()}
            for sample in val_data]
# 'mol' holds the molecule name (a string) and is passed through unchanged.
test_data = [{key: (value if key == 'mol' else torch.tensor(value))
              for key, value in sample.items()}
             for sample in test_data]

In [146]:
# Persist the three splits as pickle files under ../Data.
for _path, _split in (
    ('../Data/train_data.pickle', train_data),
    ('../Data/val_data.pickle', val_data),
    ('../Data/test_data.pickle', test_data),
):
    with open(_path, 'wb') as handle:
        pickle.dump(_split, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### CLEAN-UP: add only the first half of the coupling edges (one direction per atom pair) as `edge_index3` / `edge_attr3`, used to calculate the loss

In [2]:
# Reload the three pickled splits written by this notebook.
_loaded = []
for _path in (
    '../Data/train_data.pickle',
    '../Data/val_data.pickle',
    '../Data/test_data.pickle',
):
    with open(_path, 'rb') as handle:
        _loaded.append(pickle.load(handle))
train_data, val_data, test_data = _loaded

In [26]:
def clean_up(listOfDict):
    """Add the first half of the coupling edges to every sample, in place.

    For each dict, `edge_index3` receives the first half of the columns of
    `edge_index2`, and `edge_attr3` the matching first rows of `edge_attr2`
    (half = number of columns of `edge_index2` // 2). The existing keys are
    left untouched and nothing is returned.
    """
    for sample in listOfDict:
        edge_index = sample['edge_index2']
        half = edge_index.shape[1] // 2
        sample['edge_index3'] = edge_index[:, :half]
        sample['edge_attr3'] = sample['edge_attr2'][:half]

In [27]:
# Add the `edge_index3` / `edge_attr3` entries to every split, in place.
for _split in (train_data, val_data, test_data):
    clean_up(_split)

In [31]:
# Overwrite the three pickle files with the updated splits.
_outputs = {
    '../Data/train_data.pickle': train_data,
    '../Data/val_data.pickle': val_data,
    '../Data/test_data.pickle': test_data,
}
for _path, _split in _outputs.items():
    with open(_path, 'wb') as handle:
        pickle.dump(_split, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### data without coupling edge information

In [18]:
# Reload the three pickled splits written by this notebook.
_splits = []
for _path in (
    '../Data/train_data.pickle',
    '../Data/val_data.pickle',
    '../Data/test_data.pickle',
):
    with open(_path, 'rb') as handle:
        _splits.append(pickle.load(handle))
train_data, val_data, test_data = _splits

In [19]:
def deleteKey(listOfDict):
    """Remove the `edge_index2` and `edge_attr2` entries from every dict, in place.

    Raises KeyError if a dict lacks one of the two keys. Returns nothing.
    """
    for sample in listOfDict:
        for key in ('edge_index2', 'edge_attr2'):
            del sample[key]

In [None]:
def deleteKeyTest(listOfDict):
    """Remove the `mol` entry from every dict, in place.

    Raises KeyError if a dict has no `mol` key. Returns nothing.
    """
    for sample in listOfDict:
        sample.pop('mol')

In [20]:
# Strip the `edge_index2` / `edge_attr2` entries from every split, in place.
for _split in (train_data, val_data, test_data):
    deleteKey(_split)

In [None]:
# Also strip the 'mol' entry from every test sample, in place.
deleteKeyTest(test_data)

In [26]:
# Write the reduced splits to the separate *_data3 pickle files.
for _path, _split in (
    ('../Data/train_data3.pickle', train_data),
    ('../Data/val_data3.pickle', val_data),
    ('../Data/test_data3.pickle', test_data),
):
    with open(_path, 'wb') as handle:
        pickle.dump(_split, handle, protocol=pickle.HIGHEST_PROTOCOL)