In [1]:
import pandas as pd
import numpy as np
import pickle
import torch

In [2]:
# Which competition split to featurise ('test' or 'train').  The rest of the
# notebook keeps the historical variable name `train` for whichever split is
# loaded here.
# NOTE: the output pickle paths further down are hard-coded with a `test_`
# prefix; change them together with SPLIT when featurising the train split.
SPLIT = 'test'
train = pd.read_csv(f'../Data/{SPLIT}.csv')
# Per-atom table: one row per (molecule_name, atom_index) with element and x/y/z.
structures = pd.read_csv('../Data/structures.csv')

In [3]:
def map_atom_info(df, atom_idx):
    """Attach the element and coordinates of one atom of each pair to ``df``.

    The module-level ``structures`` table is left-joined onto ``df`` using
    ``molecule_name`` and ``atom_index_{atom_idx}``.  The joined
    ``atom_index`` column is then dropped and ``atom``/``x``/``y``/``z`` are
    renamed with an ``_{atom_idx}`` suffix.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    # atom -> atom_<idx>, x -> x_<idx>, ...
    suffixed = {col: f'{col}_{atom_idx}' for col in ('atom', 'x', 'y', 'z')}

    left_keys = ['molecule_name', f'atom_index_{atom_idx}']
    right_keys = ['molecule_name', 'atom_index']
    merged = pd.merge(df, structures, how='left',
                      left_on=left_keys, right_on=right_keys)

    return merged.drop('atom_index', axis=1).rename(columns=suffixed)

In [4]:
# Join the element and coordinates of atom 0, then atom 1, onto every pair row.
for atom_idx in (0, 1):
    train = map_atom_info(train, atom_idx)

In [5]:
# Preview only the first rows instead of asking for the whole pair table.
train.head(10)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.661639,0.000000,1.000000,C,0.599539,0.000000,1.000000
1,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.661639,0.000000,1.000000,C,-0.599539,0.000000,1.000000
2,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.661639,0.000000,1.000000,H,1.661639,0.000000,1.000000
3,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.661639,0.000000,1.000000,C,0.599539,0.000000,1.000000
4,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.661639,0.000000,1.000000,C,-0.599539,0.000000,1.000000
5,4658152,dsgdb9nsd_000015,3,0,1JHC,H,1.005284,1.810158,0.004656,C,-0.014821,1.392412,0.005671
6,4658153,dsgdb9nsd_000015,3,2,3JHC,H,1.005284,1.810158,0.004656,C,0.637949,-0.553297,-1.113582
7,4658154,dsgdb9nsd_000015,3,4,2JHH,H,1.005284,1.810158,0.004656,H,-0.546896,1.793435,-0.872511
8,4658155,dsgdb9nsd_000015,3,5,2JHH,H,1.005284,1.810158,0.004656,H,-0.530029,1.722920,0.911017
9,4658156,dsgdb9nsd_000015,4,0,1JHC,H,-0.546896,1.793435,-0.872511,C,-0.014821,1.392412,0.005671


In [6]:
# Midpoint between the two atoms of every pair, one (x, y, z) row per pair.
pos_0 = train.loc[:, ['x_0', 'y_0', 'z_0']].values
pos_1 = train.loc[:, ['x_1', 'y_1', 'z_1']].values
centers = (pos_0 + pos_1) / 2

In [7]:
# Displacement from atom 1 to atom 0 for every pair (not yet normalised).
vector = np.subtract(
    train.loc[:, ['x_0', 'y_0', 'z_0']].values,
    train.loc[:, ['x_1', 'y_1', 'z_1']].values,
)

In [8]:
# Recompute the raw displacement here instead of dividing `vector` in place:
# re-running this cell on an already-normalised `vector` would make every
# `norm` come out as ~1 and silently corrupt the distance feature.
displacement = (train.loc[:, ['x_0', 'y_0', 'z_0']].values
                - train.loc[:, ['x_1', 'y_1', 'z_1']].values)

# Length of each displacement, kept as a column so it broadcasts over x/y/z.
norm = np.linalg.norm(displacement, axis=1)[:, None]

# Unit direction from atom 1 to atom 0.
vector = displacement / norm

In [9]:
# One-hot encode the element of every atom into a fixed set of columns.
# * dtype is pinned to uint8: pandas >= 2.0 defaults to bool, and mixing bool
#   columns with the float coordinates makes `.values` return an object array.
# * reindex fixes the column set and order, so a structures table that lacks
#   one of these elements still gets all five columns (filled with zeros).
ATOM_TYPES = ['C', 'F', 'H', 'N', 'O']
structures[ATOM_TYPES] = (
    pd.get_dummies(structures.atom, dtype=np.uint8)
      .reindex(columns=ATOM_TYPES, fill_value=0)
)

In [10]:
# Group the atoms by molecule.  The key is passed as a scalar rather than a
# one-element list so that iterating the groups yields plain molecule-name
# strings; with a list key pandas >= 2.0 yields 1-tuples, which would not
# match the string lookups performed when the features are built.
structures_gb = structures.groupby('molecule_name')

In [11]:
# Turn the grouped table into a dict: molecule name -> array with one row per
# atom and the columns x, y, z followed by the five one-hot element flags.
node_cols = ['x', 'y', 'z', 'C', 'F', 'H', 'N', 'O']
structures_gb = {
    mol_name: atoms.loc[:, node_cols].values
    for mol_name, atoms in structures_gb
}

In [12]:
def create(mol, c, v, d=None):
    """Build the per-atom feature matrix of one molecule relative to an axis.

    Parameters
    ----------
    mol : 2-D array, one row per atom.  Columns 0-2 are read as coordinates;
        all remaining columns are copied through unchanged.
    c : length-3 array subtracted from the coordinates (the new origin).
    v : length-3 direction the shifted coordinates are projected onto.
    d : optional scalar or length-1 array appended as the last column of
        every row.  When omitted, the module-level name ``d`` is used, which
        is how existing three-argument callers supply it.

    Returns
    -------
    2-D array with one row per atom: ``[r, R, *mol[:, 3:], d]`` where ``r`` is
    the projection onto ``v`` and ``R`` is the norm of the remainder
    ``X - r*v`` (the distance from the axis when ``v`` has unit length).

    Raises
    ------
    NameError
        If ``d`` is not passed and no module-level ``d`` exists.
    """
    if d is None:
        # Backward compatibility with callers that set a global `d` first.
        try:
            d = globals()['d']
        except KeyError:
            raise NameError("name 'd' is not defined") from None

    X = mol[:, :3] - c                      # coordinates relative to c
    r = np.matmul(X, v)                     # signed length along v
    R = np.linalg.norm(X - r[:, None] * v, axis=1)
    d_col = np.broadcast_to(d, mol.shape[0])[:, None]
    return np.concatenate([r[:, None], R[:, None], mol[:, 3:], d_col], 1)

In [17]:
# Build one node-feature matrix per atom pair.
# The molecule names are pulled out once as an array: `train.iloc[i]` would
# construct a full row Series on every iteration.
# NOTE: the last loop variable must stay named `d` at module level, because
# create() is called with three arguments and reads `d` as a global.
feature_list = []
for mol_name, c, v, d in zip(train['molecule_name'].values, centers, vector, norm):
    feature_list.append(create(structures_gb[mol_name], c, v))

In [18]:
def numpy2torch(x):
    """Return a float32 torch tensor holding the values of NumPy array ``x``."""
    # astype makes a new float32 array, so the tensor does not alias `x`.
    as_float32 = x.astype(np.float32)
    return torch.from_numpy(as_float32)

In [19]:
# Convert every per-pair feature matrix to a float32 tensor.
feature_list = list(map(numpy2torch, feature_list))

In [20]:
# Persist the list of per-pair node-feature tensors.
with open('../Data/test_data_attention_node.pickle', 'wb') as fout:
    pickle.dump(feature_list, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
# Edge features per pair: one-hot coupling type followed by the `norm` column.
type_onehot = pd.get_dummies(train['type']).values
edge_X = torch.from_numpy(
    np.concatenate([type_onehot, norm], axis=1).astype(np.float32)
)

In [22]:
# Persist the edge-feature tensor.
with open('../Data/test_data_attention_edge.pickle', 'wb') as fout:
    pickle.dump(edge_X, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Atom indices (within their molecule) of the two atoms of every pair.
pair_index_cols = ['atom_index_0', 'atom_index_1']
ind = torch.from_numpy(train.loc[:, pair_index_cols].values)

with open('../Data/test_data_ind.pickle', 'wb') as fout:
    pickle.dump(ind, fout, protocol=pickle.HIGHEST_PROTOCOL)