In [1]:
import pandas as pd
import numpy as np
import pickle
import torch

In [2]:
# Read the per-atom table and the training pairs.
# The test split is intentionally not loaded in this notebook.
structures = pd.read_csv('../Data/structures.csv')
train = pd.read_csv('../Data/train.csv')

In [3]:
def map_atom_info(df, atom_idx):
    """Left-join the module-level `structures` table onto `df` for one atom slot.

    Rows are matched on `molecule_name` and on `atom_index_{atom_idx}` (left)
    against `atom_index` (right). The helper key column `atom_index` is then
    dropped and the joined `atom`, `x`, `y`, `z` columns are renamed with an
    `_{atom_idx}` suffix.

    Returns a new DataFrame; `df` itself is not modified.
    """
    suffixed = {col: f'{col}_{atom_idx}' for col in ('atom', 'x', 'y', 'z')}
    joined = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )
    return joined.drop(columns='atom_index').rename(columns=suffixed)

In [4]:
# Join atom type and coordinates for both ends of each pair (columns *_0 and *_1).
for atom_idx in (0, 1):
    train = map_atom_info(train, atom_idx)

In [5]:
# Midpoint between atom 0 and atom 1 of every pair, one (x, y, z) row per pair.
centers = 0.5 * (train[['x_0', 'y_0', 'z_0']].values + train[['x_1', 'y_1', 'z_1']].values)

In [6]:
# Displacement from atom 1 to atom 0 for every pair (not yet normalised).
vector = np.subtract(train[['x_0', 'y_0', 'z_0']].values,
                     train[['x_1', 'y_1', 'z_1']].values)

In [7]:
# Rebuild the raw displacement from the coordinates before normalising so this
# cell can be re-run safely. Normalising `vector` in place from its previous
# value would make a second run compute `norm` from unit vectors (all 1.0),
# silently corrupting the distance feature that later cells read from `norm`.
vector = train[['x_0', 'y_0', 'z_0']].values - train[['x_1', 'y_1', 'z_1']].values
# Pair distance, kept as an (n_pairs, 1) column so it broadcasts over x, y, z.
norm = np.linalg.norm(vector, axis=1)[:, None]
# Unit direction from atom 1 to atom 0.
vector = vector / norm

In [12]:
# One-hot encode the atom symbol into five indicator columns.
# The dtype is pinned to uint8 (the pre-2.0 pandas default): newer pandas
# returns bool columns, and mixing bool with the float coordinates makes a
# later `.values` call produce an object array that breaks the numeric code.
# The column labels are assigned positionally, relying on get_dummies
# returning its columns in sorted order.
structures[['C', 'F', 'H', 'N', 'O']] = pd.get_dummies(structures.atom, dtype=np.uint8)

In [13]:
# Group by the scalar column name rather than a one-element list: with a list
# key, newer pandas yields 1-tuples such as ('name',) when the groupby is
# iterated, which would break the string-keyed lookups done later.
structures_gb = structures.groupby('molecule_name')

In [14]:
# Map each molecule name to an (n_atoms, 8) array: x, y, z followed by the
# five one-hot atom-type columns.
# The groupby is built here from `structures` instead of reading the previous
# value of `structures_gb`, so re-running this cell does not try to unpack the
# keys of an already-built dict. Grouping by the scalar column name also keeps
# the dict keys plain strings.
structures_gb = {
    name: atoms[['x', 'y', 'z', 'C', 'F', 'H', 'N', 'O']].values
    for name, atoms in structures.groupby('molecule_name')
}

In [62]:
def create(mol, c, v, d=None):
    """Build per-row features for `mol` relative to an axis through `c` along `v`.

    Parameters
    ----------
    mol : 2-D array. The first three columns are used as coordinates; all
        remaining columns are copied through unchanged.
    c : point subtracted from every coordinate row (length-3 array).
    v : direction the shifted coordinates are projected onto (length-3 array).
    d : value appended as the last column of every row (scalar or length-1
        array). When omitted, the module-level name `d` is used, which keeps
        existing `create(mol, c, v)` calls working.

    Returns
    -------
    2-D array with one row per row of `mol` and columns
    ``[r, R, mol[:, 3:], d]``, where ``r`` is the dot product of the shifted
    coordinates with `v` and ``R`` is the norm of what is left after removing
    ``r * v`` (the perpendicular distance to the axis when `v` is a unit vector).

    Raises
    ------
    NameError
        If `d` is not passed and no module-level `d` exists.
    """
    if d is None:
        # Legacy call path: previously `d` was read implicitly from module scope.
        try:
            d = globals()['d']
        except KeyError:
            raise NameError("name 'd' is not defined") from None

    X = mol[:, :3] - c
    r = np.matmul(X, v)
    R = np.linalg.norm(X - r[:, None] * v, axis=1)
    # Repeat the single value of `d` once per row so it can be stacked as a column.
    d_col = np.broadcast_to(d, mol.shape[0])[:, None]
    return np.concatenate([r[:, None], R[:, None], mol[:, 3:], d_col], 1)

In [77]:
# Build one feature array per training pair.
# The molecule names are pulled out once as an array: looking them up with
# `train.iloc[i][...]` builds a full row Series on every iteration, which is
# very slow on a large table.
molecule_names = train['molecule_name'].values

feature_list = []
# NOTE: `d` must be bound at module level here, because `create(mol, c, v)`
# picks the pair distance up from the module-level name `d`.
for name, c, v, d in zip(molecule_names, centers, vector, norm):
    mol = structures_gb[name]
    feature_list.append(create(mol, c, v))

In [85]:
def numpy2torch(x):
    """Return a float32 torch tensor built from a float32 copy of array `x`."""
    as_float32 = x.astype(np.float32)
    return torch.from_numpy(as_float32)

In [89]:
# Convert every per-pair feature array to a float32 tensor.
feature_list = list(map(numpy2torch, feature_list))

In [90]:
# Persist the list of per-pair node feature tensors.
with open('../Data/train_data_attention_node.pickle', 'wb') as node_file:
    pickle.dump(feature_list, node_file, protocol=pickle.HIGHEST_PROTOCOL)

In [95]:
# Edge features: one-hot coupling type followed by the pair distance column.
edge_X = numpy2torch(
    np.concatenate((pd.get_dummies(train.type).values, norm), axis=1)
)

In [99]:
# Persist the edge feature tensor.
with open('../Data/train_data_attention_edge.pickle', 'wb') as edge_file:
    pickle.dump(edge_X, edge_file, protocol=pickle.HIGHEST_PROTOCOL)

In [101]:
# Regression target: the scalar coupling constant of each pair, as float32.
edge_y = numpy2torch(train['scalar_coupling_constant'].values)

In [103]:
# Persist the target tensor.
with open('../Data/train_data_attention_edge_y.pickle', 'wb') as target_file:
    pickle.dump(edge_y, target_file, protocol=pickle.HIGHEST_PROTOCOL)