In [1]:
import pandas as pd
import numpy as np
import pickle
from ase import Atoms
from dscribe.descriptors import ACSF
import copy

In [2]:
# Read the five CSV tables this notebook works from (order of names matches order of paths).
train, test, structures, train_bonds, test_bonds = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/train.csv',
        '../Data/test.csv',
        '../Data/structures.csv',
        '../Data/train_bonds.csv',
        '../Data/test_bonds.csv',
    )
)

In [3]:
# Remove the 'Unnamed: 0' column from both bond tables
# (presumably a row index that was written out with the CSV -- confirm).
# The axis is named with `columns=` because the positional form
# drop(label, 1) was deprecated in pandas 1.3 and removed in pandas 2.0.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
test_bonds = test_bonds.drop(columns='Unnamed: 0', errors='ignore')
train_bonds = train_bonds.drop(columns='Unnamed: 0', errors='ignore')

### node information

In [4]:
# Load the node-feature lookup; it is indexed by molecule name in create_data below.
# NOTE: pickle.load can execute arbitrary code -- only open pickles you produced yourself.
with open('../Data/structures_dict_ACSF_3_4.pickle', mode='rb') as pkl_file:
    structures_dict = pickle.load(pkl_file)

### bond information

In [5]:
# Load the bond-graph lookups (edge indices and edge attributes), both indexed by
# molecule name in create_data below.
# NOTE: pickle.load can execute arbitrary code -- only open pickles you produced yourself.
with open('../Data/bonds_edge_index.pickle', mode='rb') as index_file:
    bonds_edge_index = pickle.load(index_file)
with open('../Data/bonds_edge_attr.pickle', mode='rb') as attr_file:
    bonds_edge_attr = pickle.load(attr_file)

In [4]:
# Settings for the two Gaussian expansions computed further down.
n_points = 25   # number of Gaussian centres on each grid

# Grid 1 (applied to bonds_edge_attr): first and last centre.
start1 = 0.6
stop1 = 1.6

# Grid 2 (applied to coupling_edge_dist): first and last centre.
start2 = 1.0
stop2 = 3.8

In [5]:
# Each grid is a (1, n_points) row of evenly spaced Gaussian centres, so it
# broadcasts against an (n, 1) column of values.
offset1 = np.linspace(start1, stop1, num=n_points).reshape(1, -1)
offset2 = np.linspace(start2, stop2, num=n_points).reshape(1, -1)

# Exponent factor -1 / (2 * spacing**2): the Gaussian width equals the grid spacing.
coeff1 = -0.5 / np.power(offset1[0, 0] - offset1[0, 1], 2)
coeff2 = -0.5 / np.power(offset2[0, 0] - offset2[0, 1], 2)

In [42]:
# Append the Gaussian expansion of column 0 (on grid 1) to every entry of bonds_edge_attr.
# NOTE: the dict is updated in place, so re-running this cell appends the
# n_points columns a second time.
for mol_name, attr in bonds_edge_attr.items():
    delta = attr[:, 0:1] - offset1   # (n, 1) against (1, n_points) -> (n, n_points)
    expanded = np.exp(coeff1 * np.power(delta, 2))
    bonds_edge_attr[mol_name] = np.concatenate((attr, expanded), axis=1)

In [44]:
# Persist the widened bond attributes under a separate file name.
with open('../Data/bonds_edge_attr_expand.pickle', mode='wb') as out_file:
    pickle.dump(bonds_edge_attr, out_file, protocol=pickle.HIGHEST_PROTOCOL)

### coupling information

In [2]:
# Load the coupling-graph lookups; each one is indexed by molecule name in create_data below.
# NOTE: pickle.load can execute arbitrary code -- only open pickles you produced yourself.
with open('../Data/coupling_edge_index.pickle', mode='rb') as index_file:
    coupling_edge_index = pickle.load(index_file)
with open('../Data/coupling_edge_attr.pickle', mode='rb') as attr_file:
    coupling_edge_attr = pickle.load(attr_file)
with open('../Data/coupling_edge_dist.pickle', mode='rb') as dist_file:
    coupling_edge_dist = pickle.load(dist_file)
with open('../Data/coupling_y.pickle', mode='rb') as target_file:
    coupling_y = pickle.load(target_file)
with open('../Data/coupling_id.pickle', mode='rb') as id_file:
    coupling_id = pickle.load(id_file)

In [6]:
# Append the Gaussian expansion of column 0 (on grid 2) to every entry of coupling_edge_dist.
# NOTE: the dict is updated in place, so re-running this cell appends the
# n_points columns a second time.
for mol_name, dist in coupling_edge_dist.items():
    delta = dist[:, 0:1] - offset2   # (n, 1) against (1, n_points) -> (n, n_points)
    expanded = np.exp(coeff2 * np.power(delta, 2))
    coupling_edge_dist[mol_name] = np.concatenate((dist, expanded), axis=1)

In [9]:
# Persist the widened coupling distances under a separate file name.
with open('../Data/coupling_edge_dist_expand.pickle', mode='wb') as out_file:
    pickle.dump(coupling_edge_dist, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [56]:
# Sorted arrays of the distinct molecule names in each table.
train_mol = np.unique(train['molecule_name'])
test_mol = np.unique(test['molecule_name'])

In [57]:
# Shuffle the molecule names ahead of the train/validation split.
# A fixed seed on a private RandomState makes the split reproducible across
# kernel restarts and leaves NumPy's global random state untouched.
train_mol = np.random.RandomState(42).permutation(train_mol)

In [58]:
# Keep the first N_TRAIN_MOL shuffled molecules for training and hold out the rest
# for validation.
N_TRAIN_MOL = 70000

# train_mol is overwritten below, so running this cell a second time would
# silently yield an empty validation set; fail loudly instead.
if len(train_mol) <= N_TRAIN_MOL:
    raise ValueError(
        'train_mol has %d entries, need more than %d to leave a validation set '
        '(was this cell already run?)' % (len(train_mol), N_TRAIN_MOL)
    )

train_mol2 = train_mol[:N_TRAIN_MOL]
val_mol = train_mol[N_TRAIN_MOL:]
train_mol = train_mol2

In [59]:
def _concat_or_empty(arrays):
    """Concatenate ``arrays``; return an empty array when the list is empty.

    ``np.concatenate`` raises ``ValueError`` on an empty list, which happens
    when no molecule contributed to a given coupling type.
    """
    if arrays:
        return np.concatenate(arrays)
    # assumes the ids are integers -- TODO confirm against coupling_id
    return np.empty(0, dtype=np.int64)


def create_data(mols, IsTrain, n_types=8):
    """Assemble per-molecule sample dicts from the module-level lookup dicts.

    For every molecule name ``m`` in ``mols`` a dict is built with the keys
    ``'x'``, ``'edge_index'``, ``'edge_attr'``, ``'edge_index3'``,
    ``'edge_attr3'`` and ``'edge_attr4'`` (plus ``'y'`` when ``IsTrain``),
    taken from ``structures_dict``, ``bonds_edge_index``, ``bonds_edge_attr``,
    ``coupling_edge_index``, ``coupling_edge_attr``, ``coupling_edge_dist``
    and ``coupling_y`` respectively.

    Args:
        mols: iterable of molecule names present in all of the lookup dicts.
        IsTrain: when true, include the targets ``coupling_y[m]`` under ``'y'``;
            when false, additionally collect the ids from ``coupling_id``.
        n_types: number of per-type lists to create; must be at least the
            number of columns of ``edge_attr3``. Defaults to 8.

    Returns:
        ``(tot_list, type_list)`` when ``IsTrain`` is true, otherwise
        ``(tot_list, type_list, ids, ids_per_type)``.

        * ``tot_list``: one deep-copied sample dict per molecule.
        * ``type_list``: ``n_types`` lists; list ``i`` holds a deep copy of the
          sample, extended with a uint8 mask ``'type_attr'``, for every
          molecule whose ``edge_attr3`` column ``i`` has a non-zero sum.
        * ``ids``: all ``coupling_id[m]`` arrays concatenated.
        * ``ids_per_type``: for each type, the ids selected by the same mask,
          concatenated (an empty array if the type never occurs).
    """
    type_list = [[] for _ in range(n_types)]
    tot_list = []
    test_id_type_list = [[] for _ in range(n_types)]
    test_id_list = []

    for m in mols:
        # Keys are inserted in the same order as before ('y' sits after 'edge_attr').
        sample = {'x': structures_dict[m],
                  'edge_index': bonds_edge_index[m],
                  'edge_attr': bonds_edge_attr[m]}
        if IsTrain:
            sample['y'] = coupling_y[m]
        sample['edge_index3'] = coupling_edge_index[m]
        sample['edge_attr3'] = coupling_edge_attr[m]
        sample['edge_attr4'] = coupling_edge_dist[m]

        # The full-molecule copy is taken before 'type_attr' is added.
        tot_list.append(copy.deepcopy(sample))
        if not IsTrain:
            test_id_list.append(coupling_id[m])

        # presumably edge_attr3 is a one-hot coupling-type matrix -- confirm;
        # argmax gives the type column of every row.
        type_matrix = sample['edge_attr3']
        row_type = type_matrix.argmax(1)
        for i in np.nonzero(type_matrix.sum(0))[0]:
            mask = row_type == i
            sample['type_attr'] = mask.astype(np.uint8)
            type_list[i].append(copy.deepcopy(sample))
            if not IsTrain:
                test_id_type_list[i].append(coupling_id[m][mask])

    if IsTrain:
        return tot_list, type_list
    return (tot_list,
            type_list,
            _concat_or_empty(test_id_list),
            [_concat_or_empty(type_i) for type_i in test_id_type_list])

In [60]:
# Build the per-molecule and per-type sample lists for each split.
# Only the test call returns the extra id arrays.
tot_list_train, type_list_train = create_data(train_mol, IsTrain=True)
tot_list_val, type_list_val = create_data(val_mol, IsTrain=True)
(tot_list_test,
 type_list_test,
 test_id,
 test_id_type) = create_data(test_mol, IsTrain=False)

In [61]:
# Write the full (all-types) sample list of each split to its own pickle.
for out_path, samples in (
    ('../Data/train_data_ACSF_expand.pickle', tot_list_train),
    ('../Data/val_data_ACSF_expand.pickle', tot_list_val),
    ('../Data/test_data_ACSF_expand.pickle', tot_list_test),
):
    with open(out_path, mode='wb') as out_file:
        pickle.dump(samples, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [62]:
# Store the concatenated test ids next to the test samples.
with open('../Data/test_data_ACSF_expand_id.pickle', mode='wb') as out_file:
    pickle.dump(test_id, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
def save_type(prefix, type_list):
    """Pickle every element of ``type_list`` into its own file.

    Element ``i`` is written to ``prefix + '_type_' + str(i) + '.pickle'``
    using the highest pickle protocol.
    """
    for type_idx, payload in enumerate(type_list):
        target = ''.join((prefix, '_type_', str(type_idx), '.pickle'))
        with open(target, mode='wb') as out_file:
            pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [64]:
# One pickle per coupling type for each split.
for out_prefix, per_type in (
    ('../Data/train_data_ACSF_expand', type_list_train),
    ('../Data/val_data_ACSF_expand', type_list_val),
    ('../Data/test_data_ACSF_expand', type_list_test),
):
    save_type(out_prefix, per_type)

In [65]:
# Per-type test ids, written with the same file naming scheme as the per-type samples.
save_type(prefix='../Data/test_data_ACSF_expand_id', type_list=test_id_type)

### numpy -> torch

In [2]:
# Read back the full sample lists written above.
# NOTE: pickle.load can execute arbitrary code -- only open pickles you produced yourself.
with open('../Data/train_data_ACSF_expand.pickle', mode='rb') as train_file:
    train_data = pickle.load(train_file)
with open('../Data/val_data_ACSF_expand.pickle', mode='rb') as val_file:
    val_data = pickle.load(val_file)
with open('../Data/test_data_ACSF_expand.pickle', mode='rb') as test_file:
    test_data = pickle.load(test_file)

In [3]:
def load_type(prefix, n_types=8):
    """Load the per-type pickles written by ``save_type``.

    Args:
        prefix: path prefix; file ``i`` is ``prefix + '_type_' + str(i) + '.pickle'``.
        n_types: how many consecutive files (0 .. n_types-1) to read.
            Defaults to 8, the count this notebook uses everywhere else.

    Returns:
        A list with the unpickled content of each file, in index order.

    Raises:
        FileNotFoundError: if one of the expected files does not exist.
    """
    data = []
    for i in range(n_types):
        # NOTE: pickle.load can execute arbitrary code -- only open trusted files.
        with open(prefix + '_type_' + str(i) + '.pickle', 'rb') as handle:
            data.append(pickle.load(handle))
    return data

In [4]:
# Read back the per-type sample lists of each split.
train_data_type, val_data_type, test_data_type = (
    load_type(in_prefix)
    for in_prefix in (
        '../Data/train_data_ACSF_expand',
        '../Data/val_data_ACSF_expand',
        '../Data/test_data_ACSF_expand',
    )
)

In [5]:
# Replace every value of every sample dict with a torch tensor built from it.
import torch
train_data = [{key: torch.tensor(arr) for key, arr in sample.items()} for sample in train_data]
val_data = [{key: torch.tensor(arr) for key, arr in sample.items()} for sample in val_data]
test_data = [{key: torch.tensor(arr) for key, arr in sample.items()} for sample in test_data]

In [6]:
def numpy2torch(type_list):
    """Return a nested copy of ``type_list`` with all dict values as torch tensors.

    ``type_list`` is a list of lists of dicts; the nesting and the dict keys
    are kept, and each value is passed through ``torch.tensor``.
    """
    return [
        [{key: torch.tensor(value) for key, value in sample.items()} for sample in samples]
        for samples in type_list
    ]

In [7]:
# Convert the per-type lists of every split; the right-hand tuple is built
# from the numpy versions before the names are rebound.
train_data_type, val_data_type, test_data_type = map(
    numpy2torch,
    (train_data_type, val_data_type, test_data_type),
)

In [8]:
# Write the torch versions back to disk.
# NOTE: these are the same paths the numpy versions were written to earlier,
# so the numpy pickles are overwritten.
for out_path, samples in (
    ('../Data/train_data_ACSF_expand.pickle', train_data),
    ('../Data/val_data_ACSF_expand.pickle', val_data),
    ('../Data/test_data_ACSF_expand.pickle', test_data),
):
    with open(out_path, mode='wb') as out_file:
        pickle.dump(samples, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Write the torch per-type lists; the prefixes match the numpy per-type files
# saved earlier, which are therefore overwritten.
for out_prefix, per_type in (
    ('../Data/train_data_ACSF_expand', train_data_type),
    ('../Data/val_data_ACSF_expand', val_data_type),
    ('../Data/test_data_ACSF_expand', test_data_type),
):
    save_type(out_prefix, per_type)