In [1]:
import pandas as pd
import numpy as np
import pickle
import copy

In [2]:
# Tag inserted into the name of the output pickle files written further down
# in this notebook ('../Data/train_' + prefix + ...).
prefix = 'data_ACSF_expand_PCA'

In [3]:
# Read the train and test tables in one pass; only their `molecule_name`
# column is used later in this notebook.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
)

### node information

In [4]:
# Load the pickled lookup that create_data() indexes by molecule name to fill
# the 'x' entry of every sample.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files produced by a trusted pipeline.
with open('../Data/structures_dict_ACSF_PCA.pickle', 'rb') as handle:
    structures_dict = pickle.load(handle)

### bond information

In [5]:
# Load the two bond tables (edge indices, then edge attributes) in order and
# bind them to their module-level names.
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
_bond_tables = []
for _bond_path in ('../Data/bonds_edge_index.pickle',
                   '../Data/bonds_edge_attr_expand.pickle'):
    with open(_bond_path, 'rb') as handle:
        _bond_tables.append(pickle.load(handle))
bonds_edge_index, bonds_edge_attr = _bond_tables

### coupling information

In [6]:
# Load the five coupling tables in a fixed order and bind them to their
# module-level names; the order of the paths must match the unpacking below.
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
_coupling_tables = []
for _coupling_path in ('../Data/coupling_edge_index.pickle',
                       '../Data/coupling_edge_attr.pickle',
                       '../Data/coupling_edge_dist_expand.pickle',
                       '../Data/coupling_y.pickle',
                       '../Data/coupling_id.pickle'):
    with open(_coupling_path, 'rb') as handle:
        _coupling_tables.append(pickle.load(handle))
(coupling_edge_index,
 coupling_edge_attr,
 coupling_edge_dist,
 coupling_y,
 coupling_id) = _coupling_tables

In [7]:
# Unique molecule names of each split (np.unique also returns them sorted, so
# the input to the shuffle below is deterministic).
train_mol = np.unique(train.molecule_name)
test_mol = np.unique(test.molecule_name)

# Shuffle with a fixed seed: the folds are written to disk further down, so an
# unseeded permutation would silently reassign molecules to different folds on
# every kernel restart and make previously saved folds inconsistent.
FOLD_SEED = 0
FOLD_SIZE = 17000  # molecules per fold; the last fold takes whatever remains
train_mol = np.random.RandomState(FOLD_SEED).permutation(train_mol)

# Folds 0-3 are consecutive FOLD_SIZE-long slices; fold 4 is the remainder.
mol_f0, mol_f1, mol_f2, mol_f3 = (
    train_mol[k * FOLD_SIZE:(k + 1) * FOLD_SIZE] for k in range(4)
)
mol_f4 = train_mol[4 * FOLD_SIZE:]

In [10]:
def _concat_ids(chunks, dtype=np.int64):
    """Concatenate a list of id arrays, tolerating an empty list.

    ``np.concatenate`` raises ``ValueError`` when given no arrays, so an empty
    1-D array of ``dtype`` is returned in that case instead.
    """
    if len(chunks) == 0:
        return np.empty(0, dtype=dtype)
    return np.concatenate(chunks)


def create_data(mols,IsTrain):
    """Build per-molecule sample dictionaries, overall and split by coupling type.

    Reads the module-level lookups ``structures_dict``, ``bonds_edge_index``,
    ``bonds_edge_attr``, ``coupling_edge_index``, ``coupling_edge_attr``,
    ``coupling_edge_dist``, ``coupling_id`` and, when ``IsTrain`` is true,
    ``coupling_y``; each is indexed by the molecule key.

    Parameters
    ----------
    mols : iterable
        Molecule keys to process, in output order.
    IsTrain : bool
        When true every sample also carries the target under ``'y'``.

    Returns
    -------
    tot_list : list of dict
        One deep-copied sample per molecule with keys ``'x'``, ``'edge_index'``,
        ``'edge_attr'``, (``'y'``), ``'edge_index3'``, ``'edge_attr3'``,
        ``'edge_attr4'``.
    type_list : list of 8 lists of dict
        ``type_list[i]`` holds, for every molecule whose ``'edge_attr3'`` has a
        non-zero sum in column ``i``, a deep copy of the sample plus a
        ``'type_attr'`` uint8 mask marking rows whose argmax is ``i``.
    tot_ids : numpy.ndarray
        Concatenation of ``coupling_id[m]`` over all molecules.
    type_ids : list of 8 numpy.ndarray
        Per type, the concatenated ids selected by the ``'type_attr'`` mask.

    Notes
    -----
    Unlike the previous version, an empty ``mols`` or a type with no entries
    yields an empty id array instead of a ``ValueError`` from
    ``np.concatenate``.
    Assumes ``'edge_attr3'`` is a 2-D one-hot array with at most 8 columns --
    TODO confirm against the code that produces ``coupling_edge_attr``.
    """
    n_types = 8
    type_list = [[] for _ in range(n_types)]
    tot_list = []

    test_id_type_list = [[] for _ in range(n_types)]
    test_id_list = []

    for m in mols:
        # Keys are inserted in the same order as before ('y' sits between the
        # bond entries and the coupling entries when present).
        sample = {'x':structures_dict[m],'edge_index':bonds_edge_index[m],
                  'edge_attr':bonds_edge_attr[m]}
        if IsTrain:
            sample['y'] = coupling_y[m]
        sample['edge_index3'] = coupling_edge_index[m]
        sample['edge_attr3'] = coupling_edge_attr[m]
        sample['edge_attr4'] = coupling_edge_dist[m]

        ids = coupling_id[m]
        # Deep copy so later edits to `sample` (the 'type_attr' key) and to the
        # source tables never leak into the stored entry.
        tot_list.append(copy.deepcopy(sample))
        test_id_list.append(ids)

        row_type = sample['edge_attr3'].argmax(1)
        # Only visit the types that actually occur in this molecule.
        for i in np.nonzero(sample['edge_attr3'].sum(0))[0]:
            mask = row_type == i
            sample['type_attr'] = mask.astype(np.uint8)
            type_list[i].append(copy.deepcopy(sample))
            test_id_type_list[i].append(ids[mask])

    tot_ids = _concat_ids(test_id_list)
    # Empty per-type results reuse the dtype of the overall id array.
    type_ids = [_concat_ids(chunks, dtype=tot_ids.dtype) for chunks in test_id_type_list]
    return tot_list,type_list,tot_ids,type_ids

In [11]:
# Build the training samples of every fold; each create_data() call returns
# (all samples, per-type samples, all ids, per-type ids) for one fold.
(
    (tot_list_f0, type_list_f0, tot_id_f0, type_id_f0),
    (tot_list_f1, type_list_f1, tot_id_f1, type_id_f1),
    (tot_list_f2, type_list_f2, tot_id_f2, type_id_f2),
    (tot_list_f3, type_list_f3, tot_id_f3, type_id_f3),
    (tot_list_f4, type_list_f4, tot_id_f4, type_id_f4),
) = (
    create_data(fold_mols, True)
    for fold_mols in (mol_f0, mol_f1, mol_f2, mol_f3, mol_f4)
)

In [16]:
# Turn every NumPy array stored in the per-fold sample dictionaries into a
# torch tensor. Folds are converted one at a time and rebound immediately.
import torch
tot_list_f0 = [{key: torch.tensor(value) for key, value in sample.items()} for sample in tot_list_f0]
tot_list_f1 = [{key: torch.tensor(value) for key, value in sample.items()} for sample in tot_list_f1]
tot_list_f2 = [{key: torch.tensor(value) for key, value in sample.items()} for sample in tot_list_f2]
tot_list_f3 = [{key: torch.tensor(value) for key, value in sample.items()} for sample in tot_list_f3]
tot_list_f4 = [{key: torch.tensor(value) for key, value in sample.items()} for sample in tot_list_f4]

In [17]:
def numpy2torch(type_list):
    """Return a tensor-valued copy of a list of lists of sample dictionaries.

    For every dictionary in every inner list, each value is passed through
    ``torch.tensor`` (which copies the data); keys, key order and the nesting
    of the lists are preserved. The input is not modified.
    """
    return [
        [
            {key: torch.tensor(value) for key, value in sample.items()}
            for sample in samples
        ]
        for samples in type_list
    ]

In [18]:
# Convert the per-type sample lists to torch tensors, one fold per statement.
# Each name is rebound right after its own conversion -- presumably so the
# NumPy version of a fold can be released before the next fold is converted;
# keep the statements separate if peak memory matters.
type_list_f0 = numpy2torch(type_list_f0)
type_list_f1 = numpy2torch(type_list_f1)
type_list_f2 = numpy2torch(type_list_f2)
type_list_f3 = numpy2torch(type_list_f3)
type_list_f4 = numpy2torch(type_list_f4)

In [19]:
# Write the full sample list of every fold to
# '../Data/train_<prefix>_f<k>.pickle', in fold order.
for fold_suffix, fold_samples in (
    ('_f0.pickle', tot_list_f0),
    ('_f1.pickle', tot_list_f1),
    ('_f2.pickle', tot_list_f2),
    ('_f3.pickle', tot_list_f3),
    ('_f4.pickle', tot_list_f4),
):
    with open('../Data/train_'+prefix+fold_suffix, 'wb') as handle:
        pickle.dump(fold_samples, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
def save_type(prefix,type_list,postfix=''):
    """Pickle each element of ``type_list`` into its own file.

    Element number ``i`` is written to
    ``prefix + '_type_' + str(i) + postfix + '.pickle'`` using the highest
    available pickle protocol. Returns ``None``.
    """
    for type_no, payload in enumerate(type_list):
        target = ''.join([prefix, '_type_', str(type_no), postfix, '.pickle'])
        with open(target, 'wb') as out_file:
            pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
# Write the per-type sample lists of every fold, one file per coupling type.
for fold_tag, fold_types in (
    ('_f0', type_list_f0),
    ('_f1', type_list_f1),
    ('_f2', type_list_f2),
    ('_f3', type_list_f3),
    ('_f4', type_list_f4),
):
    save_type('../Data/train_'+prefix+fold_tag, fold_types)

In [22]:
# Write the concatenated coupling ids of every fold to
# '../Data/train_<prefix>_f<k>_id.pickle'.
for id_suffix, fold_ids in (
    ('_f0_id.pickle', tot_id_f0),
    ('_f1_id.pickle', tot_id_f1),
    ('_f2_id.pickle', tot_id_f2),
    ('_f3_id.pickle', tot_id_f3),
    ('_f4_id.pickle', tot_id_f4),
):
    with open('../Data/train_'+prefix+id_suffix, 'wb') as handle:
        pickle.dump(fold_ids, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Then the per-type id arrays of every fold, one '_id' file per coupling type.
for fold_tag, fold_type_ids in (
    ('_f0', type_id_f0),
    ('_f1', type_id_f1),
    ('_f2', type_id_f2),
    ('_f3', type_id_f3),
    ('_f4', type_id_f4),
):
    save_type('../Data/train_'+prefix+fold_tag, fold_type_ids, '_id')