In [1]:
import pickle

import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data,DataLoader
# The star imports stay in their original position: names they export can
# shadow (or be shadowed by) neighbouring imports, so their order is kept.
from functions_refactor import *
from pytorch_util import *
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
# Fixed parameters: settings that are not part of the experiment `prefix`.
# `reuse`, `block` and the three heads are passed to the model constructor
# and `batch_size` to every DataLoader; `lr`, `clip` and `threshold` are not
# referenced by any other cell visible in this notebook.
reuse = False
block = MEGNet_block
# No-op rebinding of names that are presumably supplied by the star imports;
# it raises NameError right here if they are missing.
head_mol, head_atom, head_edge = head_mol, head_atom, head_edge
batch_size = 32
lr = 1e-4
clip = 2
threshold = -1.2

In [None]:
# Changing parameters: every value below is folded into `prefix`, which in
# turn selects the checkpoint files to load and names the output files.

# Model architecture (forwarded to the model constructor).
head = SimplyInteraction
dim = 512
layer1 = 4
layer2 = 3
factor = 2
aggr = 'max'
interleave = False

# Path template for the pickled graphs; '{}' is filled with 'train' or 'test'.
data = '../Data/{}_data_ACSF_SOAP_atomInfo_otherInfo.pickle'

# Only used as part of `prefix` in this notebook.
logLoss = True
weight = 0.6
epochs = 100

In [None]:
# Experiment identifier: the string form of every changing parameter, joined
# with '_'. A value whose string contains '}' (here the `data` path template)
# contributes only the segment that follows the first '}'.
# `prefix` is used below to locate '../Model/<prefix>_fold<i>.tar' and to name
# the exported feature files, so its exact text matters.
prefix = '_'.join(
    text.split('}')[1] if '}' in text else text
    for text in map(str, [head, data, dim, logLoss, weight, layer1,
                          layer2, factor, epochs, aggr, interleave])
)

In [None]:
# Read the train/test tables; both are later merged with the extracted
# features on their 'id' column.
train_csv_path = '../Data/train.csv'
test_csv_path = '../Data/test.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Load the five pickled training folds. Each file is named
# '<train path without ".pickle">_f<k>.pickle' and holds a list of dicts,
# each of which is expanded into a torch_geometric `Data` object.
fold_base = data.format('train').split('pickle')[0][:-1]
folds = []
for fold_idx in range(5):
    fold_path = fold_base + '_f' + str(fold_idx) + '.pickle'
    with open(fold_path, 'rb') as fold_file:
        raw_graphs = pickle.load(fold_file)
    folds.append([Data(**graph) for graph in raw_graphs])

In [None]:
def get_intermediate_output(model,dl,device='cuda:0') :
    """Extract node-pair features after the model's first conv layer.

    Written for GNN_multiHead_interleave. For every batch the function runs
    ``model.lin_node`` on ``batch.x``, ``model.edge1`` on ``batch.edge_attr``
    and ``model.conv1[0]`` on the results, then gathers the node embeddings
    at the two rows of ``batch.edge_index3[:, batch.type_attr]`` and
    concatenates the two gathered tensors along dim 1.

    Parameters
    ----------
    model : object exposing ``eval()``, ``lin_node``, ``edge1`` and ``conv1``.
        It is switched to eval mode and left in that mode.
    dl : iterable of batches exposing ``to(device)``, ``x``, ``edge_attr``,
        ``edge_index``, ``edge_index3`` and ``type_attr``.
    device : target passed to ``batch.to``. Defaults to ``'cuda:0'``, the
        value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        The per-batch outputs concatenated along axis 0.

    Raises
    ------
    ValueError
        If ``dl`` yields no batches.
    """
    outputs = []
    model.eval()
    with torch.no_grad():
        # `batch` rather than `data`: avoids shadowing the notebook-level
        # `data` path template inside this function.
        for batch in dl:
            batch = batch.to(device)
            node_emb = model.lin_node(batch.x)
            edge_emb = model.edge1(batch.edge_attr)
            node_emb,_ = model.conv1[0](node_emb,batch.edge_index,edge_emb)
            # Per the original note the gathered tensor is (2,N,d):
            # one slice per endpoint of each selected pair.
            pair_emb = node_emb[batch.edge_index3[:,batch.type_attr]]
            pair_features = torch.cat([pair_emb[0],pair_emb[1]],1)
            outputs.append(pair_features.cpu().detach().numpy())
    if not outputs:
        # np.concatenate would raise a less helpful ValueError on an empty list.
        raise ValueError('get_intermediate_output received an empty data loader')
    return np.concatenate(outputs)

In [None]:
# Number of cross-validation folds (one checkpoint file each) and number of
# per-type state dicts / data files handled per fold.
N_FOLDS = 5
N_TYPES = 8


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _predict_type_features(model, state_dict, data_path, id_path):
    """Load one per-type data set and return `(ids, features)` for it.

    `data_path` holds a list of dicts expanded into `Data` objects, `id_path`
    the array of matching row ids. `state_dict` is loaded into `model` before
    `get_intermediate_output` is run on the unshuffled loader.
    """
    graphs = [Data(**d) for d in _load_pickle(data_path)]
    loader = DataLoader(graphs,batch_size,shuffle=False)
    ids = _load_pickle(id_path)
    model.load_state_dict(state_dict)
    features = get_intermediate_output(model,loader)
    assert features.shape[0]==ids.shape[0],'yhat and test_id should have same shape'
    return ids, features


def _feature_frame(ids, features):
    """Build a frame with an 'id' column followed by 'feature_<k>' columns."""
    columns = ['id']+['feature_'+str(feature_i) for feature_i in range(features.shape[1])]
    return pd.DataFrame(np.concatenate([ids.reshape(-1,1),features],1),columns=columns)


# Path stems ('<path without ".pickle">') of the per-type train / test files.
train_base = data.format('train').split('pickle')[0][:-1]
test_base = data.format('test').split('pickle')[0][:-1]

oof_features = []
test_oof_df = None
for i in range(N_FOLDS):
    print('\nstart fold '+str(i))

    # Build the model; nothing is trained here, the weights come from the
    # fold's checkpoint, which stores one state dict per type.
    model = GNN_multiHead_interleave(reuse,block,head,head_mol,head_atom,head_edge,
                          dim,layer1,layer2,factor,**data_dict[data],aggr=aggr,interleave=interleave).to('cuda:0')
    checkpoint = torch.load('../Model/'+prefix+'_fold'+str(i)+'.tar')

    # Out-of-fold features: the per-type files of fold i, model of fold i.
    for type_i in range(N_TYPES):
        type_stem = train_base+'_f'+str(i)+'_type_'+str(type_i)
        test_id, yhat = _predict_type_features(
            model,
            checkpoint['model_state_dict_type_'+str(type_i)],
            type_stem+'.pickle',
            type_stem+'_id.pickle')
        oof_features.append(_feature_frame(test_id, yhat))

    # Test features for the same fold model.
    test_features = []
    for type_i in range(N_TYPES):
        test_id, yhat = _predict_type_features(
            model,
            checkpoint['model_state_dict_type_'+str(type_i)],
            test_base+'_type_'+str(type_i)+'.pickle',
            test_base+'_id_type_'+str(type_i)+'.pickle')
        test_features.append(_feature_frame(test_id, yhat))

    fold_test_df = pd.concat(test_features, ignore_index=True)
    if test_oof_df is None:
        test_oof_df = fold_test_df
    else:
        # The element-wise sum below is only meaningful when every fold
        # yields the test rows in the same order.
        assert (test_oof_df['id'].values==fold_test_df['id'].values).all(),'test ids should be aligned across folds'
        feature_cols = [col for col in fold_test_df.columns if col != 'id']
        # Accumulate the sum over folds; a later cell divides by the fold count.
        test_oof_df.loc[:,feature_cols] = test_oof_df.loc[:,feature_cols].values \
                                          + fold_test_df.loc[:,feature_cols].values

In [4]:
# Stack the per-fold, per-type feature frames and attach them to the train
# table by 'id'. The inner join must keep every train row.
oof_df = pd.concat(oof_features, ignore_index=True)
train_rows_before = len(train_df)
train_df = train_df.merge(oof_df, on='id', how='inner')
train_rows_after = len(train_df)
assert train_rows_before == train_rows_after, "oof id should match train id"

In [11]:
# Turn the sum over the 5 fold models held in test_oof_df into a mean, then
# attach the averaged features to the test table by 'id'.
# The feature columns are taken from test_oof_df itself, so this cell no
# longer depends on the `yhat` variable left over from the extraction loop.
# NOTE: the division is in place; re-running this cell divides again.
feature_cols = [col for col in test_oof_df.columns if str(col).startswith('feature_')]
test_oof_df.loc[:,feature_cols] = test_oof_df.loc[:,feature_cols]/5
n_ = test_df.shape[0]
test_df = pd.merge(test_df,test_oof_df,how='inner',on='id')
m_ = test_df.shape[0]
# The inner join must keep every test row.
assert n_==m_,"test oof id should match test id"

In [None]:
# Write both enriched tables as CSV without the index column. The file names
# are the fixed stems below plus `prefix`; no extension is appended.
test_out_path = '../Data/test_oof_features_' + prefix
train_out_path = '../Data/train_oof_features_' + prefix
test_df.to_csv(test_out_path, index=False)
train_df.to_csv(train_out_path, index=False)