In [1]:
import pickle
import torch
from torch_geometric.data import Data,DataLoader
from functions_refactor import *
from pytorch_util import *
#from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
# fixed parameters -- held constant across the experiments in this notebook
block = MEGNet_block                                                # message-passing block class given to the model
head_mol, head_atom, head_edge = head_mol, head_atom, head_edge     # self-assignment; presumably star-imported names -- TODO confirm
clip = 0.5                                                          # passed to train_type_earlyStop as `clip`
batch_size = 64                                                     # used by get_data and the test DataLoader
threshold = 1e3                                                     # passed to train_type_earlyStop as `threshold`
reuse = False                                                       # False -> one independent block per layer
lr = 1e-4                                                           # learning rate given to RAdam

In [3]:
# changing parameters -- the knobs varied between runs
head = SimplyInteraction                                    # prediction head class given to the model
data = '../Data/{}_data_stacking_0815_wNodeInfo.pickle'     # path template; '{}' is filled via .format(...)
dim = 22                                                    # hidden width of the model
logLoss = False                                             # forwarded to train_type_earlyStop
layer = 4                                                   # number of message-passing blocks
factor = 3                                                  # width multiplier inside the node MLP
epochs = 19
edge_in4 = 22                                               # passed positionally to the model constructor
node_in = 32                                                # input node feature width
aggr = 'max'

In [4]:
# Build the training and validation loaders from the path template and batch size.
# get_data is not defined in this notebook (presumably star-imported) -- TODO confirm its contract.
train_dl, val_dl = get_data(data, batch_size)

In [5]:
class GNN_multiHead_interleave_stacking(torch.nn.Module):
    """Node MLP -> stacked message-passing blocks -> edge-level prediction head.

    Args:
        reuse: if True, build ONE block and apply it `layer` times (shared
            weights); otherwise build `layer` independent blocks.
        block: nn.Module class, constructed as ``block(dim=dim, aggr=aggr)`` and
            called as ``block(x, edge_index, edge_attr) -> (x, edge_attr)``.
        head: nn.Module class, constructed as ``head(dim)`` and called as
            ``head(x, edge_index, edge_attr, edge_attr_old)``.
        dim: hidden width produced by the node MLP and given to block/head.
        layer: number of message-passing rounds.
        factor: width multiplier of the intermediate layer of the node MLP.
        edge_in4, edge_in3: accepted for signature compatibility; not used here.
        node_in: width of the raw node features (``data.x``).
        aggr: aggregation mode forwarded to ``block``.
    """

    def __init__(self,reuse,block,head,dim,layer,factor,
                 edge_in4,node_in,edge_in3=8,aggr='mean'):
        # block,head are nn.Module
        # node_in,edge_in are dim for bonding and edge_in4,edge_in3 for coupling
        super(GNN_multiHead_interleave_stacking, self).__init__()

        # Remembered so forward() knows how to walk self.conv in both modes.
        self.reuse = reuse
        self.layer = layer

        self.lin_node = Sequential(BatchNorm1d(node_in),Linear(node_in, dim*factor),LeakyReLU(),
                                   BatchNorm1d(dim*factor),Linear(dim*factor, dim),LeakyReLU())
        if reuse:
            # A single shared block; forward() applies it `layer` times.
            self.conv = block(dim=dim,aggr=aggr)
        else:
            self.conv = nn.ModuleList([block(dim=dim,aggr=aggr) for _ in range(layer)])
        self.head = head(dim)

    def forward(self, data,IsTrain=False,typeTrain=False,logLoss=True,weight=None):
        """Predict one value per edge, or compute the training loss.

        Args:
            data: batch exposing ``x``, ``edge_index3``, ``edge_attr3`` and
                ``edge_attr4``; also ``y`` when ``IsTrain`` and ``type_attr``
                when ``typeTrain``.
            IsTrain: if True return ``(loss, loss_perType)`` instead of predictions.
            typeTrain: if True restrict edges (and targets) to ``data.type_attr``.
            logLoss: if True the scalar loss averages log(per-type MAE); otherwise
                it averages the per-type MAE itself.
            weight: unused.

        Returns:
            ``yhat`` when ``IsTrain`` is False; otherwise ``(loss, loss_perType)``
            where ``loss_perType`` is a length-8 tensor holding log(per-type MAE)
            for the types present in the batch and 0 elsewhere (in both modes).
        """
        out = self.lin_node(data.x)
        # edge_*3 only does not repeat for undirected graph. Hence need to add (j,i) to (i,j) in edges
        edge_index3 = torch.cat([data.edge_index3,data.edge_index3[[1,0]]],1)
        n = data.edge_attr3.shape[0]
        # Message passing runs on edge_attr4, duplicated to match the doubled edges.
        # assumes edge_attr4 has one row per edge_attr3 row -- TODO confirm
        edge_attr3 = torch.cat([data.edge_attr4,data.edge_attr4],0)

        # With reuse the same module is applied `layer` times; a bare Module is not iterable.
        convs = [self.conv]*self.layer if self.reuse else self.conv
        for conv in convs:
            out,edge_attr3 = conv(out,edge_index3,edge_attr3)

        # Keep only the original direction of each edge.
        edge_attr3 = edge_attr3[:n]

        if typeTrain:
            if IsTrain:
                y = data.y[data.type_attr]
            edge_attr3 = edge_attr3[data.type_attr]
            edge_index3 = data.edge_index3[:,data.type_attr]
            edge_attr3_old = data.edge_attr3[data.type_attr]
        else:
            if IsTrain:
                y = data.y
            edge_index3 = data.edge_index3
            edge_attr3_old = data.edge_attr3

        yhat = self.head(out,edge_index3,edge_attr3,edge_attr3_old)

        if not IsTrain:
            return yhat

        # Column sums of edge_attr3_old act as per-type weights/counts
        # (presumably a one-hot coupling-type indicator -- TODO confirm).
        k = torch.sum(edge_attr3_old,0)
        nonzeroIndex = torch.nonzero(k).squeeze(1)
        abs_ = torch.abs(y-yhat).unsqueeze(1)
        mae_perType = torch.sum(abs_ * edge_attr3_old[:,nonzeroIndex],0)/k[nonzeroIndex]

        # Allocate on the prediction's device instead of a hard-coded 'cuda:0'.
        loss_perType = torch.zeros(8,device=yhat.device)
        loss_perType[nonzeroIndex] = torch.log(mae_perType)
        if logLoss:
            loss = torch.sum(loss_perType)/nonzeroIndex.shape[0]
        else:
            loss = torch.sum(mae_perType)/nonzeroIndex.shape[0]
        return loss,loss_perType

In [7]:
# layer = 4
# NOTE(review): `aggr` from the parameter cell is not passed here, so the model
# falls back to its constructor default ('mean') -- confirm this is intended.
model = GNN_multiHead_interleave_stacking(
    reuse, block, head, dim, layer, factor, edge_in4, node_in
).to('cuda:0')
paras = trainable_parameter(model)
opt = RAdam(paras, lr=lr, weight_decay=1e-2)
# Halve the learning rate after 5 epochs without improvement, never below 1e-05.
scheduler = ReduceLROnPlateau(opt, 'min', factor=0.5, patience=5, min_lr=1e-05)

model, train_loss_perType, val_loss_perType, bestWeight = train_type_earlyStop(
    opt, model, epochs, train_dl, val_dl, paras, clip,
    scheduler=scheduler, logLoss=logLoss, threshold=threshold, typeTrain=False,
)


epoch:0, train_loss: +4.882, val_loss: -2.450, 
train_vector: +0.76|+0.17|-1.16|-1.34|-1.36|-1.14|-1.26|-1.85, 
val_vector  : -1.50|-1.96|-2.41|-3.18|-2.70|-2.19|-2.79|-2.88

epoch:1, train_loss: +0.104, val_loss: -2.530, 
train_vector: -1.38|-1.88|-2.44|-3.21|-2.71|-2.24|-2.86|-2.95, 
val_vector  : -1.48|-1.96|-2.48|-3.29|-2.86|-2.30|-2.86|-3.01

epoch:2, train_loss: +0.098, val_loss: -2.558, 
train_vector: -1.41|-1.93|-2.49|-3.26|-2.87|-2.30|-2.95|-3.09, 
val_vector  : -1.30|-1.85|-2.52|-3.33|-2.94|-2.34|-3.04|-3.16

epoch:3, train_loss: +0.096, val_loss: -2.591, 
train_vector: -1.41|-1.95|-2.52|-3.28|-2.92|-2.34|-3.01|-3.15, 
val_vector  : -1.51|-1.87|-2.53|-3.34|-2.88|-2.37|-3.05|-3.17

epoch:4, train_loss: +0.095, val_loss: -2.577, 
train_vector: -1.40|-1.95|-2.54|-3.29|-2.96|-2.37|-3.04|-3.17, 
val_vector  : -1.27|-1.75|-2.56|-3.35|-3.00|-2.38|-3.10|-3.20

epoch:5, train_loss: +0.095, val_loss: -2.646, 
train_vector: -1.39|-1.95|-2.55|-3.30|-2.98|-2.38|-3.07|-3.18, 
val_vector  :

In [27]:
# Persist the weights returned by training so the prediction cells can reload them.
with open('../Model/GNN_stacking_0815.pickle', 'wb') as weight_file:
    pickle.dump(bestWeight, weight_file, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
# Reload the weights saved by the training run.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this project.
with open('../Model/GNN_stacking_0815.pickle', 'rb') as handle:
    bestWeight = pickle.load(handle)

# Rebuild the architecture with the same arguments used for training.
# `node_in` is a required positional argument; omitting it raised a TypeError.
model = GNN_multiHead_interleave_stacking(reuse,block,head,dim,layer,factor,edge_in4,node_in).to('cuda:0')

In [28]:
# predict test
# NOTE(review): `test_df` is not created in any visible cell -- it must already
# exist (with an `id` column) before this cell runs.
# Path stem shared by every per-type file: the test path without its '.pickle' suffix.
test_path_stem = data.format('test').split('pickle')[0][:-1]

for type_i in range(8):
    type_suffix = str(type_i)

    # load the graphs of this coupling type and the matching row ids
    with open(test_path_stem+'_type_'+type_suffix+'.pickle', 'rb') as handle:
        test_data = pickle.load(handle)
    test_list = [Data(**d) for d in test_data]
    test_dl = DataLoader(test_list,batch_size,shuffle=False)

    with open(test_path_stem+'_id_type_'+type_suffix+'.pickle', 'rb') as handle:
        test_id = pickle.load(handle)

    # switch to the weights stored for this type
    model.load_state_dict(bestWeight[type_i])

    # predict batch by batch without tracking gradients
    model.eval()
    with torch.no_grad():
        yhat_list = [model(data_torch.to('cuda:0'),False,typeTrain=True) for data_torch in test_dl]
    yhat = torch.cat(yhat_list).cpu().detach().numpy()

    # join predictions back onto the submission frame by id
    assert yhat.shape[0]==test_id.shape[0],'yhat and test_id should have same shape'
    submit_ = dict(zip(test_id,yhat))
    test_df['type'+type_suffix] = test_df.id.map(submit_)
    #test_df['fold'+str(i)+'_type'+str(type_i)] = test_df.id.map(submit_)

In [37]:
# Row-wise mean, ignoring NaNs, over every column from position 5 onward
# (presumably the per-type prediction columns -- verify against test_df's layout).
test_df['scalar_coupling_constant'] = np.nanmean(test_df.iloc[:, 5:], axis=1)
test_df[['id', 'scalar_coupling_constant']].to_csv('../Submission/GNN_stacking_0815', index=False)