In [None]:
import pickle
import torch
from torch_geometric.data import Data,DataLoader
from functions_refactor import *
from pytorch_util import *
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
# Settings held constant across experiments.
# The head_* names are not defined in this notebook; presumably they arrive via
# the star imports above -- TODO confirm. Only head_edge is really rebound (to
# head_edge2); head_mol and head_atom are assigned back to themselves.
(head_mol, head_atom, head_edge) = (head_mol, head_atom, head_edge2)
lr = 1e-4          # learning rate handed to Adam
batch_size = 32    # batch size of every DataLoader built below
clip = 2           # forwarded to train_type_earlyStop_5fold (presumably gradient clipping -- TODO confirm)
reuse = False      # first positional argument of GNN_multiHead_noEdge
threshold = -1.3   # not referenced by any other visible cell

In [None]:
# Per-experiment settings. Every value in this cell is folded into `prefix`,
# which names the saved model and prediction files.

# -- input --
# '{}' is filled with 'train' or 'test'; the unformatted template is also the
# lookup key into data_dict when the model is built.
data = '../Data/{}_data_ACSF_SOAP_atomInfo_otherInfo.pickle'

# -- architecture (forwarded to GNN_multiHead_noEdge) --
block, head = NNConv_block, SimplyInteraction_noEdge
dim = 512
layer1 = layer2 = 3
factor = 2
aggr = 'max'
interleave = False

# -- training --
logLoss = True   # forwarded to train_type_earlyStop_5fold
weight = 0.6     # forwarded to train_type_earlyStop_5fold
epochs = 120     # epoch budget recorded in the experiment prefix

In [None]:
# Experiment tag used in every output file name: the str() of each tunable
# setting, joined by '_'. When a value's string contains '}' (the data path
# template does), only the piece following the first '}' is kept, which drops
# the '../Data/{' part of the template.
# NOTE(review): block and head are stringified with str(), so the tag carries
# whatever text their str() produces.
prefix_parts = []
for setting in (block, head, data, dim, logLoss, weight, layer1, layer2, factor, epochs, aggr, interleave):
    setting_text = str(setting)
    if '}' in setting_text:
        setting_text = setting_text.split('}')[1]
    prefix_parts.append(setting_text)
prefix = '_'.join(prefix_parts)

In [None]:
# Load the train and test tables; later cells attach one prediction column per
# (fold, type) pair to each of them.
# `pd` is not imported explicitly in this notebook -- presumably it comes from
# one of the star imports; TODO confirm.
train_df, test_df = [pd.read_csv(csv_path) for csv_path in ('../Data/train.csv', '../Data/test.csv')]

In [None]:
# Read the five pre-split training folds. Each fold file holds a list of dicts
# and every dict is expanded into a torch_geometric Data object.
# The fold files share the data template's path with the trailing '.pickle'
# stripped and '_f<k>.pickle' appended.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
fold_stem = data.format('train').split('pickle')[0][:-1]
folds = []
for fold_idx in range(5):
    fold_path = fold_stem + '_f' + str(fold_idx) + '.pickle'
    with open(fold_path, 'rb') as handle:
        fold_records = pickle.load(handle)
    folds.append([Data(**record) for record in fold_records])

In [None]:
def _predict_to_id_map(model, state_dict, data_path, id_path, batch_size, device='cuda:0'):
    """Predict one pickled dataset with the given weights and key the results by id.

    Args:
        model: network instance. Its weights are overwritten with ``state_dict``
            and it is left in eval mode.
        state_dict: weights to load before predicting.
        data_path: pickle holding a list of dicts, each expanded into a ``Data`` object.
        id_path: pickle holding the ids aligned with the predictions
            (the object must expose ``.shape``).
        batch_size: batch size of the unshuffled loader.
        device: device each batch is moved to; the model is expected to be there already.

    Returns:
        dict mapping each id to its prediction.

    Raises:
        AssertionError: if the number of predictions differs from the number of ids.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
    with open(data_path, 'rb') as handle:
        records = pickle.load(handle)
    with open(id_path, 'rb') as handle:
        ids = pickle.load(handle)
    # shuffle=False keeps predictions in the same order as the id file.
    loader = DataLoader([Data(**record) for record in records], batch_size, shuffle=False)

    model.load_state_dict(state_dict)
    model.eval()
    batch_preds = []
    with torch.no_grad():
        for batch in loader:
            # The two positional flags are passed exactly as before; their meaning
            # is defined by the model class, which is not visible in this notebook.
            batch_preds.append(model(batch.to(device), False, True))
    yhat = torch.cat(batch_preds).cpu().detach().numpy()

    assert yhat.shape[0]==ids.shape[0],'yhat and test_id should have same shape'
    return dict(zip(ids, yhat))


# 5-fold cross-validation: for each fold, train on the other folds, save the
# best per-type weights, then write out-of-fold and test predictions into
# train_df / test_df as columns named 'fold<i>_type<t>'.
device = 'cuda:0'
n_types = 8
# Data template path with the trailing '.pickle' stripped; the per-type files
# are derived from these stems.
train_stem = data.format('train').split('pickle')[0][:-1]
test_stem = data.format('test').split('pickle')[0][:-1]

for i in range(len(folds)):
    print('\nstart fold '+str(i))
    # prepare data: fold i is held out for validation, the rest is training data
    val_list = list(folds[i])
    train_list = [sample for j, fold in enumerate(folds) if j != i for sample in fold]

    train_dl = DataLoader(train_list,batch_size,shuffle=True)
    val_dl = DataLoader(val_list,batch_size,shuffle=False)

    # train model (a fresh model, optimizer and scheduler per fold)
    model = GNN_multiHead_noEdge(reuse,block,head,head_mol,head_atom,head_edge,\
                          dim,layer1,layer2,factor,**data_dict[data],aggr=aggr,interleave=interleave).to(device)
    paras = trainable_parameter(model)
    opt = Adam(paras,lr=lr)
    scheduler = ReduceLROnPlateau(opt, 'min',factor=0.5,patience=5,min_lr=1e-05)

    # Uses the configured `epochs`. The previous code passed `epochs_type`, a
    # name that no cell of this notebook defines, so a fresh-kernel run could
    # only work through leaked state.
    model,train_loss_perType,val_loss_perType,bestWeight = train_type_earlyStop_5fold(opt,model,epochs,train_dl,val_dl,paras,clip,\
                                                                    scheduler=scheduler,logLoss=logLoss,weight=weight,patience=8)
    # bestWeight is indexed by type below, each entry being a loadable state dict.
    torch.save({'model_state_dict_type_'+str(j_):w for j_,w in enumerate(bestWeight)},\
                '../Model/'+prefix+'_fold'+str(i)+'.tar')

    # predict oof for each type (held-out fold i, using that type's best weights)
    for type_i in range(n_types):
        oof_stem = train_stem+'_f'+str(i)+'_type_'+str(type_i)
        oof_by_id = _predict_to_id_map(model, bestWeight[type_i],
                                       oof_stem+'.pickle', oof_stem+'_id.pickle',
                                       batch_size, device)
        train_df['fold'+str(i)+'_type'+str(type_i)] = train_df.id.map(oof_by_id)

    # predict test for each type with the same per-type weights
    for type_i in range(n_types):
        test_by_id = _predict_to_id_map(model, bestWeight[type_i],
                                        test_stem+'_type_'+str(type_i)+'.pickle',
                                        test_stem+'_id_type_'+str(type_i)+'.pickle',
                                        batch_size, device)
        test_df['fold'+str(i)+'_type'+str(type_i)] = test_df.id.map(test_by_id)

In [None]:
def _mean_fold_prediction(frame):
    """Row-wise NaN-ignoring mean over the 'fold<i>_type<t>' columns of ``frame``.

    The prediction columns are selected by name rather than by position, so the
    result does not depend on how many columns the source CSV has and does not
    change when this cell is re-run after 'yhat' has been added.

    Args:
        frame: DataFrame carrying the per-fold / per-type prediction columns.

    Returns:
        1-D array with one averaged prediction per row (NaN where every
        prediction column is NaN).

    Raises:
        ValueError: if ``frame`` has no prediction columns.
    """
    pred_cols = [col for col in frame.columns
                 if str(col).startswith('fold') and '_type' in str(col)]
    if not pred_cols:
        raise ValueError('no fold*_type* prediction columns found; run the training cell first')
    return np.nanmean(frame[pred_cols], 1)


# The disabled sanity checks of the original cell expected 7*5 null prediction
# columns per test row and all but one null per train row.

# Test: average the available fold predictions of each row, then save.
# The output path has no file extension; it is kept as-is for compatibility.
test_df['yhat'] = _mean_fold_prediction(test_df)
test_df.to_csv('../Data/test_oof_'+prefix,index=False)

# Train: the same row-wise average over the out-of-fold prediction columns.
train_df['yhat'] = _mean_fold_prediction(train_df)
train_df.to_csv('../Data/train_oof_'+prefix,index=False)