### training scheme
- (1) train denoising auto encoder model using all data including train and test data
- (2) from the weights of denoising auto encoder model, finetune to predict targets such as reactivity

### rough network architecture
- inputs -> conv1ds -> aggregation of neighborhoods -> multi head attention -> aggregation of neighborhoods -> multi head attention -> conv1d -> predict
- this architecture was inspired by https://www.kaggle.com/cpmpml/graph-transfomer


In [1]:
pretrain_dir = None # model dir for resuming training. if None, train from scrach

one_fold = False # if True, train model at only first fold. use if you try a new idea quickly.
run_test = False # if True, use small data. you can check whether this code run or not
denoise = True # if True, use train data whose signal_to_noise > 1

ae_epochs = 20 # epoch of training of denoising auto encoder
ae_epochs_each = 5 # epoch of training of denoising auto encoder each time. 
                   # I use train data (seqlen = 107) and private test data (seqlen = 130) for auto encoder training.
                   # I dont know how to easily fit keras model to use both of different shape data simultaneously, 
                   # so I call fit function several times. 
ae_batch_size = 32

# epochs_list = [30, 10, 3, 3, 5, 5]
epochs_list = [30, 10, 5, 5, 8, 8]
batch_size_list = [8, 16, 32, 64, 128, 256] 

## copy pretrain model to working dir
import shutil
import glob
import ast
if pretrain_dir is not None:
    for d in glob.glob(pretrain_dir + "*"):
        shutil.copy(d, ".")
    
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import os
import matplotlib.pyplot as plt
%matplotlib inline

## load

In [2]:
aug_df = pd.read_csv('../OpenVaccine/aug_data1.csv')
def aug_data(df):
    target_df = df.copy()
    new_df = aug_df[aug_df['id'].isin(target_df['id'])]
                         
    del target_df['structure']
    del target_df['predicted_loop_type']
    new_df = new_df.merge(target_df, on=['id','sequence'], how='left')

#     df['cnt'] = df['id'].map(new_df[['id','cnt']].set_index('id').to_dict()['cnt'])
#     df['log_gamma'] = 100
#     df['score'] = 1.0
    df = df.append(new_df[df.columns])
    return df

In [3]:
pseudo_df = pd.read_csv('../OpenVaccine/pseudo_test.csv')

pseudo_df['reactivity'] = pseudo_df['reactivity'].apply(lambda x: ast.literal_eval(x))
pseudo_df['deg_Mg_pH10'] = pseudo_df['deg_Mg_pH10'].apply(lambda x: ast.literal_eval(x))
pseudo_df['deg_pH10'] = pseudo_df['deg_pH10'].apply(lambda x: ast.literal_eval(x))
pseudo_df['deg_Mg_50C'] = pseudo_df['deg_Mg_50C'].apply(lambda x: ast.literal_eval(x))
pseudo_df['deg_50C'] = pseudo_df['deg_50C'].apply(lambda x: ast.literal_eval(x))

In [4]:
pseudo_st = pseudo_df[pseudo_df['seq_length'] == 107]
pseudo_lg = pseudo_df[pseudo_df['seq_length'] == 130]

In [5]:
pseudo_st['reactivity'] = pseudo_st['reactivity'].apply(lambda x: x[:68])
pseudo_st['deg_Mg_pH10'] = pseudo_st['deg_Mg_pH10'].apply(lambda x: x[:68])
pseudo_st['deg_pH10'] = pseudo_st['deg_pH10'].apply(lambda x: x[:68])
pseudo_st['deg_Mg_50C'] = pseudo_st['deg_Mg_50C'].apply(lambda x: x[:68])
pseudo_st['deg_50C'] = pseudo_st['deg_50C'].apply(lambda x: x[:68])

pseudo_lg['reactivity'] = pseudo_lg['reactivity'].apply(lambda x: x[:91])
pseudo_lg['deg_Mg_pH10'] = pseudo_lg['deg_Mg_pH10'].apply(lambda x: x[:91])
pseudo_lg['deg_pH10'] = pseudo_lg['deg_pH10'].apply(lambda x: x[:91])
pseudo_lg['deg_Mg_50C'] = pseudo_lg['deg_Mg_50C'].apply(lambda x: x[:91])
pseudo_lg['deg_50C'] = pseudo_lg['deg_50C'].apply(lambda x: x[:91])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [6]:
pseudo_lg.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,seq_length,seq_scored,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
1,1,id_000ae4237,GGAAACGGGUUCCGCGGAUUGCUGCUAAUAAGAGUAAUCUCUAAAU...,.....((((..((((((...(((((.....((((....)))).......,EEEEESSSSIISSSSSSIIISSSSSIIIIISSSSHHHHSSSSIIII...,130,91,"[0.5988852160599594, 1.252383497027623, 1.1766...","[0.5802506685595211, 1.9965426237137536, 0.765...","[1.6855349656016847, 1.9856331898502928, 0.841...","[0.4643303444983725, 1.7353359778870612, 0.980...","[0.5961263638596626, 1.330144615692037, 0.9792..."
4,4,id_0020473f7,GGAAACCCGCCCGCGCCCGCCCGCGCUGCUGCCGUGCCUCCUCUCC...,.....(((((((((((((((((((((((((((((((((((((((((...,EEEEESSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS...,130,91,"[0.751697894404419, 1.8868896799083001, 1.3745...","[0.7895877059930309, 2.8275117665075093, 0.935...","[1.467933834294549, 2.6756719629474697, 1.0947...","[0.5991996450286899, 2.477975121643563, 1.2246...","[0.5419860902101901, 1.5332587823832124, 1.105..."
5,5,id_002852873,GGAAAGCGAAACGCCGAGAAGACGUAGUUCGCAGAGCGGCGUACCC...,.....(((...(((......(((((((((.((....(((.....))...,EEEEESSSIIISSSBBBBBBSSSSSSSSSBSSBBBBSSSHHHHHSS...,130,91,"[0.5125200536210608, 1.19156364060605, 1.03744...","[0.5863812436307109, 1.434125824290471, 1.0608...","[1.334704451012295, 1.6699436537913352, 1.0838...","[0.5100052155797521, 1.4584923193610004, 1.439...","[0.5048366482863921, 1.1946691243009842, 0.988..."
6,6,id_0031191b7,GGAAAUGUCUACAUAGGAGUGCUGCGGGACGGUAACGUCAUGACCG...,........(((((((((((.((..(((((((....))))....)))...,EEEEEEEESSSSSSSSSSSISSIISSSSSSSHHHHSSSSBBBBSSS...,130,91,"[0.731134934164374, 1.9836055225668456, 1.5687...","[0.6598995646228629, 2.707940415445323, 0.5863...","[1.7225471233757963, 3.3154216130008005, 0.689...","[0.5218093358218737, 2.899379728487696, 0.7624...","[0.6478372665491631, 2.319938174531894, 0.7588..."
7,7,id_003ab2445,GGAAAGACUCAGAGGUGAAGGUCAUCACGGCUGAUAGGAGACUAUC...,.....(((((..........((((((.((.(((((((....)))))...,EEEEESSSSSBBBBBBBBBBSSSSSSISSISSSSSSSHHHHSSSSS...,130,91,"[0.7291145234939289, 1.842581802194789, 1.5406...","[0.8392257920695343, 2.091502298209419, 1.5503...","[1.9666465903097312, 2.4865346681802207, 1.302...","[0.6607922392751232, 1.9739020146403017, 1.861...","[0.6407883042621931, 1.4538999646879156, 1.397..."


In [7]:
import json
import glob
from tqdm.notebook import tqdm

train = pd.read_json("../OpenVaccine/train.json",lines=True)
train = aug_data(train)
if denoise:
    train = train[train.signal_to_noise > 1].reset_index(drop = True)
    
train = train.append(pseudo_st)
train_lg = pseudo_lg
test  = pd.read_json("../OpenVaccine/test.json",lines=True)
test = aug_data(test)

test_pub = test[test["seq_length"] == 107]
test_pri = test[test["seq_length"] == 130]
sub = pd.read_csv("../OpenVaccine/sample_submission.csv")

if run_test: ## to test 
    train = train[:30]
    test_pub = test_pub[:30]
    test_pri = test_pri[:30]

As = []
for id in tqdm(train["id"]):
    a = np.load(f"../OpenVaccine/bpps/{id}.npy")
    As.append(a)
As = np.array(As)

As_lg = []
for id in tqdm(train_lg["id"]):
    a = np.load(f"../OpenVaccine/bpps/{id}.npy")
    As_lg.append(a)
As_lg = np.array(As_lg)

As_pub = []
for id in tqdm(test_pub["id"]):
    a = np.load(f"../OpenVaccine/bpps/{id}.npy")
    As_pub.append(a)
As_pub = np.array(As_pub)

As_pri = []
for id in tqdm(test_pri["id"]):
    a = np.load(f"../OpenVaccine/bpps/{id}.npy")
    As_pri.append(a)
As_pri = np.array(As_pri)

HBox(children=(FloatProgress(value=0.0, max=4821.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3005.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1258.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6010.0), HTML(value='')))




## target

In [8]:
targets = list(sub.columns[1:])
print(targets)

y_train = []
seq_len = train["seq_length"].iloc[0]
seq_len_target = train["seq_scored"].iloc[0]
ignore = -10000
ignore_length = seq_len - seq_len_target
for target in targets:
    y_tmp = np.vstack(train[target])
    dummy = np.zeros([y_tmp.shape[0], ignore_length]) + ignore
    y_tmp = np.hstack([y_tmp, dummy])
    y_train.append(y_tmp)
y = np.stack(y_train, axis = 2)
print(y.shape)

y_train = []
seq_len = train_lg["seq_length"].iloc[0]
seq_len_target = train_lg["seq_scored"].iloc[0]
ignore = -10000
ignore_length = seq_len - seq_len_target
for target in targets:
    y_tmp = np.vstack(train_lg[target])
    dummy = np.zeros([y_tmp.shape[0], ignore_length]) + ignore
    y_tmp = np.hstack([y_tmp, dummy])
    y_train.append(y_tmp)
y_lg = np.stack(y_train, axis = 2)
print(y_lg.shape)

['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
(4821, 107, 5)
(3005, 130, 5)


## structure adj

In [9]:
def get_structure_adj(train):
    ## get adjacent matrix from structure sequence
    
    ## here I calculate adjacent matrix of each base pair, 
    ## but eventually ignore difference of base pair and integrate into one matrix
    Ss = []
    for i in tqdm(range(len(train))):
        seq_length = train["seq_length"].iloc[i]
        structure = train["structure"].iloc[i]
        sequence = train["sequence"].iloc[i]

        cue = []
        a_structures = {
            ("A", "U") : np.zeros([seq_length, seq_length]),
            ("C", "G") : np.zeros([seq_length, seq_length]),
            ("U", "G") : np.zeros([seq_length, seq_length]),
            ("U", "A") : np.zeros([seq_length, seq_length]),
            ("G", "C") : np.zeros([seq_length, seq_length]),
            ("G", "U") : np.zeros([seq_length, seq_length]),
        }
        a_structure = np.zeros([seq_length, seq_length])
        for i in range(seq_length):
            if structure[i] == "(":
                cue.append(i)
            elif structure[i] == ")":
                start = cue.pop()
#                 a_structure[start, i] = 1
#                 a_structure[i, start] = 1
                a_structures[(sequence[start], sequence[i])][start, i] = 1
                a_structures[(sequence[i], sequence[start])][i, start] = 1
        
        a_strc = np.stack([a for a in a_structures.values()], axis = 2)
        a_strc = np.sum(a_strc, axis = 2, keepdims = True)
        Ss.append(a_strc)
    
    Ss = np.array(Ss)
    print(Ss.shape)
    return Ss
Ss = get_structure_adj(train)
Ss_lg = get_structure_adj(train_lg)
Ss_pub = get_structure_adj(test_pub)
Ss_pri = get_structure_adj(test_pri)

HBox(children=(FloatProgress(value=0.0, max=4821.0), HTML(value='')))


(4821, 107, 107, 1)


HBox(children=(FloatProgress(value=0.0, max=3005.0), HTML(value='')))


(3005, 130, 130, 1)


HBox(children=(FloatProgress(value=0.0, max=1258.0), HTML(value='')))


(1258, 107, 107, 1)


HBox(children=(FloatProgress(value=0.0, max=6010.0), HTML(value='')))


(6010, 130, 130, 1)


## distance adj

In [10]:
def get_distance_matrix(As):
    ## adjacent matrix based on distance on the sequence
    ## D[i, j] = 1 / (abs(i - j) + 1) ** pow, pow = 1, 2, 4
    
    idx = np.arange(As.shape[1])
    Ds = []
    for i in range(len(idx)):
        d = np.abs(idx[i] - idx)
        Ds.append(d)

    Ds = np.array(Ds) + 1
    Ds = 1/Ds
    Ds = Ds[None, :,:]
    Ds = np.repeat(Ds, len(As), axis = 0)
    
    Dss = []
    for i in [1, 2, 4]: 
        Dss.append(Ds ** i)
    Ds = np.stack(Dss, axis = 3)
    print(Ds.shape)
    return Ds

Ds = get_distance_matrix(As)
Ds_lg = get_distance_matrix(As_lg)
Ds_pub = get_distance_matrix(As_pub)
Ds_pri = get_distance_matrix(As_pri)

(4821, 107, 107, 3)
(3005, 130, 130, 3)
(1258, 107, 107, 3)
(6010, 130, 130, 3)


In [11]:
## concat adjecent
As = np.concatenate([As[:,:,:,None], Ss, Ds], axis = 3).astype(np.float32)
As_lg = np.concatenate([As_lg[:,:,:,None], Ss_lg, Ds_lg], axis = 3).astype(np.float32)

As_pub = np.concatenate([As_pub[:,:,:,None], Ss_pub, Ds_pub], axis = 3).astype(np.float32)
As_pri = np.concatenate([As_pri[:,:,:,None], Ss_pri, Ds_pri], axis = 3).astype(np.float32)
del Ss, Ds, Ss_lg, Ds_lg, Ss_pub, Ds_pub, Ss_pri, Ds_pri
As.shape, As_lg.shape, As_pub.shape, As_pri.shape

((4821, 107, 107, 5),
 (3005, 130, 130, 5),
 (1258, 107, 107, 5),
 (6010, 130, 130, 5))

## node

In [12]:
## sequence
def return_ohe(n, i):
    tmp = [0] * n
    tmp[i] = 1
    return tmp

def get_input(train):
    ## get node features, which is one hot encoded
    mapping = {}
    vocab = ["A", "G", "C", "U"]
    for i, s in enumerate(vocab):
        mapping[s] = return_ohe(len(vocab), i)
    X_node = np.stack(train["sequence"].apply(lambda x : list(map(lambda y : mapping[y], list(x)))))

    mapping = {}
    vocab = ["S", "M", "I", "B", "H", "E", "X"]
    for i, s in enumerate(vocab):
        mapping[s] = return_ohe(len(vocab), i)
    X_loop = np.stack(train["predicted_loop_type"].apply(lambda x : list(map(lambda y : mapping[y], list(x)))))
    
    mapping = {}
    vocab = [".", "(", ")"]
    for i, s in enumerate(vocab):
        mapping[s] = return_ohe(len(vocab), i)
    X_structure = np.stack(train["structure"].apply(lambda x : list(map(lambda y : mapping[y], list(x)))))
    
    
    X_node = np.concatenate([X_node, X_loop], axis = 2)
    
    ## interaction
    a = np.sum(X_node * (2 ** np.arange(X_node.shape[2])[None, None, :]), axis = 2)
    vocab = sorted(set(a.flatten()))
    print(vocab)
    ohes = []
    for v in vocab:
        ohes.append(a == v)
    ohes = np.stack(ohes, axis = 2)
    X_node = np.concatenate([X_node, ohes], axis = 2).astype(np.float32)
    
    
    print(X_node.shape)
    return X_node

X_node = get_input(train)
X_node_lg = get_input(train_lg)
X_node_pub = get_input(test_pub)
X_node_pri = get_input(test_pri)

[17, 18, 20, 24, 33, 34, 36, 40, 65, 66, 68, 72, 129, 130, 132, 136, 257, 258, 260, 264, 513, 514, 516, 520, 1025, 1026, 1028, 1032]
(4821, 107, 39)
[17, 18, 20, 24, 33, 34, 36, 40, 65, 66, 68, 72, 129, 130, 132, 136, 257, 258, 260, 264, 513, 514, 516, 520, 1025, 1026, 1028, 1032]
(3005, 130, 39)
[17, 18, 20, 24, 33, 34, 36, 40, 65, 66, 68, 72, 129, 130, 132, 136, 257, 258, 260, 264, 513, 514, 516, 520, 1025, 1026, 1028, 1032]
(1258, 107, 39)
[17, 18, 20, 24, 33, 34, 36, 40, 65, 66, 68, 72, 129, 130, 132, 136, 257, 258, 260, 264, 513, 514, 516, 520, 1025, 1026, 1028, 1032]
(6010, 130, 39)


## model

In [13]:
import tensorflow as tf
from tensorflow.keras import layers as L
import tensorflow_addons as tfa
from tensorflow.keras import backend as K

def mcrmse(t, p):
    ## calculate mcrmse score by using numpy
    if t.shape[1] == 107:
        seq_len_target = 68
    elif t.shape[1] == 130:
        seq_len_target = 91
        
    t = t[:, :seq_len_target]
    p = p[:, :seq_len_target]
    
    score = np.mean(np.sqrt(np.mean(np.mean((p - t) ** 2, axis = 1), axis = 0)))
    return score

def mcrmse_loss(t, y):
    if t.shape[1] == 107:
        seq_len_target = 68
    elif t.shape[1] == 130:
        seq_len_target = 91
        
    ## calculate mcrmse score by using tf
    t = t[:, :seq_len_target]
    y = y[:, :seq_len_target]
    
    loss = tf.reduce_mean(tf.sqrt(tf.reduce_mean(tf.reduce_mean((t - y) ** 2, axis = 1), axis = 0)))
    return loss

def attention(x_inner, x_outer, n_factor, dropout):
    x_Q =  L.Conv1D(n_factor, 1, activation='linear', 
                  kernel_initializer='glorot_uniform',
                  bias_initializer='glorot_uniform',
                 )(x_inner)
    x_K =  L.Conv1D(n_factor, 1, activation='linear', 
                  kernel_initializer='glorot_uniform',
                  bias_initializer='glorot_uniform',
                 )(x_outer)
    x_V =  L.Conv1D(n_factor, 1, activation='linear', 
                  kernel_initializer='glorot_uniform',
                  bias_initializer='glorot_uniform',
                 )(x_outer)
    x_KT = L.Permute((2, 1))(x_K)
    res = L.Lambda(lambda c: K.batch_dot(c[0], c[1]) / np.sqrt(n_factor))([x_Q, x_KT])
#     res = tf.expand_dims(res, axis = 3)
#     res = L.Conv2D(16, 3, 1, padding = "same", activation = "relu")(res)
#     res = L.Conv2D(1, 3, 1, padding = "same", activation = "relu")(res)
#     res = tf.squeeze(res, axis = 3)
    att = L.Lambda(lambda c: K.softmax(c, axis=-1))(res)
    att = L.Lambda(lambda c: K.batch_dot(c[0], c[1]))([att, x_V])
    return att

def multi_head_attention(x, y, n_factor, n_head, dropout):
    if n_head == 1:
        att = attention(x, y, n_factor, dropout)
    else:
        n_factor_head = n_factor // n_head
        heads = [attention(x, y, n_factor_head, dropout) for i in range(n_head)]
        att = L.Concatenate()(heads)
        att = L.Dense(n_factor, 
                      kernel_initializer='glorot_uniform',
                      bias_initializer='glorot_uniform',
                     )(att)
    x = L.Add()([x, att])
    x = L.LayerNormalization()(x)
    if dropout > 0:
        x = L.Dropout(dropout)(x)
    return x

def res(x, unit, kernel = 3, rate = 0.1):
    h = L.Conv1D(unit, kernel, 1, padding = "same", activation = None)(x)
    h = L.LayerNormalization()(h)
    h = L.LeakyReLU()(h)
    h = L.Dropout(rate)(h)
    return L.Add()([x, h])

def forward(x, unit, kernel = 3, rate = 0.1):
#     h = L.Dense(unit, None)(x)
    h = L.Conv1D(unit, kernel, 1, padding = "same", activation = None)(x)
    h = L.LayerNormalization()(h)
    h = L.Dropout(rate)(h)
#         h = tf.keras.activations.swish(h)
    h = L.LeakyReLU()(h)
    h = res(h, unit, kernel, rate)
    return h

def adj_attn(x, adj, unit, n = 2, rate = 0.1):
    x_a = x
    x_as = []
    for i in range(n):
        x_a = forward(x_a, unit)
        x_a = tf.matmul(adj, x_a) ## aggregate neighborhoods
        x_as.append(x_a)
    if n == 1:
        x_a = x_as[0]
    else:
        x_a = L.Concatenate()(x_as)
    x_a = forward(x_a, unit)
    return x_a


def get_base(config):
    ## base model architecture 
    ## node, adj -> middle feature
    
    node = tf.keras.Input(shape = (None, X_node.shape[2]), name = "node")
    adj = tf.keras.Input(shape = (None, None, As.shape[3]), name = "adj")
    
    adj_learned = L.Dense(1, "relu")(adj)
    adj_all = L.Concatenate(axis = 3)([adj, adj_learned])
        
    xs = []
    xs.append(node)
    x1 = forward(node, 128, kernel = 3, rate = 0.0)
    x2 = forward(x1, 64, kernel = 6, rate = 0.0)
    x3 = forward(x2, 32, kernel = 15, rate = 0.0)
    x4 = forward(x3, 16, kernel = 30, rate = 0.0)
    x = L.Concatenate()([x1, x2, x3, x4])
    
    for unit in [64, 32]:
        x_as = []
        for i in range(adj_all.shape[3]):
            x_a = adj_attn(x, adj_all[:, :, :, i], unit, rate = 0.0)
            x_as.append(x_a)
        x_c = forward(x, unit, kernel = 30)
        x_c = forward(x_c, unit*2, kernel = 15)
        x_c = forward(x_c, unit*3, kernel = 6)
        
        x = L.Concatenate()(x_as + [x_c])
        x = forward(x, unit)
        x = multi_head_attention(x, x, unit, 4, 0.0)
        xs.append(x)
        
    x = L.Concatenate()(xs)

    model = tf.keras.Model(inputs = [node, adj], outputs = [x])
    return model


def get_ae_model(base, config):
    ## denoising auto encoder part
    ## node, adj -> middle feature -> node
    
    node = tf.keras.Input(shape = (None, X_node.shape[2]), name = "node")
    adj = tf.keras.Input(shape = (None, None, As.shape[3]), name = "adj")

    x = base([L.SpatialDropout1D(0.3)(node), adj])
    x = forward(x, 64, rate = 0.3)
    p = L.Dense(X_node.shape[2], "sigmoid")(x)
    
    loss = - tf.reduce_mean(20 * node * tf.math.log(p + 1e-4) + (1 - node) * tf.math.log(1 - p + 1e-4))
    model = tf.keras.Model(inputs = [node, adj], outputs = [loss])
    
    opt = get_optimizer()
    model.compile(optimizer = opt, loss = lambda t, y : y)
    return model


def get_model(base, config):
    ## regression part
    ## node, adj -> middle feature -> prediction of targets
    
    node = tf.keras.Input(shape = (None, X_node.shape[2]), name = "node")
    adj = tf.keras.Input(shape = (None, None, As.shape[3]), name = "adj")
    
    x = base([node, adj])
    x = forward(x, 128, rate = 0.4)
    x = L.Dense(5, None)(x)

    model = tf.keras.Model(inputs = [node, adj], outputs = [x])
    
    opt = get_optimizer()
    model.compile(optimizer = opt, loss = mcrmse_loss)
    return model

def get_optimizer():
#     sgd = tf.keras.optimizers.SGD(0.05, momentum = 0.9, nesterov=True)
    adam = tf.optimizers.Adam()
#     radam = tfa.optimizers.RectifiedAdam()
#     lookahead = tfa.optimizers.Lookahead(adam, sync_period=6)
#     swa = tfa.optimizers.SWA(adam)
    return adam

## pretrain

In [14]:
## here train denoising auto encoder model using all data

config = {} ## not use now
if ae_epochs > 0:
    base = get_base(config)
    ae_model = get_ae_model(base, config)
    ## TODO : simultaneous train
    for i in range(ae_epochs//ae_epochs_each):
        print(f"------ {i} ------")
        print("--- train ---")
        ae_model.fit([X_node, As], [X_node[:,0]],
                  epochs = ae_epochs_each,
                  batch_size = ae_batch_size)
        print("--- train long ---")
        ae_model.fit([X_node_lg, As_lg], [X_node_lg[:,0]],
                  epochs = ae_epochs_each,
                  batch_size = ae_batch_size)
        print("--- public ---")
        ae_model.fit([X_node_pub, As_pub], [X_node_pub[:,0]],
                  epochs = ae_epochs_each,
                  batch_size = ae_batch_size)
        print("--- private ---")
        ae_model.fit([X_node_pri, As_pri], [X_node_pri[:,0]],
                  epochs = ae_epochs_each,
                  batch_size = ae_batch_size)
        gc.collect()
    print("****** save ae model ******")
    base.save_weights("./base_ae")

------ 0 ------
--- train ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- train long ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- public ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- private ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------ 1 ------
--- train ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- train long ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- public ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- private ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------ 2 ------
--- train ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- train long ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- public ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- private ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------ 3 ------
--- train ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- train long ---
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- public 

## train

In [16]:
## here train regression model from pretrain auto encoder model

from sklearn.model_selection import KFold, GroupKFold
# kfold = KFold(5, shuffle = True, random_state = 42)
kfold = GroupKFold(5)

scores = []
id_list = np.empty([len(X_node)], dtype=object)
preds = np.zeros([len(X_node), X_node.shape[1], 5])
y_label = np.zeros([len(X_node), X_node.shape[1], 5])

id_list_lg = np.empty([len(X_node_lg)], dtype=object)
preds_lg = np.zeros([len(X_node_lg), X_node_lg.shape[1], 5])
y_label_lg = np.zeros([len(X_node_lg), X_node_lg.shape[1], 5])

for i, ((tr_idx, va_idx), ((tr_idx_lg, va_idx_lg))) in enumerate(zip(kfold.split(X_node, As, train['id']), 
                                         kfold.split(X_node_lg, As_lg, train_lg['id']))):
    print(f"------ fold {i} start -----")
    print(f"------ fold {i} start -----")
    print(f"------ fold {i} start -----")
    X_node_tr = X_node[tr_idx]
    X_node_va = X_node[va_idx]
    As_tr = As[tr_idx]
    As_va = As[va_idx]
    y_tr = y[tr_idx]
    y_va = y[va_idx]
    
    X_node_tr_lg = X_node_lg[tr_idx_lg]
    X_node_va_lg = X_node_lg[va_idx_lg]
    As_tr_lg = As_lg[tr_idx_lg]
    As_va_lg = As_lg[va_idx_lg]
    y_tr_lg = y_lg[tr_idx_lg]
    y_va_lg = y_lg[va_idx_lg]
    
    base = get_base(config)
    if ae_epochs > 0:
        print("****** load ae model ******")
        base.load_weights("./base_ae")
    model = get_model(base, config)
    if pretrain_dir is not None:
        d = f"./model{i}"
        print(f"--- load from {d} ---")
        model.load_weights(d)
    for epochs, batch_size in zip(epochs_list, batch_size_list):
        print(f"epochs : {epochs}, batch_size : {batch_size}")
        for epoch in range(epochs):
            model.fit([X_node_tr_lg, As_tr_lg], [y_tr_lg],
                      validation_data=([X_node_va_lg, As_va_lg], [y_va_lg]),
                      epochs = 1,
                      batch_size = batch_size, validation_freq = 3)
            
            model.fit([X_node_tr, As_tr], [y_tr],
                      validation_data=([X_node_va, As_va], [y_va]),
                      epochs = 1,
                      batch_size = batch_size, validation_freq = 3)
            
        
#         model.fit([X_node_tr_lg, As_tr_lg], [y_tr_lg],
#                   validation_data=([X_node_va_lg, As_va_lg], [y_va_lg]),
#                   epochs = epochs,
#                   batch_size = batch_size, validation_freq = 3)
        
#         model.fit([X_node_tr, As_tr], [y_tr],
#                   validation_data=([X_node_va, As_va], [y_va]),
#                   epochs = epochs,
#                   batch_size = batch_size, validation_freq = 3)
        
        
        
        
    model.save_weights(f"./model{i}")
    p = model.predict([X_node_va, As_va])
    scores.append(mcrmse(y_va, p))
    p_lg = model.predict([X_node_va_lg, As_va_lg])
    scores.append(mcrmse(y_va_lg, p_lg))
    print(f"fold {i}: mcrmse {scores[-2]} mcrmse pseudo {scores[-1]}")
    
    id_list[va_idx] = train.iloc[va_idx]["id"].tolist()
    preds[va_idx] = p
    y_label[va_idx] = y_va
    
    id_list_lg[va_idx_lg] = train_lg.iloc[va_idx_lg]["id"].tolist()
    preds_lg[va_idx_lg] = p_lg
    y_label_lg[va_idx_lg] = y_va_lg
    if one_fold:
        break
        
# pd.to_pickle(preds, "oof.pkl")

------ fold 0 start -----
------ fold 0 start -----
------ fold 0 start -----
****** load ae model ******
epochs : 30, batch_size : 8
epochs : 10, batch_size : 16
epochs : 5, batch_size : 32
epochs : 5, batch_size : 64
epochs : 8, batch_size : 128
epochs : 8, batch_size : 256
fold 0: mcrmse 0.20758395855145823 mcrmse pseudo 0.08172962266083396
------ fold 1 start -----
------ fold 1 start -----
------ fold 1 start -----
****** load ae model ******
epochs : 30, batch_size : 8
epochs : 10, batch_size : 16
epochs : 5, batch_size : 32


epochs : 5, batch_size : 64
epochs : 8, batch_size : 128
epochs : 8, batch_size : 256
fold 1: mcrmse 0.2098277338683968 mcrmse pseudo 0.07109895841693822
------ fold 2 start -----
------ fold 2 start -----
------ fold 2 start -----
****** load ae model ******
epochs : 30, batch_size : 8
epochs : 10, batch_size : 16
epochs : 5, batch_size : 32
epochs : 5, batch_size : 64
epochs : 8, batch_size : 128
epochs : 8, batch_size : 256
fold 2: mcrmse 0.2049665298547329 mcrmse pseudo 0.0754384404091124
------ fold 3 start -----
------ fold 3 start -----
------ fold 3 start -----
****** load ae model ******
epochs : 30, batch_size : 8


epochs : 10, batch_size : 16
epochs : 5, batch_size : 32
epochs : 5, batch_size : 64
epochs : 8, batch_size : 128
epochs : 8, batch_size : 256
fold 3: mcrmse 0.2144020875978773 mcrmse pseudo 0.07377111090576698
------ fold 4 start -----
------ fold 4 start -----
------ fold 4 start -----
****** load ae model ******
epochs : 30, batch_size : 8
epochs : 10, batch_size : 16
epochs : 5, batch_size : 32
epochs : 5, batch_size : 64
epochs : 8, batch_size : 128
epochs : 8, batch_size : 256
fold 4: mcrmse 0.19943063817603557 mcrmse pseudo 0.08188090880291354


In [26]:
print(scores)
print(mcrmse(preds, y_label))
print(mcrmse(preds_lg, y_label_lg))

[0.20758395855145823, 0.08172962266083396, 0.2098277338683968, 0.07109895841693822, 0.2049665298547329, 0.0754384404091124, 0.2144020875978773, 0.07377111090576698, 0.19943063817603557, 0.08188090880291354]


ValueError: operands could not be broadcast together with shapes (4821,91,5) (6010,91,5) 

In [25]:
# 0.20734392994106435
# 0.07698997502606883

0.07698997502606883


In [18]:
valid_ls = []

for i, uid in enumerate(id_list):
    single_df = pd.DataFrame(preds[i], columns=targets)
    single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]
    
    valid_ls.append(single_df)
    
valid_df = pd.concat(valid_ls)

In [19]:
valid_df.to_csv('validation.csv', index=False)

## predict

In [20]:
p_pub = 0
p_pri = 0
for i in range(5):
    model.load_weights(f"./model{i}")
    p_pub += model.predict([X_node_pub, As_pub]) / 5
    p_pri += model.predict([X_node_pri, As_pri]) / 5
    if one_fold:
        p_pub *= 5
        p_pri *= 5
        break

for i, target in enumerate(targets):
    test_pub[target] = [list(p_pub[k, :, i]) for k in range(p_pub.shape[0])]
    test_pri[target] = [list(p_pri[k, :, i]) for k in range(p_pri.shape[0])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## sub

In [21]:
preds_ls = []
for df, preds in [(test_pub, p_pub), (test_pri, p_pri)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=targets)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_ls.append(single_df)

preds_df = pd.concat(preds_ls)
preds_df.to_csv("submission_all.csv", index = False)
preds_df.head()

Unnamed: 0,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C,id_seqpos
0,0.767675,0.623866,1.690191,0.528542,0.740489,id_00073f8be_0
1,2.272819,3.316724,3.928512,3.225837,2.627201,id_00073f8be_1
2,1.681761,0.624205,0.679305,0.742571,0.684655,id_00073f8be_2
3,1.29681,1.043877,1.129631,1.630592,1.655349,id_00073f8be_3
4,0.756671,0.54965,0.543975,0.858417,0.813196,id_00073f8be_4


In [22]:
sample_df = sub.copy()

target_cols = [c for c in sample_df.columns if c != 'id_seqpos']

list_id = list(sample_df.id_seqpos.values)
output = {}
output_df = pd.DataFrame({'id_seqpos': sample_df.id_seqpos.values})

for c in target_cols:
    output_values = []
    x = preds_df.groupby('id_seqpos')[c].mean().reset_index()
    print(x.shape)
    output_df = pd.merge(output_df, x, on='id_seqpos')

(457953, 2)
(457953, 2)
(457953, 2)
(457953, 2)
(457953, 2)


In [23]:
output_df.to_csv('submission.csv', index=False)
output_df.shape

(457953, 6)

In [24]:
print(scores)
print(np.mean(scores))

[0.20758395855145823, 0.08172962266083396, 0.2098277338683968, 0.07109895841693822, 0.2049665298547329, 0.0754384404091124, 0.2144020875978773, 0.07377111090576698, 0.19943063817603557, 0.08188090880291354]
0.14201299892440658


In [23]:
# [0.23335501739796655, 0.22334835580793352, 0.23267165843634366, 0.23045706748579886, 0.23826819059727083]
# 0.23162005794506269

# [0.23220652326610938, 0.2219243451974854, 0.23005732488150737, 0.22844312865044877, 0.23698060044541291]
# 0.22992238448819274