In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import csv

In [None]:
# Ask TensorFlow's native runtime to suppress its log output.
# NOTE(review): this variable is normally read when TensorFlow is imported,
# and the import happens in the previous cell - confirm it still takes effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Model inputs for documents annotated with "seen" labels (one row per document).
dat_seen_input = np.array(pd.read_csv('../data/input_prep_w2v_g.csv',sep = ',',header = None))

# Each target cell holds a stringified Python list such as "['A', 'B']".
temp_dat_target = pd.read_csv('../data/target_prep_g.csv', delimiter=",", index_col = 0,header = None, names=['mesh_term_code'])

# Strip the list punctuation and split on ", " to recover the labels of each
# row.  The column is traversed once instead of being copied into a new list
# on every iteration.
temp_seen_dat = [
    raw_labels.replace("[","").replace("]","").replace("'","").split(", ")
    for raw_labels in temp_dat_target['mesh_term_code']
]

In [None]:
# Learn the vocabulary of seen labels, then encode every row as a multi-hot
# indicator vector over that vocabulary.
mlb_temp = MultiLabelBinarizer().fit(temp_seen_dat)
dat_seen_target = mlb_temp.transform(temp_seen_dat)

In [None]:
# Per-label weight: the reciprocal of the fraction of rows carrying the label.
positives_per_label = dat_seen_target.sum(axis=0)
weight_seen = 1/(positives_per_label/len(temp_seen_dat))

In [None]:
# Model inputs for the evaluation set that contains labels absent from training.
dat_unseen_input = np.array(pd.read_csv('../data/input_prep_unseen_w2v_g.csv',sep = ',',header = None))

# Each target cell holds a stringified Python list such as "['A', 'B']".
temp_dat_unseen_target = pd.read_csv('../data/target_prep_unseen_g.csv', delimiter=",", index_col = 0,header = None, names=['mesh_term_code'])

# Recover the label list of every row in a single pass over the column.
unseen_dat = [
    raw_labels.replace("[","").replace("]","").replace("'","").split(", ")
    for raw_labels in temp_dat_unseen_target['mesh_term_code']
]

# Build the set of seen labels once so that each membership check is a hash
# lookup rather than a scan over the whole `classes_` array.
seen_classes = set(mlb_temp.classes_)

# Split each row's labels into those the seen binarizer does not know
# (final_unseen) and those it does (final_seen); row order is preserved.
final_unseen = []
final_seen = []
for row_labels in unseen_dat:
    final_unseen.append([label for label in row_labels if label not in seen_classes])
    final_seen.append([label for label in row_labels if label in seen_classes])

# Multi-hot encode the unseen labels with their own vocabulary.
mlb_unseen = MultiLabelBinarizer()
dat_unseen_target = mlb_unseen.fit_transform(final_unseen)

In [None]:
# Matrix fed to the `sim_matrix` placeholder when scoring unseen labels.
sim = pd.read_csv('../data/sim_mat.csv', header=None, sep=',').values

In [None]:
# Embedding table fed to the graph; rows are looked up by the ids in the inputs.
emb = pd.read_csv('../data/vector_glove.csv', header=None, sep=',').values

In [None]:
# First hold out 20% of the seen-label data as the test set ...
X_rest, X_test, y_rest, y_test = train_test_split(
    dat_seen_input, dat_seen_target, test_size=0.20, random_state=3)

# ... then carve 20% of what remains off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.20, random_state=3)

In [None]:
# Width of the integer input placeholder `X` (ids per document).
seq_len = 943
# Number of filters in each conv1d layer.
num_filter = 100
# Width of the embedding table fed through the `word2vec` placeholder.
dim_emb = 300
# Step size handed to the Adam optimizer.
learning_rate = 0.01

# Output sizes: one unit per label known to each binarizer.
num_seen_outs = len(mlb_temp.classes_)
num_unseen_outs = len(mlb_unseen.classes_)

In [None]:
def _conv_max_pool(inputs, kernel_size):
    """Apply one ReLU conv1d over `inputs`, then max-pool across all positions.

    With "VALID" padding and stride 1 the convolution yields
    ``seq_len - kernel_size + 1`` positions, so pooling with exactly that
    window leaves a single position per filter.  Deriving the window from
    `seq_len` keeps the graph valid if the sequence length is changed.

    Returns the ``(conv, pooled)`` tensors.
    """
    conv = tf.layers.conv1d(inputs=inputs, filters=num_filter, kernel_size=kernel_size,
                            padding="VALID", strides=1, activation=tf.nn.relu,
                            kernel_initializer=tf.contrib.layers.xavier_initializer())
    pooled = tf.layers.max_pooling1d(conv, pool_size=seq_len - kernel_size + 1, strides=1)
    return conv, pooled


tf.reset_default_graph()

# ---- Graph inputs -----------------------------------------------------------
X = tf.placeholder(tf.int32, [None, seq_len], name="X")
y_ = tf.placeholder(tf.float32, [None, num_seen_outs], name="Y")
y_unseen_ = tf.placeholder(tf.float32, [None, num_unseen_outs], name="Y_unseen")
# NOTE: despite its name this value is handed to tf.nn.dropout as its second
# positional argument, which in the TF1 API is the *keep* probability.
# Feed 1.0 to disable dropout.
prob_drop = tf.placeholder(tf.float32, None, name="dropout_prob")
# Threshold applied to the sigmoid scores to obtain binary predictions.
dec_lvl = tf.placeholder(tf.float32, None, name="decision_boundary")

sim_seen2unseen = tf.placeholder(tf.float32,[num_seen_outs, num_unseen_outs], name='sim_matrix')

embeddings = tf.placeholder(tf.float32, [None, dim_emb], name='word2vec')

# Replace every id in X by its row of the embedding table.
embedded_dat = tf.nn.embedding_lookup(embeddings, X)

with tf.name_scope("fmodel"):
    # Three parallel convolution branches (kernel sizes 3, 4 and 5), created in
    # the same order as before so the auto-generated layer names are unchanged.
    conv_3, pooled_3 = _conv_max_pool(embedded_dat, 3)
    conv_4, pooled_4 = _conv_max_pool(embedded_dat, 4)
    conv_5, pooled_5 = _conv_max_pool(embedded_dat, 5)

    # Concatenate the pooled branches and flatten to one feature vector per row.
    pooled_outputs = tf.concat([pooled_3, pooled_4, pooled_5],1)
    flat_layer = tf.reshape(pooled_outputs, [-1, 3 * num_filter])

    drop_out = tf.nn.dropout(flat_layer, prob_drop)

    # Logits and sigmoid scores for the seen labels.
    y_drop = tf.layers.dense(inputs=drop_out, units=num_seen_outs, name = 'drop_out')
    yy_drop = tf.nn.sigmoid(y_drop)

    # Scores for the unseen labels: seen logits projected through the
    # seen-to-unseen matrix and divided by the number of seen labels.
    # tf.divide keeps the operation inside the graph; the NumPy ufunc used
    # previously only worked through NumPy's object-array fallback.
    unseen_dense = tf.divide(tf.matmul(y_drop, sim_seen2unseen), num_seen_outs)
    sig_unseen_dense = tf.nn.sigmoid(unseen_dense)

    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = y_, logits = y_drop))
#     loss = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(targets = y_, logits = y_drop, 
#                                                                    pos_weight = weight_seen))

    opt_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    # Binary decisions are built once per label set and shared by the metrics.
    pred_seen = tf.greater(yy_drop, dec_lvl)
    pred_unseen = tf.greater(sig_unseen_dense, dec_lvl)

    # Each tf.metrics.* call returns a (value, update_op) pair backed by local
    # variables that accumulate across session runs until re-initialised.
    fn = tf.metrics.false_negatives(labels=y_, predictions=pred_seen)
    fp = tf.metrics.false_positives(labels=y_, predictions=pred_seen)
    tp = tf.metrics.true_positives(labels=y_, predictions=pred_seen)

    fn_unseen = tf.metrics.false_negatives(labels=y_unseen_, predictions=pred_unseen)
    fp_unseen = tf.metrics.false_positives(labels=y_unseen_, predictions=pred_unseen)
    tp_unseen = tf.metrics.true_positives(labels=y_unseen_, predictions=pred_unseen)

In [None]:
# Number of rows per mini-batch.
batch_size = 512
# Initialisers for the graph's global variables and for its local variables
# (tf.metrics keeps its running counters in local variables).
init = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()

# Saver used to write and restore model checkpoints.
saver = tf.train.Saver()

In [None]:
# Train with early stopping: stop after 14 epochs, or once the validation F1
# has failed to improve for 4 consecutive epochs.  The best model so far is
# checkpointed whenever the validation F1 improves.
with tf.Session() as sess:

    sess.run(init)
    sess.run(init_l)
    cur_f1 = 0   # best validation F1 seen so far
    cnt = 0      # consecutive epochs without improvement
    epoch = 0

    # Ceil division so the trailing partial validation batch is scored too.
    num_val_batches = (len(X_val) + batch_size - 1) // batch_size

    while (cnt < 4 and epoch < 14):

        print('----- Epoch', epoch, '-----')
        # Training uses full batches only (the remainder is dropped, as before).
        for batch in range(len(X_train)//batch_size):
            start = batch * batch_size
            batch_x = np.reshape(X_train[start:start + batch_size], [-1, seq_len])
            batch_y = np.reshape(y_train[start:start + batch_size], [-1, num_seen_outs])

            # One session call performs the update and reports the loss of that
            # same step, instead of paying for a second forward pass.
            _, loss_ = sess.run([opt_op, loss],
                                feed_dict={X: batch_x, y_: batch_y, embeddings : emb, prob_drop : 0.5})
            print(loss_)

        print('----- Validation', '-----')
        # tf.metrics counters live in local variables and keep accumulating;
        # reset them so this epoch's score is not mixed with earlier epochs.
        sess.run(init_l)
        fn_total = fp_total = tp_total = 0.0
        for batch in range(num_val_batches):
            start = batch * batch_size
            batch_val_x = np.reshape(X_val[start:start + batch_size], [-1, seq_len])
            batch_val_y = np.reshape(y_val[start:start + batch_size], [-1, num_seen_outs])

            # Index [1] is the update op: it returns the running total *after*
            # this batch has been counted.  Reading the value tensor ([0]) in
            # the same call has no ordering guarantee relative to the update.
            # Dropout is disabled for evaluation (keep probability 1.0).
            fn_total, fp_total, tp_total = sess.run(
                [fn[1], fp[1], tp[1]],
                feed_dict={X: batch_val_x, y_: batch_val_y, embeddings : emb,
                           prob_drop : 1.0, dec_lvl: 0.5})

        # Precision/recall over the whole validation set, guarded against 0/0.
        precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0.0
        recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0.0
        if (precision + recall) > 0:
            f1 = round(precision * recall * 2 / (precision + recall), 4)
        else:
            f1 = 0.0

        print('tf_precision_recall_f1score:', precision, recall, f1)
        if f1 > cur_f1:
            save_path = saver.save(sess, "../model/glove/model.ckpt")
            print('Save Model Success')
            cur_f1 = f1
            cnt = 0
        else:
            cnt += 1
        epoch += 1
print("End Training Session")

In [None]:
# Score the checkpointed model on the validation split with a 0.8 decision
# threshold.
saver = tf.train.Saver()
with tf.Session() as sess:
    # Not used in this cell; kept in case other cells rely on these names.
    result = []
    target = []
    temptemp = []

    # Zero the tf.metrics counters, then load the trained weights.
    sess.run(init_l)

    saver.restore(sess, "../model/glove/model.ckpt")

    # Ceil division so the trailing partial batch is scored too.
    num_val_batches = (len(X_val) + batch_size - 1) // batch_size

    fn_total = fp_total = tp_total = 0.0
    for batch in range(num_val_batches):
        start = batch * batch_size
        batch_val_x = np.reshape(X_val[start:start + batch_size], [-1, seq_len])
        batch_val_y = np.reshape(y_val[start:start + batch_size], [-1, num_seen_outs])

        # Index [1] is the update op, which returns the running total after
        # this batch has been counted; this removes the need to discard the
        # first reading.  Dropout is disabled (keep probability 1.0).
        fn_total, fp_total, tp_total = sess.run(
            [fn[1], fp[1], tp[1]],
            feed_dict={X: batch_val_x, y_: batch_val_y, embeddings : emb,
                       prob_drop : 1.0, dec_lvl: 0.8})

    # Precision/recall over the whole validation set, guarded against 0/0.
    precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0.0
    recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = round(precision * recall * 2 / (precision + recall), 4)
    else:
        f1 = 0.0

    print('tf_precision_recall_f1score:', precision, recall, f1)

In [None]:
# Score the model on the unseen labels: seen-label logits are projected
# through the similarity matrix and thresholded at 0.45.
saver = tf.train.Saver()
with tf.Session() as sess:
    # Not used in this cell; kept in case other cells rely on these names.
    result = []
    target = []
    temptemp = []

    # Zero the tf.metrics counters, then load the trained weights.
    sess.run(init_l)

    # NOTE(review): this restores from "original_weight", whereas the training
    # cell saves to "../model/glove/model.ckpt" - confirm this is intentional.
    saver.restore(sess,"../model/original_weight/model.ckpt")

    # Ceil division so the trailing partial batch is scored too.
    num_unseen_batches = (len(dat_unseen_input) + batch_size - 1) // batch_size

    fn_total = fp_total = tp_total = 0.0
    for batch in range(num_unseen_batches):
        start = batch * batch_size
        batch_val_x = np.reshape(dat_unseen_input[start:start + batch_size], [-1, seq_len])
        batch_val_y = np.reshape(dat_unseen_target[start:start + batch_size], [-1, num_unseen_outs])

        # Index [1] is the update op, which returns the running total after
        # this batch has been counted; this removes the need to discard the
        # first reading.  Dropout is disabled (keep probability 1.0).
        fn_total, fp_total, tp_total = sess.run(
            [fn_unseen[1], fp_unseen[1], tp_unseen[1]],
            feed_dict={X: batch_val_x, y_unseen_: batch_val_y, embeddings : emb,
                       prob_drop : 1.0, dec_lvl: 0.45, sim_seen2unseen: sim})

    # Precision/recall over the whole unseen set, guarded against 0/0.
    precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0.0
    recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = round(precision * recall * 2 / (precision + recall), 4)
    else:
        f1 = 0.0

    print('tf_precision_recall_f1score:', precision, recall, f1)