## Bert 

In [48]:
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
import numpy as np
from bert import tokenization
from bert import bert_tokenization
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model

## GPU settings

In [49]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [50]:
tf.config.experimental.set_visible_devices(gpus[1], 'GPU')

## Data preprocessing

In [51]:
max_seq_length = 300

In [52]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [53]:
def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [54]:
def make_id(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    #input_masks = get_masks(stokens, max_seq_length)
    #input_segments = get_segments(stokens, max_seq_length)
    #pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    return input_ids

def make_mask(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    #input_ids = get_ids(stokens, tokenizer, max_seq_length)
    input_masks = get_masks(stokens, max_seq_length)
    #input_segments = get_segments(stokens, max_seq_length)
    #pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    return input_masks

def make_segment(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    #input_ids = get_ids(stokens, tokenizer, max_seq_length)
    #input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)
    #pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    return input_segments

In [11]:
df_extend = pd.read_pickle('input_bert.pkl')

In [347]:
id_seq = 'D00001'
order = 0
orders = []

for ind,row in df_extend.iterrows():
    if row.Id == id_seq:
        order += 1
    else:
        id_seq = row.Id
        order = 1  
    orders.append(order)

In [353]:
max(orders) # prepare for the one-hot encoding???

24

In [405]:
df_extend['orders'] = orders

In [351]:
df_extend.head(10)

Unnamed: 0,Id,Title,Sentences,Authors,Categories,Created Date,Label,encode_x,label_y,id_bert,mask_bert,segment_bert,order,orders
0,D00001,A Brain-Inspired Trust Management Model to Ass...,Rapid popularity of Internet of Things (IoT) a...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,BACKGROUND,"[67018, 50647, 57206, 39455, 1390, 27420, 4691...",[0],"[101, 5915, 6217, 1997, 4274, 1997, 2477, 1006...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,1
1,D00001,A Brain-Inspired Trust Management Model to Ass...,To ensure secure and reliable data communicati...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,OBJECTIVES,"[76338, 100433, 97478, 27420, 57689, 49296, 11...",[1],"[101, 2000, 5676, 5851, 1998, 10539, 2951, 480...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,2
2,D00001,A Brain-Inspired Trust Management Model to Ass...,This paper introduces a Neuro-Fuzzy based Brai...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,METHODS,"[70484, 40636, 88204, 100425, 2583, 40562, 108...",[2],"[101, 2023, 3259, 13999, 1037, 11265, 10976, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,3
3,D00001,A Brain-Inspired Trust Management Model to Ass...,The proposed TMM utilizes node behavioral trus...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,METHODS,"[22910, 55237, 16568, 69221, 61489, 64999, 144...",[2],"[101, 1996, 3818, 1056, 7382, 21852, 13045, 14...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,4
4,D00001,A Brain-Inspired Trust Management Model to Ass...,"In contrast to the existing fuzzy based TMMs, ...",Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,RESULTS,"[92177, 73437, 12088, 82732, 5232, 84607, 4056...",[3],"[101, 1999, 5688, 2000, 1996, 4493, 18001, 224...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,5
5,D00001,A Brain-Inspired Trust Management Model to Ass...,With the growing usage of cloud based IoT fram...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,CONCLUSIONS,"[41541, 82732, 5858, 54546, 57206, 69711, 4056...",[4],"[101, 2007, 1996, 3652, 8192, 1997, 6112, 2241...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,6
6,D00002,On Efficient Computation of Shortest Dubins Pa...,"In this paper, we address the problem of compu...",Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,OBJECTIVES,"[92177, 14492, 40636, 62985, 8539, 82732, 6855...",[1],"[101, 1999, 2023, 3259, 1010, 2057, 4769, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,1
7,D00002,On Efficient Computation of Shortest Dubins Pa...,Given initial and final configurations of the ...,Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,OTHERS,"[95902, 38868, 27420, 85089, 80733, 57206, 827...",[5],"[101, 2445, 3988, 1998, 2345, 22354, 1997, 199...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,2
8,D00002,On Efficient Computation of Shortest Dubins Pa...,We provide a novel geometrical analysis of the...,Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,METHODS/RESULTS,"[104212, 80674, 100425, 57683, 78849, 75061, 5...","[2, 3]","[101, 2057, 3073, 1037, 3117, 14965, 2389, 410...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,3
9,D00002,On Efficient Computation of Shortest Dubins Pa...,We then show how our method can be used to qui...,Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,RESULTS,"[104212, 36294, 35912, 101740, 73071, 68424, 1...",[3],"[101, 2057, 2059, 2265, 2129, 2256, 4118, 2064...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,4


In [216]:
df_extend.Label.value_counts()

BACKGROUND                                           11948
METHODS                                              10471
RESULTS                                               7813
OBJECTIVES                                            6396
CONCLUSIONS                                           2650
RESULTS/CONCLUSIONS                                   2020
OBJECTIVES/METHODS                                    1270
METHODS/RESULTS                                       1072
OTHERS                                                 901
BACKGROUND/OBJECTIVES                                  894
OBJECTIVES/RESULTS                                     268
BACKGROUND/METHODS                                     224
METHODS/RESULTS/CONCLUSIONS                            192
OBJECTIVES/METHODS/RESULTS                             125
OBJECTIVES/CONCLUSIONS                                 102
METHODS/CONCLUSIONS                                    100
BACKGROUND/RESULTS                                      

In [386]:
df_extend['orders'] = df_extend.orders.apply(lambda x:np.asarray(x))

In [387]:
df_extend['orders'] = df_extend.orders.apply(lambda x:np.reshape(x,(1,1)))

In [406]:
X_all = []

for ind,item in df_extend.id_bert.iteritems():
    X_all.append([df_extend.id_bert[ind],
                  df_extend.mask_bert[ind],
                  df_extend.segment_bert[ind],
                  df_extend.orders[ind]])

In [407]:
# Extract x and y from the dataframe
y_all = df_extend.label_y.values.tolist()

# y: convert it into one-hot encoder
for i in range(len(y_all)):
    y_all[i] =  tf.one_hot(y_all[i],depth=6)

# some y have more than one tensor --> add them together!
y_all_combine = []

for i in range(len(y_all)):
    if y_all[i].shape[0]>1:
        tmp = tf.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0],shape=(1,6))
        for j in range(len(y_all[i])):
            tmp = tmp + y_all[i][j]
        y_all_combine.append(tmp)
    else:
        y_all_combine.append(y_all[i])

In [408]:
for i in range(len(y_all_combine)):
    y_all_combine[i] = tf.reshape(y_all_combine[i],(6,))

In [409]:
len(X_all)

46867

In [410]:
len(y_all_combine)

46867

## Create dataset

In [411]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all_combine, test_size=0.3) 

In [412]:
y_train[0].shape

TensorShape([6])

In [413]:
train_input1 = []
train_input2 = []
train_input3 = []
train_input4 = []
for i in range(len(X_train)):
    train_input1.append(X_train[i][0])
    train_input2.append(X_train[i][1])
    train_input3.append(X_train[i][2])
    train_input4.append(X_train[i][3])

val_input1 = []
val_input2 = []
val_input3 = []
val_input4 = []
for j in range(len(X_test)):
    val_input1.append(X_test[j][0])
    val_input2.append(X_test[j][1])
    val_input3.append(X_test[j][2])
    val_input4.append(X_test[j][3])

In [414]:
len(train_input1)

32806

In [415]:
BATCH_SIZE=64

In [416]:
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_1": train_input1, "input_2": train_input2, "input_3": train_input3,"input_4": train_input4},y_train)).shuffle(50000).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices(({"input_1": val_input1, "input_2": val_input2, "input_3": val_input3,"input_4": val_input4},y_test)).shuffle(50000).batch(BATCH_SIZE)

## Model settings

In [444]:
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [445]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.BinaryAccuracy(name='val_accuracy')

In [446]:
@tf.function
def train_step(sentences, labels):
    with tf.GradientTape() as tape:
        #print(sentences['input_4'].shape)
        out = model([sentences['input_1'],
                     sentences['input_2'],
                     sentences['input_3'],
                     tf.reshape(sentences['input_4'],(-1,1))])    
        # Calculate the loss of each class
        loss = loss_object(labels, out)      
        
    train_loss(loss) # Calculate accumulative average loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_accuracy(labels, out)

In [447]:
@tf.function
def val_step(sentences, labels):
    out = model([sentences['input_1'],
                 sentences['input_2'],
                 sentences['input_3'],
                 tf.reshape(sentences['input_4'],(-1,1))])    
    loss = loss_object(labels, out)   
    val_loss(loss)    
    val_accuracy(labels,out)

In [448]:
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
input_order = tf.keras.layers.Input(shape=(1), dtype=tf.int32, name="orders")


pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
x_order = tf.keras.layers.Dense(1)(input_order)

merge_x = tf.concat([pooled_output, x_order], axis=1)

x = tf.keras.layers.Dropout(0.3)(merge_x)
x = tf.keras.layers.Dense(6, activation='sigmoid')(x)

model = Model(inputs=[input_word_ids, input_mask, segment_ids, input_order], outputs=x)

model.summary()

Model: "model_26"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 300)]        0                                            
__________________________________________________________________________________________________
orders (InputLayer)             [(None, 1)]          0                                            
___________________________________________________________________________________________

In [449]:
train_dataset

<BatchDataset shapes: ({input_1: (None, 240), input_2: (None, 240), input_3: (None, 240), input_4: (None,)}, (None, 6)), types: ({input_1: tf.int32, input_2: tf.int32, input_3: tf.int32, input_4: tf.int32}, tf.float32)>

In [None]:
import math

EPOCHS = 5
step = 0
exp = 1

train_loss_history = []
val_loss_history = []

train_acc_history = []
val_acc_history = []
checkpoint_path = "exp/exp%d/ckpt/epoch-{}.ckpt"%exp

for epoch in range(EPOCHS):
    for sentences, labels in train_dataset:       
        train_step(sentences, labels)
        step+=1
        
        if step%math.ceil(len(train_input1)/BATCH_SIZE)==0:
            train_loss_history.append(train_loss.result())
            train_acc_history.append(train_accuracy.result())

        
        if step%100==0:
            template = '[Step {:0}], Loss: {:.2f}, Accuracy: {:.2f} '
            print(template.format(step,
                           train_loss.result(),
                           train_accuracy.result()*100))
            
            
                            
        # Reset the metrics for the next step
        train_accuracy.reset_states()
               
    for val_sentences, val_labels in val_dataset:
        val_step(val_sentences, val_labels)

    template = '[Epoch {:0}], Validation Loss: {:.2f}, Validation Accuracy: {:.2f}'
    print(template.format(epoch+1,val_loss.result(),val_accuracy.result()*100))
    print('\n********************************')
        
    val_loss_history.append(val_loss.result())
    val_acc_history.append(val_accuracy.result())
   
    
   # Saving history records to HDD
    train_acc_history_save = np.asarray(train_acc_history)
    val_acc_history_save = np.asarray(val_acc_history)

    np.save('exp/exp%d/history/train_loss.npy'%exp,np.asarray(train_loss_history))
    np.save('exp/exp%d/history/val_loss.npy'%exp,np.asarray(val_loss_history))
    
    np.save('exp/exp%d/history/train-acc-epoch%d.npy'%(exp,epoch+1),train_acc_history_save)
    np.save('exp/exp%d/history/val-acc-epoch%d.npy'%(exp,epoch+1),val_acc_history_save)

    
    # Reset the metrics for the next epoch
    train_loss.reset_states()
    val_loss.reset_states()
    val_accuracy.reset_states()
    model.save_weights(checkpoint_path.format(epoch+1))

[Step 100], Loss: 0.17, Accuracy: 100.00 
[Step 200], Loss: 0.11, Accuracy: 99.22 
[Step 300], Loss: 0.08, Accuracy: 100.00 
[Step 400], Loss: 0.07, Accuracy: 99.74 
[Step 500], Loss: 0.06, Accuracy: 99.74 
[Epoch 1], Validation Loss: 0.03, Validation Accuracy: 99.33

********************************


In [300]:
model.load_weights('exp/exp1/ckpt/epoch-3.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc857d74da0>

In [263]:
public_testing_set = pd.read_pickle('test_with_embedding.pkl')

In [264]:
public_testing_set['id_bert'] = public_testing_set.Sentences.apply(lambda x:make_id(x))
public_testing_set['mask_bert'] = public_testing_set.Sentences.apply(lambda x:make_mask(x))
public_testing_set['segment_bert'] = public_testing_set.Sentences.apply(lambda x:make_segment(x))

In [265]:
public_testing_set['id_bert'] = public_testing_set.id_bert.apply(lambda x:np.asarray(x))
public_testing_set['mask_bert'] = public_testing_set.mask_bert.apply(lambda x:np.asarray(x))
public_testing_set['segment_bert'] = public_testing_set.segment_bert.apply(lambda x:np.asarray(x))

In [266]:
public_testing_set['id_bert'] = public_testing_set.id_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))
public_testing_set['mask_bert'] = public_testing_set.mask_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))
public_testing_set['segment_bert'] = public_testing_set.segment_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))

In [289]:
test_dataset = tf.data.Dataset.from_tensor_slices({'input_word_ids':public_testing_set['id_bert'],
                                                   'input_mask':public_testing_set['mask_bert'],
                                                   'segment_ids':public_testing_set['segment_bert']})

In [301]:
r = model.predict(test_dataset)

In [302]:
THRESHOLD = 0.5

In [303]:
c1 = np.zeros((submission.shape[0]),dtype=np.int32)
c2 = np.zeros((submission.shape[0]),dtype=np.int32)
c3 = np.zeros((submission.shape[0]),dtype=np.int32)
c4 = np.zeros((submission.shape[0]),dtype=np.int32)
c5 = np.zeros((submission.shape[0]),dtype=np.int32)
c6 = np.zeros((submission.shape[0]),dtype=np.int32)

In [304]:
c = [c1,c2,c3,c4,c5,c6]

In [305]:
for i in range(r.shape[0]):
    for j in range(6):
        if r[i][j]>=THRESHOLD:
            c[j][i] = 1

In [306]:
submission=pd.read_csv('dataset/task1_sample_submission.csv') 

In [307]:
submission.head(10)

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,0,0,0,0,0,0
1,T00001_S002,0,0,0,0,0,0
2,T00001_S003,0,0,0,0,0,0
3,T00001_S004,0,0,0,0,0,0
4,T00001_S005,0,0,0,0,0,0
5,T00001_S006,0,0,0,0,0,0
6,T00001_S007,0,0,0,0,0,0
7,T00002_S001,0,0,0,0,0,0
8,T00002_S002,0,0,0,0,0,0
9,T00002_S003,0,0,0,0,0,0


In [308]:
submission.BACKGROUND = c1
submission.OBJECTIVES = c2
submission.METHODS = c3
submission.RESULTS = c4
submission.CONCLUSIONS = c5
submission.OTHERS = c6

In [309]:
submission.head(10)

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,1,0,0,0,0,0
1,T00001_S002,1,0,0,0,0,0
2,T00001_S003,0,0,1,0,0,0
3,T00001_S004,0,0,0,0,0,0
4,T00001_S005,0,0,0,1,0,0
5,T00001_S006,0,0,0,1,0,0
6,T00001_S007,0,0,0,1,1,0
7,T00002_S001,1,1,0,0,0,0
8,T00002_S002,0,1,0,0,0,0
9,T00002_S003,0,0,0,0,0,0


In [310]:
submission.to_csv('summit_file.csv',index=False)