## BERT

In [4]:
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
import numpy as np
from bert import tokenization
from bert import bert_tokenization
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model

ModuleNotFoundError: No module named 'tensorflow_hub'

## GPU settings

In [2]:
# Enumerate the physical GPUs and switch each one to on-demand memory
# allocation instead of letting TensorFlow reserve all memory up front.
# `gpus` is reused by the next cell to pick the device to train on.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
# Restrict TensorFlow to a single GPU. The notebook was written for a
# multi-GPU host and pins the second device (index 1); on a host with fewer
# GPUs we fall back to the last available one, and on a CPU-only host we skip
# the call entirely instead of raising IndexError.
PREFERRED_GPU_INDEX = 1
if gpus:
    selected_gpu = gpus[min(PREFERRED_GPU_INDEX, len(gpus) - 1)]
    tf.config.experimental.set_visible_devices(selected_gpu, 'GPU')

## Data preprocessing

In [4]:
# Fixed token length (including the added [CLS]/[SEP]) that every sequence is
# padded to; also used as the width of the three BERT inputs of the model.
# NOTE(review): the train_dataset repr further down reports sequences of
# length 240, not 300 — confirm input_bert.pkl was built with this value.
max_seq_length = 300

In [5]:
# Load the bert_en_uncased_L-12_H-768_A-12 module from TF Hub as a Keras layer
# whose weights are fine-tuned (trainable=True), then build the tokenizer from
# the vocabulary file and lower-casing flag exposed by that same module so the
# token ids produced below match the encoder.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [6]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask for a token sequence.

    Returns a list of length `max_seq_length` holding 1 for every position
    occupied by a token and 0 for every padded position.

    Raises:
        IndexError: if `tokens` is longer than `max_seq_length`.
    """
    n_real = len(tokens)
    n_pad = max_seq_length - n_real
    if n_pad < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_real
    mask.extend([0] * n_pad)
    return mask


def get_segments(tokens, max_seq_length):
    """Build the segment (token-type) ids for a token sequence.

    Every token up to and including the first "[SEP]" gets id 0; every token
    after it gets id 1. The result is right-padded with 0 to `max_seq_length`.

    Raises:
        IndexError: if `tokens` is longer than `max_seq_length`.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise IndexError("Token length more than max seq length!")

    segment_ids = []
    separator_seen = False
    for tok in tokens:
        # The separator itself still belongs to the segment it closes.
        segment_ids.append(1 if separator_seen else 0)
        separator_seen = separator_seen or tok == "[SEP]"

    padding = [0] * (max_seq_length - n_tokens)
    return segment_ids + padding


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids, right-padded with 0 to `max_seq_length`.

    Args:
        tokens: list of token strings (already wrapped in [CLS]/[SEP] by the caller).
        tokenizer: object exposing `convert_tokens_to_ids(tokens) -> list[int]`.
        max_seq_length: length of the returned list.

    Returns:
        A list of exactly `max_seq_length` integer ids.

    Raises:
        IndexError: if the sequence is longer than `max_seq_length`. This
            mirrors get_masks/get_segments; without the check an over-long
            sentence would silently yield ids of a different length than its
            mask and segment vectors.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    n_pad = max_seq_length - len(token_ids)
    if n_pad < 0:
        raise IndexError("Token length more than max seq length!")
    return token_ids + [0] * n_pad

In [7]:
def make_id(sentence):
    """Return the padded BERT token ids for one sentence.

    The sentence is tokenized with the notebook-level `tokenizer`, wrapped in
    [CLS] ... [SEP] and padded to the notebook-level `max_seq_length`.
    """
    wrapped_tokens = ["[CLS]", *tokenizer.tokenize(sentence), "[SEP]"]
    return get_ids(wrapped_tokens, tokenizer, max_seq_length)

def make_mask(sentence):
    """Return the padding mask (1 = token, 0 = padding) for one sentence.

    The sentence is tokenized with the notebook-level `tokenizer`, wrapped in
    [CLS] ... [SEP] and the mask is padded to the notebook-level
    `max_seq_length`; get_masks raises IndexError for over-long sentences.
    """
    wrapped_tokens = ["[CLS]", *tokenizer.tokenize(sentence), "[SEP]"]
    return get_masks(wrapped_tokens, max_seq_length)

def make_segment(sentence):
    """Return the padded segment ids for one sentence.

    The sentence is tokenized with the notebook-level `tokenizer`, wrapped in
    [CLS] ... [SEP] and the segment ids are padded to the notebook-level
    `max_seq_length`; get_segments raises IndexError for over-long sentences.
    """
    wrapped_tokens = ["[CLS]", *tokenizer.tokenize(sentence), "[SEP]"]
    return get_segments(wrapped_tokens, max_seq_length)

In [8]:
# Load the preprocessed training frame. The cells below read its columns
# Id, Label, label_y, id_bert, mask_bert and segment_bert.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
df_extend = pd.read_pickle('input_bert.pkl')

In [9]:
# 1-based position of every row inside its run of consecutive identical `Id`
# values: the counter restarts at 1 whenever `Id` differs from the previous
# row. Computed in one vectorised pass instead of a Python-level iterrows()
# loop seeded with a hard-coded first id.
ids = df_extend['Id']
# A new run starts wherever the id changes; cumsum turns the change flags
# into a run number that is constant within each run.
run_number = (ids != ids.shift()).cumsum()
# Group by the raw array (not the Series) so the result does not depend on
# the frame's index labels.
orders = (ids.groupby(run_number.values).cumcount() + 1).tolist()

In [10]:
# Largest within-Id position observed. This would be the required depth if
# `orders` were ever one-hot encoded; the model below feeds it as a single
# integer feature instead.
max(orders)

24

In [16]:
# Attach the per-row position computed above as a new feature column.
df_extend['orders'] = orders

In [17]:
# Distribution of the raw label strings; rows carrying several labels show up
# as '/'-joined combinations, i.e. this is a multi-label problem.
df_extend.Label.value_counts()

BACKGROUND                                           11948
METHODS                                              10471
RESULTS                                               7813
OBJECTIVES                                            6396
CONCLUSIONS                                           2650
RESULTS/CONCLUSIONS                                   2020
OBJECTIVES/METHODS                                    1270
METHODS/RESULTS                                       1072
OTHERS                                                 901
BACKGROUND/OBJECTIVES                                  894
OBJECTIVES/RESULTS                                     268
BACKGROUND/METHODS                                     224
METHODS/RESULTS/CONCLUSIONS                            192
OBJECTIVES/METHODS/RESULTS                             125
OBJECTIVES/CONCLUSIONS                                 102
METHODS/CONCLUSIONS                                    100
BACKGROUND/RESULTS                                      

In [18]:
# One feature record per row: [token ids, padding mask, segment ids, order].
# The columns are walked in lock-step by position, which avoids
# Series.iteritems() (removed in pandas 2.0) and the four label-based lookups
# per row that the previous loop performed.
X_all = [
    [token_ids, padding_mask, segment_ids, order]
    for token_ids, padding_mask, segment_ids, order in zip(
        df_extend['id_bert'],
        df_extend['mask_bert'],
        df_extend['segment_bert'],
        df_extend['orders'],
    )
]

In [19]:
# Targets: one-hot encode the class indices stored in `label_y` (6 classes).
# NOTE(review): assumes every label_y entry is a list of class indices, so
# tf.one_hot yields a (k, 6) tensor with one row per label — confirm.
y_all = [
    tf.one_hot(label_ids, depth=6)
    for label_ids in df_extend.label_y.values.tolist()
]

# Rows carrying several labels are collapsed into a single (1, 6) multi-hot
# vector by summing their one-hot rows; single-label rows are kept as they are.
y_all_combine = []
for encoded in y_all:
    if encoded.shape[0] > 1:
        merged = tf.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=(1, 6))
        for one_hot_row in encoded:
            merged = merged + one_hot_row
        y_all_combine.append(merged)
    else:
        y_all_combine.append(encoded)

In [20]:
# Flatten every target to a rank-1 vector of length 6 (updates the list in place).
y_all_combine[:] = [tf.reshape(target, (6,)) for target in y_all_combine]

In [21]:
# Sanity check: number of feature records (must equal the number of targets).
len(X_all)

46867

In [22]:
# Sanity check: number of target vectors (must equal the number of feature records).
len(y_all_combine)

46867

## Create dataset

In [23]:
# Hold out 30% of the samples for validation. A fixed random_state makes the
# split — and therefore every metric reported below — reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all_combine, test_size=0.3, random_state=42
)

In [24]:
# Each target should be a flat vector with one slot per class.
y_train[0].shape

TensorShape([6])

In [25]:
# Transpose the per-sample records into one list per model input:
# 1 = token ids, 2 = padding mask, 3 = segment ids, 4 = order.
train_input1, train_input2, train_input3, train_input4 = (
    [sample[position] for sample in X_train] for position in range(4)
)

# Same layout for the held-out (validation) samples.
val_input1, val_input2, val_input3, val_input4 = (
    [sample[position] for sample in X_test] for position in range(4)
)

In [26]:
# Number of training samples left after the train/validation split.
len(train_input1)

32806

In [27]:
# Mini-batch size used by both the training and the validation pipeline.
BATCH_SIZE=64

In [28]:
# Feature dictionaries keyed by the names that train_step/val_step read.
train_features = {
    "input_1": train_input1,
    "input_2": train_input2,
    "input_3": train_input3,
    "input_4": train_input4,
}
val_features = {
    "input_1": val_input1,
    "input_2": val_input2,
    "input_3": val_input3,
    "input_4": val_input4,
}

# (features, target) pairs, shuffled with a 50000-element buffer and batched.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, y_train))
    .shuffle(50000)
    .batch(BATCH_SIZE)
)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_features, y_test))
    .shuffle(50000)
    .batch(BATCH_SIZE)
)

## Model settings

In [59]:
# Binary cross-entropy scores each of the six sigmoid outputs as an
# independent yes/no decision, which is what a multi-label target needs.
loss_object = tf.keras.losses.BinaryCrossentropy()
# Adam with a small learning rate, since the BERT weights are fine-tuned too.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [60]:
# Running metrics updated by train_step; they accumulate until reset_states()
# is called in the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

# Running metrics updated by val_step.
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.BinaryAccuracy(name='val_accuracy')

In [61]:
@tf.function
def train_step(sentences, labels):
    """Run one optimisation step on a batch and update the training metrics.

    Args:
        sentences: dict with the keys 'input_1' (token ids), 'input_2'
            (padding mask), 'input_3' (segment ids) and 'input_4' (order).
        labels: multi-hot targets for the batch.

    Side effects:
        Applies one optimizer update to `model` and accumulates the batch
        loss/accuracy into `train_loss` / `train_accuracy`.
    """
    features = [
        sentences['input_1'],
        sentences['input_2'],
        sentences['input_3'],
        # The order feature arrives as a flat vector; the model expects (batch, 1).
        tf.reshape(sentences['input_4'], (-1, 1)),
    ]

    with tf.GradientTape() as tape:
        # training=True is required in a custom loop: without it the Dropout
        # layer (and the dropout inside the BERT layer) runs in inference
        # mode and provides no regularisation while fine-tuning.
        out = model(features, training=True)
        # Mean binary cross-entropy over the batch and the six classes.
        loss = loss_object(labels, out)

    train_loss(loss)  # accumulate the running mean of the loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_accuracy(labels, out)

In [62]:
@tf.function
def val_step(sentences, labels):
    """Evaluate one validation batch and update the validation metrics.

    Args:
        sentences: dict with the keys 'input_1' (token ids), 'input_2'
            (padding mask), 'input_3' (segment ids) and 'input_4' (order).
        labels: multi-hot targets for the batch.

    Side effects:
        Accumulates the batch loss/accuracy into `val_loss` / `val_accuracy`;
        no weights are updated.
    """
    features = [
        sentences['input_1'],
        sentences['input_2'],
        sentences['input_3'],
        # The order feature arrives as a flat vector; the model expects (batch, 1).
        tf.reshape(sentences['input_4'], (-1, 1)),
    ]
    # State inference mode explicitly so dropout is guaranteed to be off,
    # independent of any global Keras learning-phase setting.
    out = model(features, training=False)
    loss = loss_object(labels, out)
    val_loss(loss)
    val_accuracy(labels, out)

In [63]:
# Three fixed-length integer inputs for BERT plus one scalar "order" feature.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
# shape must be a tuple: `(1)` is just the integer 1, `(1,)` is the intended
# one-element shape.
input_order = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name="orders")


# Only the pooled ([CLS]) representation is used for classification;
# sequence_output is returned by the hub layer but not consumed here.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
# Learn a single scale/offset for the order feature before merging it in.
x_order = tf.keras.layers.Dense(1)(input_order)

# Append the projected order value to the pooled sentence embedding.
merge_x = tf.concat([pooled_output, x_order], axis=1)

x = tf.keras.layers.Dropout(0.3)(merge_x)
# Six independent sigmoid outputs -> multi-label prediction.
x = tf.keras.layers.Dense(6, activation='sigmoid')(x)

model = Model(inputs=[input_word_ids, input_mask, segment_ids, input_order], outputs=x)

model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 300)]        0                                            
__________________________________________________________________________________________________
orders (InputLayer)             [(None, 1)]          0                                            
____________________________________________________________________________________________

In [64]:
# Inspect the element spec of the training pipeline; the sequence length
# shown here should equal max_seq_length used for the model inputs.
train_dataset

<BatchDataset shapes: ({input_1: (None, 240), input_2: (None, 240), input_3: (None, 240), input_4: (None,)}, (None, 6)), types: ({input_1: tf.int32, input_2: tf.int32, input_3: tf.int32, input_4: tf.int32}, tf.float32)>

In [None]:
import math
import os

EPOCHS = 5
step = 0   # global step counter across all epochs
exp = 1    # experiment number (not read anywhere in this cell)

# Per-step metric histories, re-saved to disk after every step.
train_loss_history = []
val_loss_history = []

train_acc_history = []
val_acc_history = []
checkpoint_path = "exp/step/ckpt/step-{}.ckpt"

# np.save does not create missing directories; make sure the output folders
# exist up front so the run cannot die right after its first (expensive) step.
os.makedirs('exp/step/history', exist_ok=True)
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# NOTE(review): the full validation set is evaluated and a checkpoint is
# written after EVERY training step (everything below sits inside the inner
# loop). If per-epoch evaluation/checkpointing was intended, dedent the
# validation/saving section by one level — confirm before changing.
for epoch in range(EPOCHS):
    print('Start Epoch: {}'.format(epoch+1))
    for sentences, labels in train_dataset:
        train_step(sentences, labels)
        step+=1

        train_loss_history.append(train_loss.result())
        train_acc_history.append(train_accuracy.result())

        template = '[Step {:0}], Loss: {:.2f}, Accuracy: {:.2f} '
        print(template.format(step,
                       train_loss.result(),
                       train_accuracy.result()*100))

        # Reset the training accuracy so the next step reports a per-batch value
        train_accuracy.reset_states()

        # Evaluate on the whole validation set
        for val_sentences, val_labels in val_dataset:
            val_step(val_sentences, val_labels)

        template = '[Step {:0}], Validation Loss: {:.2f}, Validation Accuracy: {:.2f}'
        print(template.format(step,val_loss.result(),val_accuracy.result()*100))
        print('-----------------------------------------')

        val_loss_history.append(val_loss.result())
        val_acc_history.append(val_accuracy.result())

        # Persist the history records (the files are overwritten every step)
        np.save('exp/step/history/train_loss.npy',np.asarray(train_loss_history))
        np.save('exp/step/history/val_loss.npy',np.asarray(val_loss_history))

        np.save('exp/step/history/train-acc.npy',np.asarray(train_acc_history))
        np.save('exp/step/history/val-acc.npy',np.asarray(val_acc_history))

        # Reset the remaining metrics for the next step and checkpoint the weights
        train_loss.reset_states()
        val_loss.reset_states()
        val_accuracy.reset_states()
        model.save_weights(checkpoint_path.format(step))

Start Epoch: 1
[Step 1], Loss: 0.75, Accuracy: 48.18 
[Step 1], Validation Loss: 0.70, Validation Accuracy: 56.88
-----------------------------------------
[Step 2], Loss: 0.68, Accuracy: 58.33 
[Step 2], Validation Loss: 0.68, Validation Accuracy: 58.54
-----------------------------------------
[Step 3], Loss: 0.69, Accuracy: 56.51 
[Step 3], Validation Loss: 0.66, Validation Accuracy: 62.22
-----------------------------------------
[Step 4], Loss: 0.64, Accuracy: 63.80 
[Step 4], Validation Loss: 0.63, Validation Accuracy: 67.13
-----------------------------------------
[Step 5], Loss: 0.61, Accuracy: 70.83 
[Step 5], Validation Loss: 0.61, Validation Accuracy: 70.84
-----------------------------------------
[Step 6], Loss: 0.60, Accuracy: 72.92 
[Step 6], Validation Loss: 0.59, Validation Accuracy: 73.08
-----------------------------------------
[Step 7], Loss: 0.59, Accuracy: 71.61 
[Step 7], Validation Loss: 0.57, Validation Accuracy: 74.66
----------------------------------------

[Step 59], Loss: 0.44, Accuracy: 82.29 
[Step 59], Validation Loss: 0.39, Validation Accuracy: 83.63
-----------------------------------------
[Step 60], Loss: 0.40, Accuracy: 82.81 
[Step 60], Validation Loss: 0.38, Validation Accuracy: 83.47
-----------------------------------------
[Step 61], Loss: 0.39, Accuracy: 83.59 
[Step 61], Validation Loss: 0.39, Validation Accuracy: 83.21
-----------------------------------------
[Step 62], Loss: 0.38, Accuracy: 85.16 
[Step 62], Validation Loss: 0.39, Validation Accuracy: 83.04
-----------------------------------------
[Step 63], Loss: 0.38, Accuracy: 83.33 
[Step 63], Validation Loss: 0.39, Validation Accuracy: 83.11
-----------------------------------------
[Step 64], Loss: 0.36, Accuracy: 84.38 
[Step 64], Validation Loss: 0.38, Validation Accuracy: 83.31
-----------------------------------------
[Step 65], Loss: 0.38, Accuracy: 85.68 
[Step 65], Validation Loss: 0.38, Validation Accuracy: 83.66
-----------------------------------------

[Step 116], Validation Loss: 0.34, Validation Accuracy: 85.54
-----------------------------------------
[Step 117], Loss: 0.35, Accuracy: 87.24 
[Step 117], Validation Loss: 0.34, Validation Accuracy: 85.65
-----------------------------------------
[Step 118], Loss: 0.31, Accuracy: 87.76 
[Step 118], Validation Loss: 0.34, Validation Accuracy: 85.67
-----------------------------------------
[Step 119], Loss: 0.33, Accuracy: 85.94 
[Step 119], Validation Loss: 0.34, Validation Accuracy: 85.77
-----------------------------------------
[Step 120], Loss: 0.32, Accuracy: 89.58 
[Step 120], Validation Loss: 0.34, Validation Accuracy: 85.75
-----------------------------------------
[Step 121], Loss: 0.35, Accuracy: 85.68 
[Step 121], Validation Loss: 0.34, Validation Accuracy: 85.71
-----------------------------------------
[Step 122], Loss: 0.32, Accuracy: 86.46 
[Step 122], Validation Loss: 0.34, Validation Accuracy: 85.71
-----------------------------------------
[Step 123], Loss: 0.31, Ac

[Step 173], Loss: 0.34, Accuracy: 86.98 
[Step 173], Validation Loss: 0.32, Validation Accuracy: 86.93
-----------------------------------------
[Step 174], Loss: 0.32, Accuracy: 85.68 
[Step 174], Validation Loss: 0.32, Validation Accuracy: 86.93
-----------------------------------------
[Step 175], Loss: 0.31, Accuracy: 86.46 
[Step 175], Validation Loss: 0.32, Validation Accuracy: 86.92
-----------------------------------------
[Step 176], Loss: 0.30, Accuracy: 87.50 
[Step 176], Validation Loss: 0.32, Validation Accuracy: 86.92
-----------------------------------------
[Step 177], Loss: 0.27, Accuracy: 90.89 
[Step 177], Validation Loss: 0.32, Validation Accuracy: 86.95
-----------------------------------------
[Step 178], Loss: 0.32, Accuracy: 87.24 
[Step 178], Validation Loss: 0.32, Validation Accuracy: 86.94
-----------------------------------------
[Step 179], Loss: 0.28, Accuracy: 88.02 
[Step 179], Validation Loss: 0.32, Validation Accuracy: 86.97
---------------------------

[Step 230], Loss: 0.25, Accuracy: 89.84 
[Step 230], Validation Loss: 0.32, Validation Accuracy: 87.09
-----------------------------------------
[Step 231], Loss: 0.34, Accuracy: 86.72 
[Step 231], Validation Loss: 0.32, Validation Accuracy: 87.01
-----------------------------------------
[Step 232], Loss: 0.25, Accuracy: 91.41 
[Step 232], Validation Loss: 0.32, Validation Accuracy: 86.99
-----------------------------------------
[Step 233], Loss: 0.32, Accuracy: 86.46 
[Step 233], Validation Loss: 0.32, Validation Accuracy: 87.03
-----------------------------------------
[Step 234], Loss: 0.29, Accuracy: 87.50 
[Step 234], Validation Loss: 0.32, Validation Accuracy: 87.04
-----------------------------------------
[Step 235], Loss: 0.43, Accuracy: 82.55 
[Step 235], Validation Loss: 0.31, Validation Accuracy: 87.11
-----------------------------------------
[Step 236], Loss: 0.34, Accuracy: 86.46 
[Step 236], Validation Loss: 0.31, Validation Accuracy: 87.28
---------------------------

[Step 287], Loss: 0.32, Accuracy: 86.20 
[Step 287], Validation Loss: 0.31, Validation Accuracy: 87.40
-----------------------------------------
[Step 288], Loss: 0.34, Accuracy: 85.16 
[Step 288], Validation Loss: 0.31, Validation Accuracy: 87.43
-----------------------------------------
[Step 289], Loss: 0.27, Accuracy: 89.06 
[Step 289], Validation Loss: 0.31, Validation Accuracy: 87.48
-----------------------------------------
[Step 290], Loss: 0.30, Accuracy: 87.76 
[Step 290], Validation Loss: 0.31, Validation Accuracy: 87.49
-----------------------------------------
[Step 291], Loss: 0.28, Accuracy: 87.24 
[Step 291], Validation Loss: 0.31, Validation Accuracy: 87.47
-----------------------------------------
[Step 292], Loss: 0.30, Accuracy: 84.38 
[Step 292], Validation Loss: 0.30, Validation Accuracy: 87.47
-----------------------------------------
[Step 293], Loss: 0.32, Accuracy: 87.76 
[Step 293], Validation Loss: 0.30, Validation Accuracy: 87.45
---------------------------

In [300]:
# Restore the weights used for the submission.
# NOTE(review): this path ('exp/exp1/ckpt/epoch-3.ckpt') does not match the
# 'exp/step/ckpt/step-{}.ckpt' pattern written by the training loop above, so
# the checkpoint presumably comes from a different run — confirm that its
# architecture matches the model built in this notebook.
model.load_weights('exp/exp1/ckpt/epoch-3.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc857d74da0>

In [263]:
# Load the public test frame; the cells below read its `Sentences` column.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
public_testing_set = pd.read_pickle('test_with_embedding.pkl')

In [264]:
# Encode every test sentence into the three BERT input vectors.
for column, encode in (
    ('id_bert', make_id),
    ('mask_bert', make_mask),
    ('segment_bert', make_segment),
):
    public_testing_set[column] = public_testing_set['Sentences'].apply(encode)

In [265]:
# Turn each per-sentence Python list into a NumPy array.
for column in ('id_bert', 'mask_bert', 'segment_bert'):
    public_testing_set[column] = public_testing_set[column].apply(
        lambda values: np.asarray(values)
    )

In [266]:
# Give every vector a leading axis of size 1, i.e. shape (1, max_seq_length).
for column in ('id_bert', 'mask_bert', 'segment_bert'):
    public_testing_set[column] = public_testing_set[column].apply(
        lambda vector: np.reshape(vector, (1, max_seq_length))
    )

In [289]:
# Build the inference pipeline; the dict keys match the names of the model's
# Input layers. Each element already has shape (1, max_seq_length), so the
# dataset is not batched.
# NOTE(review): the model built in this notebook has a fourth input named
# "orders" that is not provided here — confirm which architecture the loaded
# checkpoint belongs to, or add the order feature for the test set.
test_dataset = tf.data.Dataset.from_tensor_slices({'input_word_ids':public_testing_set['id_bert'],
                                                   'input_mask':public_testing_set['mask_bert'],
                                                   'segment_ids':public_testing_set['segment_bert']})

In [301]:
# Run inference on the test pipeline. The thresholding cell below indexes `r`
# as r[i][j] with j in range(6), i.e. one row of six class scores per sentence.
r = model.predict(test_dataset)

In [302]:
# A class is predicted as present when its score is >= this value; the
# threshold is applied to each of the six classes independently.
THRESHOLD = 0.5

In [303]:
# The label columns are sized from the submission template, but the notebook
# only loads that template in a later cell, so on a fresh kernel this cell
# raised NameError. Load it here; the later re-read of the same file is
# harmless.
submission = pd.read_csv('dataset/task1_sample_submission.csv')

# One zero-initialised 0/1 column per class, in the order
# BACKGROUND, OBJECTIVES, METHODS, RESULTS, CONCLUSIONS, OTHERS.
n_rows = submission.shape[0]
c1, c2, c3, c4, c5, c6 = (np.zeros(n_rows, dtype=np.int32) for _ in range(6))

In [304]:
# The six label arrays indexed by class position; entries alias c1..c6, so
# writing through `c[j]` updates the named arrays as well.
c = [c1,c2,c3,c4,c5,c6]

In [305]:
# Mark class j for sentence i wherever its score reaches the threshold.
# Only the first six score columns are considered, one per label array.
for i, j in np.argwhere(r[:, :6] >= THRESHOLD):
    c[j][i] = 1

In [306]:
# Load the sample submission; it provides the row ids and the six label
# columns that are overwritten with predictions below.
submission=pd.read_csv('dataset/task1_sample_submission.csv') 

In [307]:
# Preview the untouched template (all labels are 0).
submission.head(10)

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,0,0,0,0,0,0
1,T00001_S002,0,0,0,0,0,0
2,T00001_S003,0,0,0,0,0,0
3,T00001_S004,0,0,0,0,0,0
4,T00001_S005,0,0,0,0,0,0
5,T00001_S006,0,0,0,0,0,0
6,T00001_S007,0,0,0,0,0,0
7,T00002_S001,0,0,0,0,0,0
8,T00002_S002,0,0,0,0,0,0
9,T00002_S003,0,0,0,0,0,0


In [308]:
# Overwrite each label column of the template with its thresholded predictions.
label_columns = ('BACKGROUND', 'OBJECTIVES', 'METHODS', 'RESULTS', 'CONCLUSIONS', 'OTHERS')
for column_name, predicted_flags in zip(label_columns, (c1, c2, c3, c4, c5, c6)):
    submission[column_name] = predicted_flags

In [309]:
# Preview the filled-in submission; rows whose six labels are all 0 had no
# class score reaching THRESHOLD.
submission.head(10)

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,1,0,0,0,0,0
1,T00001_S002,1,0,0,0,0,0
2,T00001_S003,0,0,1,0,0,0
3,T00001_S004,0,0,0,0,0,0
4,T00001_S005,0,0,0,1,0,0
5,T00001_S006,0,0,0,1,0,0
6,T00001_S007,0,0,0,1,1,0
7,T00002_S001,1,1,0,0,0,0
8,T00002_S002,0,1,0,0,0,0
9,T00002_S003,0,0,0,0,0,0


In [310]:
# Write the submission without the DataFrame index column.
submission.to_csv('summit_file.csv',index=False)