## Bert 

In [1]:
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
import numpy as np
from bert import tokenization
from bert import bert_tokenization
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model

## GPU settings

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

## Data preprocessing

In [4]:
max_seq_length = 300

In [5]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [190]:
vocab_file

b'/tmp/tfhub_modules/03d6fb3ce1605ad9e5e9ed5346b2fb9623ef4d3d/assets/vocab.txt'

In [6]:
def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [7]:
def make_id(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    #input_masks = get_masks(stokens, max_seq_length)
    #input_segments = get_segments(stokens, max_seq_length)
    #pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    return input_ids

def make_mask(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    #input_ids = get_ids(stokens, tokenizer, max_seq_length)
    input_masks = get_masks(stokens, max_seq_length)
    #input_segments = get_segments(stokens, max_seq_length)
    #pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    return input_masks

def make_segment(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    #input_ids = get_ids(stokens, tokenizer, max_seq_length)
    #input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)
    #pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    return input_segments

In [8]:
df_extend = pd.read_pickle('input_bert.pkl')

In [9]:
id_seq = 'D00001'
order = 0
orders = []

for ind,row in df_extend.iterrows():
    if row.Id == id_seq:
        order += 1
    else:
        id_seq = row.Id
        order = 1  
    orders.append(order)

In [10]:
max(orders) # prepare for the one-hot encoding???

24

In [11]:
df_extend['orders'] = orders

In [242]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_extend.Sentences)

In [257]:
len(vectorizer.get_feature_names())

27187

In [245]:
tf = X.toarray()

In [349]:
sum_tf = np.sum(tf,axis=0)

In [350]:
sum_tf.shape

(27187,)

In [351]:
np.max(sum_tf)

64323

In [352]:
names = np.asarray(vectorizer.get_feature_names())

In [353]:
names.shape

(27187,)

In [354]:
N = 1000

In [355]:
tops = {}

In [356]:
for i in range(N):
    ind = np.argmax(sum_tf)
    number = np.max(sum_tf)
    
    tops[names[ind]] = number
    
    sum_tf = np.delete(sum_tf,ind,axis=0)
    names = np.delete(names,ind,axis=0)

In [357]:
tops_list = list(tops.keys())

In [358]:
for top in tops_list:
    ans = top in tokenizer.vocab
    if ans == False:
        print('{} --> {} times'.format(top,tops[top]))

dataset --> 1024 times
datasets --> 1009 times
convolutional --> 639 times
segmentation --> 544 times
outperforms --> 413 times
clustering --> 406 times
achieves --> 366 times
benchmark --> 360 times
adversarial --> 331 times
generative --> 331 times
architectures --> 310 times
metrics --> 298 times
embedding --> 297 times
recurrent --> 294 times
stochastic --> 283 times
probabilistic --> 262 times
latent --> 261 times
unsupervised --> 261 times
classifier --> 232 times
robustness --> 232 times
gaussian --> 228 times
queries --> 219 times
localization --> 213 times
bayesian --> 207 times
cnns --> 201 times
iot --> 200 times
markov --> 198 times
generalization --> 193 times
embeddings --> 192 times
throughput --> 191 times
classifiers --> 183 times
lstm --> 183 times
decoding --> 179 times
iterative --> 175 times
predictive --> 171 times
computationally --> 169 times
encoder --> 169 times


In [240]:
tokenizer.tokenize('I love to eat america')

['i', 'love', 'to', 'eat', 'america']

In [359]:
'rnn' in tokenizer.vocab

False

In [222]:
'iot'.split('#')

['iot']

In [235]:
not_used_words = {}

In [236]:
not_used_sentences = []

In [237]:
for item in df_extend.iteritems():
    tokens = tokenizer.tokenize(str(item[1]))
    for token in tokens:
        if len(token.split('#'))>1:
            if token in not_used_words:
                not_used_words[token] = not_used_words[token] + 1
            else:
                not_used_words[token] = 1

In [238]:
not_used_words

{'##00': 15,
 '##1': 13,
 '##86': 78,
 '##2': 17,
 '##0': 9,
 '##70': 5,
 '##3': 19,
 '##4': 19,
 '##5': 15,
 '##6': 30,
 '##7': 39,
 '##ype': 13,
 '##ym': 7,
 '##pt': 6,
 '##otic': 6,
 '##t': 6,
 '##mun': 6,
 '##ica': 1,
 '##ti': 1,
 '##uro': 1,
 '##i': 1,
 '##mm': 2,
 '##us': 1,
 '##s': 1,
 '##metric': 1,
 '##ati': 1,
 '##o': 1,
 '##bu': 5,
 '##abe': 5,
 '##day': 5,
 '##ama': 5,
 '##ashi': 5,
 '##18': 1,
 '##64': 2,
 '##20': 6,
 '##45': 3,
 '##9': 19,
 '##33': 2,
 '##8': 16,
 '##43': 2,
 '##47': 1,
 '##29': 2,
 '##48': 2,
 '##36': 1,
 '##42': 1,
 '##10': 2,
 '##23': 2,
 '##22': 1,
 '##17': 2,
 '##32': 9,
 '##60': 2,
 '##35': 2,
 '##85': 2,
 '##40': 1,
 '##13': 2,
 '##53': 1,
 '##21': 1,
 '##25': 1,
 '##code': 1,
 '##15': 1,
 '##51': 1,
 '##39': 1,
 '##65': 1,
 '##52': 1,
 '##57': 1,
 '##26': 1,
 '##80': 1}

In [203]:
tokenizer.vocab['internet']

4274

In [212]:
for item in df_extend.Sentences.iteritems():
    print(type(item[1]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

In [13]:
df_extend.Label.value_counts()

BACKGROUND                                           11948
METHODS                                              10471
RESULTS                                               7813
OBJECTIVES                                            6396
CONCLUSIONS                                           2650
RESULTS/CONCLUSIONS                                   2020
OBJECTIVES/METHODS                                    1270
METHODS/RESULTS                                       1072
OTHERS                                                 901
BACKGROUND/OBJECTIVES                                  894
OBJECTIVES/RESULTS                                     268
BACKGROUND/METHODS                                     224
METHODS/RESULTS/CONCLUSIONS                            192
OBJECTIVES/METHODS/RESULTS                             125
OBJECTIVES/CONCLUSIONS                                 102
METHODS/CONCLUSIONS                                    100
BACKGROUND/RESULTS                                      

In [14]:
X_all = []

for ind,item in df_extend.id_bert.iteritems():
    X_all.append([df_extend.id_bert[ind],
                  df_extend.mask_bert[ind],
                  df_extend.segment_bert[ind],
                  df_extend.orders[ind]])

In [15]:
# Extract x and y from the dataframe
y_all = df_extend.label_y.values.tolist()

# y: convert it into one-hot encoder
for i in range(len(y_all)):
    y_all[i] =  tf.one_hot(y_all[i],depth=6)

# some y have more than one tensor --> add them together!
y_all_combine = []

for i in range(len(y_all)):
    if y_all[i].shape[0]>1:
        tmp = tf.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0],shape=(1,6))
        for j in range(len(y_all[i])):
            tmp = tmp + y_all[i][j]
        y_all_combine.append(tmp)
    else:
        y_all_combine.append(y_all[i])

In [16]:
for i in range(len(y_all_combine)):
    y_all_combine[i] = tf.reshape(y_all_combine[i],(6,))

In [17]:
len(X_all)

46867

In [18]:
len(y_all_combine)

46867

## Create dataset

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all_combine, test_size=0.2) 

In [20]:
y_train[0].shape

TensorShape([6])

In [21]:
train_input1 = []
train_input2 = []
train_input3 = []
train_input4 = []
for i in range(len(X_train)):
    train_input1.append(X_train[i][0])
    train_input2.append(X_train[i][1])
    train_input3.append(X_train[i][2])
    train_input4.append(X_train[i][3])

val_input1 = []
val_input2 = []
val_input3 = []
val_input4 = []
for j in range(len(X_test)):
    val_input1.append(X_test[j][0])
    val_input2.append(X_test[j][1])
    val_input3.append(X_test[j][2])
    val_input4.append(X_test[j][3])

In [22]:
len(train_input1)

37493

In [23]:
BATCH_SIZE=64

In [24]:
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_1": train_input1, "input_2": train_input2, "input_3": train_input3,"input_4": train_input4},y_train)).shuffle(50000).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices(({"input_1": val_input1, "input_2": val_input2, "input_3": val_input3,"input_4": val_input4},y_test)).shuffle(50000).batch(BATCH_SIZE)

## Model settings

In [158]:
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [159]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.BinaryAccuracy(name='val_accuracy')

In [160]:
@tf.function
def train_step(sentences, labels):
    with tf.GradientTape() as tape:
        #print(sentences['input_4'].shape)
        out = model([sentences['input_1'],
                     sentences['input_2'],
                     sentences['input_3'],
                     tf.reshape(sentences['input_4'],(-1,1))])    
        # Calculate the loss of each class
        loss = loss_object(labels, out)      
        
    train_loss(loss) # Calculate accumulative average loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_accuracy(labels, out)

In [161]:
@tf.function
def val_step(sentences, labels):
    out = model([sentences['input_1'],
                 sentences['input_2'],
                 sentences['input_3'],
                 tf.reshape(sentences['input_4'],(-1,1))])    
    loss = loss_object(labels, out)   
    val_loss(loss)    
    val_accuracy(labels,out)

In [29]:
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
input_order = tf.keras.layers.Input(shape=(1), dtype=tf.int32, name="orders")


pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
x_order = tf.keras.layers.Dense(1)(input_order)

merge_x = tf.concat([pooled_output, x_order], axis=1)

#x = tf.keras.layers.Dropout(0.3)(merge_x)
x = tf.keras.layers.Dense(6, activation='sigmoid')(merge_x)

model = Model(inputs=[input_word_ids, input_mask, segment_ids, input_order], outputs=x)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 300)]        0                                            
__________________________________________________________________________________________________
orders (InputLayer)             [(None, 1)]          0                                            
______________________________________________________________________________________________

In [30]:
train_dataset

<BatchDataset shapes: ({input_1: (None, 240), input_2: (None, 240), input_3: (None, 240), input_4: (None,)}, (None, 6)), types: ({input_1: tf.int32, input_2: tf.int32, input_3: tf.int32, input_4: tf.int32}, tf.float32)>

In [31]:
# train on whole dataset

import math

EPOCHS = 3
step = 0
exp = 1

train_loss_history = []
val_loss_history = []

train_acc_history = []
val_acc_history = []
checkpoint_path = "exp/exp%d/ckpt/epoch-{}.ckpt"%exp

for epoch in range(EPOCHS):
    for sentences, labels in train_dataset:       
        train_step(sentences, labels)
        step+=1
        
        if step%math.ceil(len(train_input1)/BATCH_SIZE)==0:
            train_loss_history.append(train_loss.result())
            train_acc_history.append(train_accuracy.result())

        
        if step%100==0:
            template = '[Step {:0}], Loss: {:.2f}, Accuracy: {:.2f} '
            print(template.format(step,
                           train_loss.result(),
                           train_accuracy.result()*100))
            
            
                            
        # Reset the metrics for the next step
        train_accuracy.reset_states()
               
    for val_sentences, val_labels in val_dataset:
        val_step(val_sentences, val_labels)

    template = '[Epoch {:0}], Validation Loss: {:.2f}, Validation Accuracy: {:.2f}'
    print(template.format(epoch+1,val_loss.result(),val_accuracy.result()*100))
    print('-----------------------------------------')
        
    val_loss_history.append(val_loss.result())
    val_acc_history.append(val_accuracy.result())
   
    
   # Saving history records to HDD
    train_acc_history_save = np.asarray(train_acc_history)
    val_acc_history_save = np.asarray(val_acc_history)

    np.save('exp/exp%d/history/train_loss.npy'%exp,np.asarray(train_loss_history))
    np.save('exp/exp%d/history/val_loss.npy'%exp,np.asarray(val_loss_history))
    
    np.save('exp/exp%d/history/train-acc-epoch%d.npy'%(exp,epoch+1),train_acc_history_save)
    np.save('exp/exp%d/history/val-acc-epoch%d.npy'%(exp,epoch+1),val_acc_history_save)

    
    # Reset the metrics for the next epoch
    train_loss.reset_states()
    val_loss.reset_states()
    val_accuracy.reset_states()
    model.save_weights(checkpoint_path.format(epoch+1))

[Step 100], Loss: 0.41, Accuracy: 87.76 
[Step 200], Loss: 0.37, Accuracy: 86.46 
[Step 300], Loss: 0.35, Accuracy: 88.28 
[Step 400], Loss: 0.34, Accuracy: 88.80 
[Step 500], Loss: 0.33, Accuracy: 90.10 




[Epoch 1], Validation Loss: 0.29, Validation Accuracy: 87.74
-----------------------------------------
[Step 600], Loss: 0.28, Accuracy: 89.32 
[Step 700], Loss: 0.27, Accuracy: 88.28 
[Step 800], Loss: 0.27, Accuracy: 85.42 
[Step 900], Loss: 0.27, Accuracy: 89.06 
[Step 1000], Loss: 0.27, Accuracy: 88.28 
[Step 1100], Loss: 0.27, Accuracy: 89.32 
[Epoch 2], Validation Loss: 0.29, Validation Accuracy: 88.00
-----------------------------------------
[Step 1200], Loss: 0.23, Accuracy: 94.27 
[Step 1300], Loss: 0.22, Accuracy: 92.97 
[Step 1400], Loss: 0.23, Accuracy: 89.32 
[Step 1500], Loss: 0.23, Accuracy: 93.23 
[Step 1600], Loss: 0.23, Accuracy: 89.58 
[Step 1700], Loss: 0.23, Accuracy: 88.54 
[Epoch 3], Validation Loss: 0.30, Validation Accuracy: 87.66
-----------------------------------------


In [176]:
# train on small subset

import math

EPOCHS = 5
step = 0
exp = 1

train_loss_history = []
val_loss_history = []

train_acc_history = []
val_acc_history = []
checkpoint_path = "exp/exp%d/ckpt/epoch-{}.ckpt"%exp

for epoch in range(EPOCHS):
    for sentences, labels in train_dataset.shuffle(50000).take(5):       
        train_step(sentences, labels)
        step+=1
        
        if step%math.ceil(len(train_input1)/BATCH_SIZE)==0:
            train_loss_history.append(train_loss.result())
            train_acc_history.append(train_accuracy.result())

        
        if step%1==0:
            template = '[Step {:0}], Loss: {:.2f}, Accuracy: {:.2f} '
            print(template.format(step,
                           train_loss.result(),
                           train_accuracy.result()*100))
            
            
                            
        # Reset the metrics for the next step
        train_accuracy.reset_states()
               
    for val_sentences, val_labels in val_dataset:
        val_step(val_sentences, val_labels)

    template = '[Epoch {:0}], Validation Loss: {:.2f}, Validation Accuracy: {:.2f}'
    print(template.format(epoch+1,val_loss.result(),val_accuracy.result()*100))
    print('-----------------------------------------')
        
    val_loss_history.append(val_loss.result())
    val_acc_history.append(val_accuracy.result())
   
    
   # Saving history records to HDD
    train_acc_history_save = np.asarray(train_acc_history)
    val_acc_history_save = np.asarray(val_acc_history)

    np.save('exp/exp%d/history/train_loss.npy'%exp,np.asarray(train_loss_history))
    np.save('exp/exp%d/history/val_loss.npy'%exp,np.asarray(val_loss_history))
    
    np.save('exp/exp%d/history/train-acc-epoch%d.npy'%(exp,epoch+1),train_acc_history_save)
    np.save('exp/exp%d/history/val-acc-epoch%d.npy'%(exp,epoch+1),val_acc_history_save)

    
    # Reset the metrics for the next epoch
    train_loss.reset_states()
    val_loss.reset_states()
    val_accuracy.reset_states()
    #model.save_weights(checkpoint_path.format(epoch+1))

[Step 1], Loss: 0.29, Accuracy: 86.98 
[Step 2], Loss: 0.28, Accuracy: 89.32 
[Step 3], Loss: 0.28, Accuracy: 88.54 
[Step 4], Loss: 0.28, Accuracy: 87.76 
[Step 5], Loss: 0.28, Accuracy: 89.32 
[Epoch 1], Validation Loss: 0.30, Validation Accuracy: 87.70
-----------------------------------------
[Step 6], Loss: 0.23, Accuracy: 91.93 
[Step 7], Loss: 0.25, Accuracy: 88.54 
[Step 8], Loss: 0.25, Accuracy: 88.28 
[Step 9], Loss: 0.26, Accuracy: 88.54 
[Step 10], Loss: 0.28, Accuracy: 86.98 
[Epoch 2], Validation Loss: 0.29, Validation Accuracy: 87.69
-----------------------------------------
[Step 11], Loss: 0.28, Accuracy: 90.62 
[Step 12], Loss: 0.30, Accuracy: 86.20 
[Step 13], Loss: 0.30, Accuracy: 88.02 
[Step 14], Loss: 0.28, Accuracy: 93.23 
[Step 15], Loss: 0.29, Accuracy: 85.68 
[Epoch 3], Validation Loss: 0.29, Validation Accuracy: 87.76
-----------------------------------------
[Step 16], Loss: 0.28, Accuracy: 89.06 
[Step 17], Loss: 0.26, Accuracy: 90.62 
[Step 18], Loss: 0.2

In [167]:
model.load_weights('exp/exp1/ckpt/epoch-1.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f257e030be0>

In [33]:
public_testing_set = pd.read_pickle('test_with_embedding.pkl')

In [34]:
public_testing_set['id_bert'] = public_testing_set.Sentences.apply(lambda x:make_id(x))
public_testing_set['mask_bert'] = public_testing_set.Sentences.apply(lambda x:make_mask(x))
public_testing_set['segment_bert'] = public_testing_set.Sentences.apply(lambda x:make_segment(x))

In [35]:
public_testing_set['id_bert'] = public_testing_set.id_bert.apply(lambda x:np.asarray(x))
public_testing_set['mask_bert'] = public_testing_set.mask_bert.apply(lambda x:np.asarray(x))
public_testing_set['segment_bert'] = public_testing_set.segment_bert.apply(lambda x:np.asarray(x))

In [36]:
public_testing_set['id_bert'] = public_testing_set.id_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))
public_testing_set['mask_bert'] = public_testing_set.mask_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))
public_testing_set['segment_bert'] = public_testing_set.segment_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))

In [49]:
public_testing_set['orders'] = public_testing_set.orders.apply(lambda x:np.reshape(x,(1,)))

In [40]:
id_seq = 'T00001'
order = 0
orders = []

for ind,row in public_testing_set.iterrows():
    if row.Id == id_seq:
        order += 1
    else:
        id_seq = row.Id
        order = 1  
    orders.append(order)
    
public_testing_set['orders'] = orders

In [50]:
test_dataset = tf.data.Dataset.from_tensor_slices({'input_word_ids':public_testing_set['id_bert'],
                                                   'input_mask':public_testing_set['mask_bert'],
                                                   'segment_ids':public_testing_set['segment_bert'],
                                                   'orders':public_testing_set['orders']})

In [51]:
r = model.predict(test_dataset)

In [147]:
THRESHOLD = 0.3

In [136]:
empty = np.zeros((submission.shape[0]),dtype=np.int32)
c1 = np.zeros((submission.shape[0]),dtype=np.int32)
c2 = np.zeros((submission.shape[0]),dtype=np.int32)
c3 = np.zeros((submission.shape[0]),dtype=np.int32)
c4 = np.zeros((submission.shape[0]),dtype=np.int32)
c5 = np.zeros((submission.shape[0]),dtype=np.int32)
c6 = np.zeros((submission.shape[0]),dtype=np.int32)

In [137]:
c = [c1,c2,c3,c4,c5,c6]

In [138]:
np.argmax(r[5])

3

In [139]:
count = 0

In [140]:
for i in range(r.shape[0]):
    for j in range(6):
        if r[i][j]>=THRESHOLD:
            c[j][i] = 1
            
    if c[0][i]==0&c[1][i]==0&c[2][i]==0&c[3][i]==0&c[4][i]==0&c[5][i]==0:
        count += 1
        c[np.argmax(r[i])][i] = 1

In [141]:
count

83283

In [142]:
submission=pd.read_csv('dataset/task1_sample_submission.csv') 

In [143]:
submission.head(10)

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,0,0,0,0,0,0
1,T00001_S002,0,0,0,0,0,0
2,T00001_S003,0,0,0,0,0,0
3,T00001_S004,0,0,0,0,0,0
4,T00001_S005,0,0,0,0,0,0
5,T00001_S006,0,0,0,0,0,0
6,T00001_S007,0,0,0,0,0,0
7,T00002_S001,0,0,0,0,0,0
8,T00002_S002,0,0,0,0,0,0
9,T00002_S003,0,0,0,0,0,0


In [144]:
submission.BACKGROUND = c1
submission.OBJECTIVES = c2
submission.METHODS = c3
submission.RESULTS = c4
submission.CONCLUSIONS = c5
submission.OTHERS = c6

In [145]:
submission.head(10)

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,1,0,0,0,0,0
1,T00001_S002,1,1,0,0,0,0
2,T00001_S003,0,1,1,0,0,0
3,T00001_S004,0,1,1,0,0,0
4,T00001_S005,0,0,1,1,0,0
5,T00001_S006,0,0,1,1,0,0
6,T00001_S007,0,0,0,1,1,0
7,T00002_S001,1,0,0,0,0,0
8,T00002_S002,1,1,0,0,0,0
9,T00002_S003,1,0,1,0,0,0


In [146]:
submission.to_csv('summit_file.csv',index=False)