## Bert 

In [3]:
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
import numpy as np
from bert import tokenization
from bert import bert_tokenization
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model

## GPU settings

In [4]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [5]:
tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

## Data preprocessing

In [6]:
max_seq_length = 300

In [3]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


AttributeError: 'Tensor' object has no attribute 'numpy'

In [8]:
vocab_file

b'/tmp/tfhub_modules/03d6fb3ce1605ad9e5e9ed5346b2fb9623ef4d3d/assets/vocab.txt'

In [9]:
def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [10]:
def make_id(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    #input_masks = get_masks(stokens, max_seq_length)
    #input_segments = get_segments(stokens, max_seq_length)
    #pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    return input_ids

def make_mask(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    #input_ids = get_ids(stokens, tokenizer, max_seq_length)
    input_masks = get_masks(stokens, max_seq_length)
    #input_segments = get_segments(stokens, max_seq_length)
    #pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    return input_masks

def make_segment(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    #input_ids = get_ids(stokens, tokenizer, max_seq_length)
    #input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)
    #pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

    return input_segments

In [133]:
df_extend['replaced_sentences'] = df_extend.Sentences.apply(lambda x:replace_words(x))

In [134]:
df_extend['id_bert'] = df_extend.replaced_sentences.apply(lambda x:make_id(x))
df_extend['mask_bert'] = df_extend.replaced_sentences.apply(lambda x:make_mask(x))
df_extend['segment_bert'] = df_extend.replaced_sentences.apply(lambda x:make_segment(x))

In [11]:
df_extend = pd.read_pickle('input_bert.pkl')

In [12]:
id_seq = 'D00001'
order = 0
orders = []

for ind,row in df_extend.iterrows():
    if row.Id == id_seq:
        order += 1
    else:
        id_seq = row.Id
        order = 1  
    orders.append(order)

In [13]:
max(orders) # prepare for the one-hot encoding???

24

In [14]:
df_extend['orders'] = orders

In [794]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_extend.replaced_sentences)

In [795]:
len(vectorizer.get_feature_names())

27177

In [824]:
term = X.toarray()

In [825]:
sum_tf = np.sum(term,axis=0)

In [826]:
sum_tf.shape

(27177,)

In [827]:
np.max(sum_tf)

3066.5393238033394

In [828]:
names = np.asarray(vectorizer.get_feature_names())

In [829]:
names.shape

(27177,)

In [830]:
N = 1500

In [831]:
tops = {}

In [832]:
for i in range(N):
    ind = np.argmax(sum_tf)
    number = np.max(sum_tf)
    
    tops[names[ind]] = number
    
    sum_tf = np.delete(sum_tf,ind,axis=0)
    names = np.delete(names,ind,axis=0)

In [833]:
tops_list = list(tops.keys())

In [834]:
for top in tops_list:
    ans = top in tokenizer.vocab
    if ans == False:
        print('{} --> {} times'.format(top,tops[top]))

markov --> 44.59519837401201 times
decoding --> 43.77039498843173 times
throughput --> 43.030899105701906 times
iterative --> 42.29024989754903 times
validate --> 39.66675726575603 times
encoder --> 38.65493288243426 times
rnn --> 37.9175042300652 times
heterogeneous --> 37.559594303700806 times
scalable --> 36.90972490284447 times
benchmarks --> 36.509035115650725 times
trajectories --> 35.332983583481266 times
deterministic --> 34.90690307694297 times
characterize --> 34.831454320322706 times
decoder --> 34.823558437751196 times
validated --> 34.59437989467476 times
annotated --> 33.77517227935703 times
algorithmic --> 33.01938928291916 times
optimize --> 33.01776231579412 times
optimized --> 32.846703752692 times
formulate --> 32.463415502061814 times
empirically --> 32.118881456065324 times
baselines --> 32.00779768425649 times
theoretic --> 31.370530059815987 times
visualization --> 31.214197828200376 times
github --> 30.491384659114022 times
generalize --> 30.41869935511241 times

In [132]:
import nltk
nltk.download('punkt')

def replace_words(sentence):
    tokens = nltk.word_tokenize(sentence)
    filtered = []
    
    for token in tokens:
        token = token.lower()
        if token=='dataset':
            token = 'data'
        if token=='datasets':
            token = 'data'
        if token=='convolutional' or token=='recurrent' or token=='embedding' \
        or token=='embeddings' or token=='unsupervised' or token=='bayesian' \
        or token=='cnns' or token=='lstm' or token=='rnn':
            token = 'technical'
        if token=='segmentation':
            token = 'segment'
        if token=='outperforms' or token=='outperform':
            token = 'better'
        if token == 'variational':
            token = 'variation'
        if token == 'clustering':
            token = 'cluster'
        if token == 'generative':
            token = 'generate'
        if token == 'benchmark':
            token = 'standard'
        if token == 'achieves':
            token = 'achieve'
        if token == 'real-world':
            token = 'world'
        if token == 'adversarial':
            token = 'adversary'
        if token == 'state-of-the-art':
            token = 'newest'
        if token == 'architectures':
            token = 'architecture'
        if token == 'metrics':
            token = 'metric'
        if token == 'stochastic':
            token = 'random'
        if token == 'probabilistic':
            token = 'probability'
        if token == 'latent':
            token = 'hidden'
        if token == 'classifier' or token == 'classifiers':
            token = 'classify'
        if token == 'robustness':
            token = 'robust'
        if token == 'gaussian':            
            token = 'normal'
        if token == 'large-scale':
            token = 'large'
        if token == 'localization':
            token = 'location'
        if token == 'queries':
            token = 'query'
        
        if token == 'iot':
            token = 'internet'
        if token == 'generalization':
            token = 'general'
        if token == 'predictive':
            token = 'predict'
        if token == 'computationally':
            token = 'computational'
        if token == 'visualization':
            token = 'visual'
        if token == 'e.g':
            token = 'example'
        if token == 'i.e':
            token = 'then'
        if token == '\'s':
            token = ''
        
        
        filtered.append(token)
        
    filtered_sentence = ' '.join(filtered)
    return filtered_sentence

[nltk_data] Downloading package punkt to /home/ielab33949/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
nltk.word_tokenize('e.g')

['e.g']

In [875]:
con = {}
for ind,sentence in df_extend.replaced_sentences.iteritems():
    tokens = nltk.word_tokenize(sentence)
    for token in tokens:
        token = token.lower()
        if token not in tokenizer.vocab:
            if token in list(con.keys()):
                con[token] += 1
            else:
                con[token] = 1

In [876]:
len(con.keys())

26450

In [883]:
for item in con.keys():
    if con[item] >= 150:
        print('{} --> {} times'.format(item,con[item]))

end-to-end --> 237 times
markov --> 185 times
e.g --> 274 times
i.e --> 338 times
real-time --> 197 times
-- --> 373 times
`` --> 736 times
'' --> 723 times
throughput --> 165 times
iterative --> 172 times
trajectories --> 151 times
decoding --> 168 times
heterogeneous --> 154 times


In [852]:
tokenizer.tokenize('danny\'s')

['danny', "'", 's']

In [884]:
'e.g' in tokenizer.vocab

False

In [222]:
'iot'.split('#')

['iot']

In [744]:
not_used_words = {}

In [745]:
not_used_sentences = []

In [746]:
for item in df_extend.replaced_sentences.iteritems():
    tokens = tokenizer.tokenize(str(item))
    for token in tokens:
        if len(token.split('#'))>1:
            if token in not_used_words:
                not_used_words[token] = not_used_words[token] + 1
            else:
                not_used_words[token] = 1

In [747]:
not_used_words

{'##uro': 82,
 '##sc': 42,
 '##ient': 86,
 '##ists': 9,
 '##lev': 24,
 '##el': 95,
 '##channel': 10,
 '##gno': 45,
 '##se': 182,
 '##2': 3302,
 '##e': 658,
 '##mm': 96,
 '##worth': 11,
 '##iness': 18,
 '##s': 8019,
 '##ins': 28,
 '##point': 58,
 '##con': 241,
 '##stra': 74,
 '##ined': 34,
 '##al': 887,
 '##ine': 115,
 '##p': 954,
 '##time': 107,
 '##ret': 192,
 '##ization': 901,
 '##amp': 54,
 '##ling': 179,
 '##ing': 2498,
 '##m': 931,
 '##fer': 163,
 '##li': 69,
 '##fication': 63,
 '##lings': 6,
 '##ers': 214,
 '##a': 640,
 '##ised': 53,
 '##uri': 215,
 '##stic': 426,
 '##bility': 511,
 '##ag': 27,
 '##able': 564,
 '##ance': 89,
 '##r': 1223,
 '##q': 267,
 '##to': 85,
 '##gram': 85,
 '##f': 649,
 '##para': 108,
 '##meter': 75,
 '##nn': 315,
 '##net': 374,
 '##i': 505,
 '##ra': 57,
 '##ca': 52,
 '##ding': 290,
 '##tem': 6,
 '##en': 171,
 '##code': 446,
 '##ano': 9,
 '##bis': 2,
 '##her': 73,
 '##ence': 99,
 '##group': 20,
 '##ps': 191,
 '##mail': 3,
 '##ally': 384,
 '##tas': 42,
 '##k

In [723]:
tokenizer.vocab['internet']

4274

In [724]:
#df_extend.Label.value_counts()

In [18]:
df_extend.replaced_sentences.tolist()

['rapid popularity of internet of things ( iot ) and cloud computing permits neuroscientists to collect multilevel and multichannel brain data to better understand brain functions , diagnose diseases , and devise treatments .',
 'to ensure secure and reliable data communication between end-to-end ( e2e ) devices supported by current iot and cloud infrastructure , trust management is needed at the iot and user ends .',
 'this paper introduces a neuro-fuzzy based brain-inspired trust management model ( tmm ) to secure iot devices and relay nodes , and to ensure data reliability .',
 'the proposed tmm utilizes node behavioral trust and data trust estimated using adaptive neuro-fuzzy inference system and weighted-additive methods respectively to assess the nodes trustworthiness .',
 'in contrast to the existing fuzzy based tmms , the ns2 simulation results confirm the robust and accuracy of the proposed tmm in identifying malicious nodes in the communication network .',
 'with the growing 

In [135]:
X_all = []

for ind,item in df_extend.id_bert.iteritems():
    X_all.append([df_extend.id_bert[ind],
                  df_extend.mask_bert[ind],
                  df_extend.segment_bert[ind],
                  df_extend.orders[ind]])

In [136]:
# Extract x and y from the dataframe
y_all = df_extend.label_y.values.tolist()

# y: convert it into one-hot encoder
for i in range(len(y_all)):
    y_all[i] =  tf.one_hot(y_all[i],depth=6)

# some y have more than one tensor --> add them together!
y_all_combine = []

for i in range(len(y_all)):
    if y_all[i].shape[0]>1:
        tmp = tf.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0],shape=(1,6))
        for j in range(len(y_all[i])):
            tmp = tmp + y_all[i][j]
        y_all_combine.append(tmp)
    else:
        y_all_combine.append(y_all[i])

In [137]:
for i in range(len(y_all_combine)):
    y_all_combine[i] = tf.reshape(y_all_combine[i],(6,))

In [138]:
len(X_all)

46867

In [139]:
len(y_all_combine)

46867

## Create dataset

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all_combine, test_size=0.2) 

In [141]:
y_train[0].shape

TensorShape([6])

In [142]:
train_input1 = []
train_input2 = []
train_input3 = []
train_input4 = []
for i in range(len(X_train)):
    train_input1.append(X_train[i][0])
    train_input2.append(X_train[i][1])
    train_input3.append(X_train[i][2])
    train_input4.append(X_train[i][3])

val_input1 = []
val_input2 = []
val_input3 = []
val_input4 = []
for j in range(len(X_test)):
    val_input1.append(X_test[j][0])
    val_input2.append(X_test[j][1])
    val_input3.append(X_test[j][2])
    val_input4.append(X_test[j][3])

In [143]:
len(train_input1)

37493

In [144]:
BATCH_SIZE=64

In [145]:
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_1": train_input1, "input_2": train_input2, "input_3": train_input3,"input_4": train_input4},y_train)).shuffle(50000).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices(({"input_1": val_input1, "input_2": val_input2, "input_3": val_input3,"input_4": val_input4},y_test)).shuffle(50000).batch(BATCH_SIZE)

## Model settings

In [146]:
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [147]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.BinaryAccuracy(name='val_accuracy')

In [148]:
@tf.function
def train_step(sentences, labels):
    with tf.GradientTape() as tape:
        #print(sentences['input_4'].shape)
        out = model([sentences['input_1'],
                     sentences['input_2'],
                     sentences['input_3'],
                     tf.reshape(sentences['input_4'],(-1,1))])    
        # Calculate the loss of each class
        loss = loss_object(labels, out)      
        
    train_loss(loss) # Calculate accumulative average loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_accuracy(labels, out)

In [149]:
@tf.function
def val_step(sentences, labels):
    out = model([sentences['input_1'],
                 sentences['input_2'],
                 sentences['input_3'],
                 tf.reshape(sentences['input_4'],(-1,1))])    
    loss = loss_object(labels, out)   
    val_loss(loss)    
    val_accuracy(labels,out)

In [150]:
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
input_order = tf.keras.layers.Input(shape=(1), dtype=tf.int32, name="orders")


pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
x_order = tf.keras.layers.Dense(1)(input_order)

merge_x = tf.concat([pooled_output, x_order], axis=1)

#x = tf.keras.layers.Dropout(0.3)(merge_x)
x = tf.keras.layers.Dense(6, activation='sigmoid')(merge_x)

model = Model(inputs=[input_word_ids, input_mask, segment_ids, input_order], outputs=x)

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 300)]        0                                            
__________________________________________________________________________________________________
orders (InputLayer)             [(None, 1)]          0                                            
____________________________________________________________________________________________

In [151]:
train_dataset

<BatchDataset shapes: ({input_1: (None, 300), input_2: (None, 300), input_3: (None, 300), input_4: (None,)}, (None, 6)), types: ({input_1: tf.int32, input_2: tf.int32, input_3: tf.int32, input_4: tf.int32}, tf.float32)>

In [152]:
# train on whole dataset

import math

EPOCHS = 3
step = 0
exp = 1

train_loss_history = []
val_loss_history = []

train_acc_history = []
val_acc_history = []
checkpoint_path = "exp/exp%d/ckpt/epoch-{}.ckpt"%exp

for epoch in range(EPOCHS):
    for sentences, labels in train_dataset:       
        train_step(sentences, labels)
        step+=1
        
        if step%math.ceil(len(train_input1)/BATCH_SIZE)==0:
            train_loss_history.append(train_loss.result())
            train_acc_history.append(train_accuracy.result())

        
        if step%100==0:
            template = '[Step {:0}], Loss: {:.2f}, Accuracy: {:.2f} '
            print(template.format(step,
                           train_loss.result(),
                           train_accuracy.result()*100))
            
            
                            
        # Reset the metrics for the next step
        train_accuracy.reset_states()
               
    for val_sentences, val_labels in val_dataset:
        val_step(val_sentences, val_labels)

    template = '[Epoch {:0}], Validation Loss: {:.2f}, Validation Accuracy: {:.2f}'
    print(template.format(epoch+1,val_loss.result(),val_accuracy.result()*100))
    print('-----------------------------------------')
        
    val_loss_history.append(val_loss.result())
    val_acc_history.append(val_accuracy.result())
   
    
   # Saving history records to HDD
    train_acc_history_save = np.asarray(train_acc_history)
    val_acc_history_save = np.asarray(val_acc_history)

    np.save('exp/exp%d/history/train_loss.npy'%exp,np.asarray(train_loss_history))
    np.save('exp/exp%d/history/val_loss.npy'%exp,np.asarray(val_loss_history))
    
    np.save('exp/exp%d/history/train-acc-epoch%d.npy'%(exp,epoch+1),train_acc_history_save)
    np.save('exp/exp%d/history/val-acc-epoch%d.npy'%(exp,epoch+1),val_acc_history_save)

    
    # Reset the metrics for the next epoch
    train_loss.reset_states()
    val_loss.reset_states()
    val_accuracy.reset_states()
    model.save_weights(checkpoint_path.format(epoch+1))

[Step 100], Loss: 0.36, Accuracy: 89.06 
[Step 200], Loss: 0.33, Accuracy: 88.54 
[Step 300], Loss: 0.31, Accuracy: 89.32 
[Step 400], Loss: 0.30, Accuracy: 85.42 
[Step 500], Loss: 0.30, Accuracy: 89.32 
[Epoch 1], Validation Loss: 0.27, Validation Accuracy: 88.77
-----------------------------------------
[Step 600], Loss: 0.25, Accuracy: 91.41 
[Step 700], Loss: 0.25, Accuracy: 89.58 
[Step 800], Loss: 0.25, Accuracy: 87.24 
[Step 900], Loss: 0.24, Accuracy: 89.32 
[Step 1000], Loss: 0.25, Accuracy: 91.15 
[Step 1100], Loss: 0.25, Accuracy: 91.41 
[Epoch 2], Validation Loss: 0.27, Validation Accuracy: 88.79
-----------------------------------------
[Step 1200], Loss: 0.21, Accuracy: 92.19 
[Step 1300], Loss: 0.20, Accuracy: 91.41 
[Step 1400], Loss: 0.20, Accuracy: 92.45 
[Step 1500], Loss: 0.20, Accuracy: 90.10 
[Step 1600], Loss: 0.20, Accuracy: 89.84 
[Step 1700], Loss: 0.20, Accuracy: 94.53 
[Epoch 3], Validation Loss: 0.29, Validation Accuracy: 88.49
----------------------------

In [153]:
model.load_weights('exp/exp1/ckpt/epoch-1.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa1601c64e0>

### Private testing set

In [43]:
# convert private set into bert input type

private_csv = pd.read_csv('task1_private_testset.csv')

columns = ['Id','Title','Sentences','Authors',
           'Categories','Created Date']
private_test = pd.DataFrame(columns=columns)

for index, row in private_csv.iterrows():
    sentences = row['Abstract'].split('$$$')
    #labels = row['Task 1'].split(' ')
    
    if index%500==0:
        print(index)
        
    for i in range(len(sentences)):
        s = pd.Series({'Id':row['Id'], 'Title':row['Title'],'Sentences':sentences[i],
                      'Authors':row['Authors'] , 'Categories':row['Categories'],
                      'Created Date':row['Created Date']})
        private_test = private_test.append(s, ignore_index=True)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500


In [55]:
private_test.head(10)

Unnamed: 0,Id,Title,Sentences,Authors,Categories,Created Date,replaced_sentences,id_bert,mask_bert,segment_bert,orders
0,T20001,"Smart ""Predict, then Optimize""",Many real-world analytics problems involve two...,Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22,many world analytics problems involve two sign...,"[101, 2116, 2088, 25095, 3471, 9125, 2048, 327...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[1]
1,T20001,"Smart ""Predict, then Optimize""",Due to the typically complex nature of each ch...,Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22,due to the typically complex nature of each ch...,"[101, 2349, 2000, 1996, 4050, 3375, 3267, 1997...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[2]
2,T20001,"Smart ""Predict, then Optimize""","By and large, machine learning tools are inten...",Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22,"by and large , machine learning tools are inte...","[101, 2011, 1998, 2312, 1010, 3698, 4083, 5906...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[3]
3,T20001,"Smart ""Predict, then Optimize""","In contrast, we propose a new and very general...",Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22,"in contrast , we propose a new and very genera...","[101, 1999, 5688, 1010, 2057, 16599, 1037, 204...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[4]
4,T20001,"Smart ""Predict, then Optimize""",A key component of our framework is the SPO lo...,Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22,a key component of our framework is the spo lo...,"[101, 1037, 3145, 6922, 1997, 2256, 7705, 2003...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[5]
5,T20001,"Smart ""Predict, then Optimize""",Training a model with respect to the SPO loss ...,Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22,training a model with respect to the spo loss ...,"[101, 2731, 1037, 2944, 2007, 4847, 2000, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[6]
6,T20001,"Smart ""Predict, then Optimize""",We also propose a stochastic gradient descent ...,Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22,we also propose a random gradient descent algo...,"[101, 2057, 2036, 16599, 1037, 6721, 17978, 69...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[7]
7,T20001,"Smart ""Predict, then Optimize""","Finally, we perform computational experiments ...",Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22,"finally , we perform computational experiments...","[101, 2633, 1010, 2057, 4685, 15078, 7885, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[8]
8,T20002,On the variable hierarchy of first-order spectra,The spectrum of a first-order logic sentence i...,Kopczynski/Tan,cs.LO/cs.CC,2014-03-10,the spectrum of a first-order logic sentence i...,"[101, 1996, 8674, 1997, 1037, 2034, 1011, 2344...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[1]
9,T20002,On the variable hierarchy of first-order spectra,In this paper we study the hierarchy of first-...,Kopczynski/Tan,cs.LO/cs.CC,2014-03-10,in this paper we study the hierarchy of first-...,"[101, 1999, 2023, 3259, 2057, 2817, 1996, 1257...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[2]


In [154]:
private_test['replaced_sentences'] = private_test.Sentences.apply(lambda x:replace_words(x))

In [155]:
private_test['id_bert'] = private_test.replaced_sentences.apply(lambda x:make_id(x))
private_test['mask_bert'] = private_test.replaced_sentences.apply(lambda x:make_mask(x))
private_test['segment_bert'] = private_test.replaced_sentences.apply(lambda x:make_segment(x))

In [156]:
private_test['id_bert'] = private_test.id_bert.apply(lambda x:np.asarray(x))
private_test['mask_bert'] = private_test.mask_bert.apply(lambda x:np.asarray(x))
private_test['segment_bert'] = private_test.segment_bert.apply(lambda x:np.asarray(x))

In [157]:
private_test['id_bert'] = private_test.id_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))
private_test['mask_bert'] = private_test.mask_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))
private_test['segment_bert'] = private_test.segment_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))

In [50]:
id_seq = 'T20001'
order = 0
orders = []

for ind,row in private_test.iterrows():
    if row.Id == id_seq:
        order += 1
    else:
        id_seq = row.Id
        order = 1  
    orders.append(order)

In [51]:
private_test['orders'] = orders

In [52]:
private_test['orders'] = private_test.orders.apply(lambda x:np.reshape(x,(1,)))

In [158]:
private_test_dataset = tf.data.Dataset.from_tensor_slices({'input_word_ids':private_test['id_bert'],
                                                   'input_mask':private_test['mask_bert'],
                                                   'segment_ids':private_test['segment_bert'],
                                                   'orders':private_test['orders']})

In [159]:
r = model.predict(private_test_dataset)

### Public testing set

In [160]:
public_testing_set = pd.read_pickle('test_with_embedding.pkl')

In [161]:
public_testing_set['replaced_sentences'] = public_testing_set.Sentences.apply(lambda x:replace_words(x))

In [162]:
public_testing_set['id_bert'] = public_testing_set.replaced_sentences.apply(lambda x:make_id(x))
public_testing_set['mask_bert'] = public_testing_set.replaced_sentences.apply(lambda x:make_mask(x))
public_testing_set['segment_bert'] = public_testing_set.replaced_sentences.apply(lambda x:make_segment(x))

In [163]:
public_testing_set['id_bert'] = public_testing_set.id_bert.apply(lambda x:np.asarray(x))
public_testing_set['mask_bert'] = public_testing_set.mask_bert.apply(lambda x:np.asarray(x))
public_testing_set['segment_bert'] = public_testing_set.segment_bert.apply(lambda x:np.asarray(x))

In [164]:
public_testing_set['id_bert'] = public_testing_set.id_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))
public_testing_set['mask_bert'] = public_testing_set.mask_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))
public_testing_set['segment_bert'] = public_testing_set.segment_bert.apply(lambda x:np.reshape(x,(1,max_seq_length)))

In [165]:
id_seq = 'T00001'
order = 0
orders = []

for ind,row in public_testing_set.iterrows():
    if row.Id == id_seq:
        order += 1
    else:
        id_seq = row.Id
        order = 1  
    orders.append(order)
    
public_testing_set['orders'] = orders

In [166]:
public_testing_set['orders'] = public_testing_set.orders.apply(lambda x:np.reshape(x,(1,)))

In [167]:
test_dataset = tf.data.Dataset.from_tensor_slices({'input_word_ids':public_testing_set['id_bert'],
                                                   'input_mask':public_testing_set['mask_bert'],
                                                   'segment_ids':public_testing_set['segment_bert'],
                                                   'orders':public_testing_set['orders']})

In [168]:
r_public = model.predict(test_dataset)

In [169]:
r_public.shape

(131166, 6)

In [170]:
r.shape

(131782, 6)

In [171]:
final = np.concatenate((r_public, r), axis=0)

In [172]:
final.shape

(262948, 6)

### Output results

In [185]:
THRESHOLD = 0.4

In [186]:
empty = np.zeros((submission.shape[0]),dtype=np.int32)
c1 = np.zeros((submission.shape[0]),dtype=np.int32)
c2 = np.zeros((submission.shape[0]),dtype=np.int32)
c3 = np.zeros((submission.shape[0]),dtype=np.int32)
c4 = np.zeros((submission.shape[0]),dtype=np.int32)
c5 = np.zeros((submission.shape[0]),dtype=np.int32)
c6 = np.zeros((submission.shape[0]),dtype=np.int32)

In [187]:
c = [c1,c2,c3,c4,c5,c6]

In [188]:
np.argmax(final[5])

3

In [189]:
count = 0

In [190]:
for i in range(final.shape[0]):
    for j in range(6):
        if final[i][j]>=THRESHOLD:
            c[j][i] = 1
            
    if c[0][i]==0&c[1][i]==0&c[2][i]==0&c[3][i]==0&c[4][i]==0&c[5][i]==0:
        count += 1
        c[np.argmax(final[i])][i] = 1

In [191]:
count

194227

In [180]:
submission=pd.read_csv('dataset/task1_sample_submission.csv') 

In [181]:
submission.head(10)

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,0,0,0,0,0,0
1,T00001_S002,0,0,0,0,0,0
2,T00001_S003,0,0,0,0,0,0
3,T00001_S004,0,0,0,0,0,0
4,T00001_S005,0,0,0,0,0,0
5,T00001_S006,0,0,0,0,0,0
6,T00001_S007,0,0,0,0,0,0
7,T00002_S001,0,0,0,0,0,0
8,T00002_S002,0,0,0,0,0,0
9,T00002_S003,0,0,0,0,0,0


In [182]:
submission.BACKGROUND = c1
submission.OBJECTIVES = c2
submission.METHODS = c3
submission.RESULTS = c4
submission.CONCLUSIONS = c5
submission.OTHERS = c6

In [183]:
submission.head(10)

Unnamed: 0,order_id,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001_S001,1,0,0,0,0,0
1,T00001_S002,1,0,0,0,0,0
2,T00001_S003,0,1,1,0,0,0
3,T00001_S004,0,1,1,0,0,0
4,T00001_S005,0,0,1,1,0,0
5,T00001_S006,0,0,1,1,0,0
6,T00001_S007,0,0,0,1,1,0
7,T00002_S001,1,0,0,0,0,0
8,T00002_S002,0,1,0,0,0,0
9,T00002_S003,0,0,1,0,0,0


In [184]:
submission.to_csv('summit_file.csv',index=False)