In [1]:
import os
import spacy
from spacy.matcher import PhraseMatcher
import json
from tqdm import tqdm
from spacy_conll import init_parser
import re

In [2]:
with open('../data/entity_corpus/pwc_entities.json') as f:
    entity_type_map = json.load(f)

hp_names = ['number of layers', 'number of units', 'activation function', 'L2 regularization', 'epoch number',
            'number of timesteps', 'lrdecay', 'Trainingepochs', 'history size', 'buffer size', 'buffer length',
            'history length', 'context size', 'context length', 'optimizer', 'attention size', 'attention layers',
            'L1 regularization', 'beta', 'alpha', 'learning rate', 'padding size', 'hidden layer size',
            'hidden dimension size', 'embedding dimension', 'word embedding dimension', 'word embedding size',
            'number of epochs', 'minibatch size', 'mini - batch size' , 'size of minibatch', 'number of samples', 'distance metric',
            'learning rate decay rate', 'decay rate', 'weight decay rate', 'batch size', 'momentum term', 'early stopping criterion',
            'θ', 'gamma', 'γ', 'α', 'β', 'δ','τ' , 'number of batches', 'number of iterations',
            'attention function', 'activaton gate', 'classification threshold', 'number of parameters', 'parameter dimension',
            'number of parameters per layer', 'step size', 'step length', 'epsilon', 'ε', 'eps',
            'max depth', 'maximum depth', 'kernel size', 'kernel dimension', 'number of estimators', 'number of workers']

entity_type_map['HyperparameterName'] = hp_names

nlp = spacy.load("en_core_web_sm")
case_insensitive_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
case_sensitive_matcher = PhraseMatcher(nlp.vocab)

common_vocab = set(nlp.vocab.strings)

entity_type_map_cased = {}
entity_type_map_uncased = {}

for entity_type in entity_type_map:

    entity_type_map_cased[entity_type] = []
    entity_type_map_uncased[entity_type] = []

    for term in entity_type_map[entity_type]:

        if term.lower() in common_vocab:
            entity_type_map_cased[entity_type].append(term)
        else:
            entity_type_map_uncased[entity_type].append(term)

for entity_type in entity_type_map_uncased:
    patterns = [nlp.make_doc(term) for term in entity_type_map_uncased[entity_type]]
    case_insensitive_matcher.add(entity_type, patterns)

for entity_type in entity_type_map_cased:
    patterns = [nlp.make_doc(term) for term in entity_type_map_cased[entity_type]]
    case_sensitive_matcher.add(entity_type, patterns)

In [3]:
test_sub_data = []

path_to_test_data = '../data/test_submission_dataset/'
filename = 'anlp-sciner-test.txt'

with open(os.path.join(path_to_test_data, filename)) as f:
    for line in f:
        test_sub_data.append(line.strip())

In [5]:
predictions = []

for sentence in test_sub_data:
    doc = nlp(sentence)
    matches = case_insensitive_matcher(doc, as_spans=True) + case_sensitive_matcher(doc, as_spans=True)
    matches = [(span.start, span.end, span.label_, span.text) for span in spacy.util.filter_spans(matches)]
    
    split_sentence = sentence.split(" ")
    labels = ['O' for _ in split_sentence]
    
    contains_hp_metric = False
    
    for start, end, tag, _ in matches:
        
        if tag in ['HyperparameterName', 'MetricName']:
            contains_hp_metric = True
        
        labels[start] = f'B-{tag}'
        
        for i in range(start+1, end):
            labels[i] = f'I-{tag}'
    
    if contains_hp_metric:
        
        for idx, token in enumerate(split_sentence):
            
            is_number = False
            
            try:
                float(re.sub('[-kMm,%]', '', token))
                is_number = True
            except ValueError:
                is_number = False
            
            if is_number and labels[idx]=='O':
                
                dx = 1
                
                left_consider = True
                right_consider = True
                
                if idx+dx<len(labels) and idx-dx>-1:
                    if split_sentence[idx-dx] in ['(', ','] and split_sentence[idx+dx]==')':
                        right_consider = False
                        left_consider = False
                        
                    if split_sentence[idx-dx].lower() in ['table', 'section', 'figure', 'appendix', 'paper']:
                        right_consider = False
                        left_consider = False
                
                while idx-dx>-1 or idx+dx<len(labels):
                    
                    if idx-dx>-1 and left_consider:
                        if 'HyperparameterName' in labels[idx-dx]:
                            labels[idx] = 'B-HyperparameterValue'
                        
                        if 'MetricName' in labels[idx-dx]:
                            labels[idx] = 'B-MetricValue'
                            
                        if split_sentence[idx-dx]=='.':
                            left_consider = False
                            
                    elif idx+dx<len(labels) and right_consider:
                        if 'HyperparameterName' in labels[idx+dx]:
                            labels[idx] = 'B-HyperparameterValue'
                        
                        if 'MetricName' in labels[idx+dx]:
                            labels[idx] = 'B-MetricValue'
                            
                        if split_sentence[idx+dx]=='.':
                            right_consider = False
                    
                    if not (left_consider or right_consider):
                        break
                            
                    
                    dx += 1
                
                if labels[idx]!='O' and idx+1<len(labels) and labels[idx+1]=='O' and split_sentence[idx+1]=='%':
                    labels[idx+1] = 'I-' + labels[idx][2:]
                    
     
    predictions.append({'text': sentence, 'entities': labels})
    
    if contains_hp_metric:
        for idx, word in enumerate(split_sentence):
            print(word, '\t', labels[idx])

The 	 O
micro 	 O
F1 	 B-MetricName
- 	 I-MetricName
score 	 I-MetricName
achieved 	 O
by 	 O
the 	 O
fastText 	 B-MethodName
classifier 	 O
significantly 	 O
exceeds 	 O
that 	 O
of 	 O
the 	 O
majority 	 O
class 	 O
baseline 	 O
, 	 O
confirming 	 O
the 	 O
findings 	 O
of 	 O
Romanov 	 O
and 	 O
Shivade 	 O
( 	 O
2018 	 O
) 	 O
, 	 O
who 	 O
report 	 O
a 	 O
micro 	 B-MetricName
- 	 I-MetricName
F1 	 I-MetricName
score 	 O
of 	 O
61.9 	 B-MetricValue
but 	 O
do 	 O
not 	 O
identify 	 O
or 	 O
analyze 	 O
artifacts 	 O
: 	 O
To 	 O
mitigate 	 O
the 	 O
effect 	 O
of 	 O
clinical 	 O
annotation 	 O
artifacts 	 O
, 	 O
we 	 O
employ 	 O
AFLite 	 O
, 	 O
an 	 O
adversarial 	 O
filtering 	 O
algorithm 	 O
introduced 	 O
by 	 O
Sakaguchi 	 O
et 	 O
AFLite 	 O
requires 	 O
distributed 	 O
representations 	 O
of 	 O
the 	 O
full 	 O
dataset 	 O
as 	 O
input 	 O
, 	 O
and 	 O
proceeds 	 O
in 	 O
an 	 O
iterative 	 O
fashion 	 O
. 	 O
At 	 O
each 	 O
iteration 	 O
, 	 O
an 	 O
ensemble 	 O
of

To 	 O
train 	 O
both 	 O
our 	 O
LM 	 O
- 	 O
KT 	 O
knowledge 	 B-TaskName
tracing 	 I-TaskName
model 	 O
and 	 O
our 	 O
question 	 B-TaskName
generation 	 I-TaskName
model 	 O
, 	 O
we 	 O
use 	 O
the 	 O
pre 	 O
- 	 O
trained 	 O
OpenAI 	 O
GPT-2 	 B-MethodName
model 	 O
from 	 O
the 	 O
HuggingFace 	 O
Transformers 	 O
library 	 O
( 	 O
Wolf 	 O
et 	 O
al 	 O
. 	 O
, 	 O
2020 	 O
) 	 O
. 	 O
For 	 O
question 	 B-TaskName
generation 	 I-TaskName
, 	 O
we 	 O
modify 	 O
the 	 O
library 	 O
to 	 O
add 	 O
a 	 O
linear 	 B-MethodName
layer 	 I-MethodName
and 	 O
the 	 O
modified 	 O
loss 	 B-MetricName
function 	 O
for 	 O
question 	 B-TaskName
generation 	 I-TaskName
from 	 O
Section 	 O
3 	 O
. 	 O
We 	 O
use 	 O
1 	 O
NVIDIA 	 O
TitanXP 	 O
GPU 	 O
with 	 O
12 	 O
GB 	 O
of 	 O
memory 	 O
available 	 O
. 	 O
Because 	 O
the 	 O
maximum 	 O
input 	 O
sequence 	 O
length 	 O
of 	 O
the 	 O
GPT-2 	 B-MethodName
model 	 O
we 	 O
use 	 O
is 	 O
1024 	 B-HyperparameterValue
tokens 	 O
,

MQM 	 O
Score 	 B-MetricName
= 	 O
1 	 B-MetricValue
− 	 O
n 	 O
min 	 O
+ 	 O
5n 	 O
maj 	 O
+ 	 O
10n 	 O
cri 	 O
n 	 O
We 	 O
follow 	 O
the 	 O
evaluation 	 O
method 	 O
of 	 O
the 	 O
WMT 	 O
QE 	 O
tasks 	 O
: 	 O
Pearson 	 O
's 	 O
r 	 O
correlation 	 O
as 	 O
the 	 O
main 	 O
metric 	 O
( 	 O
Graham 	 O
, 	 O
2015 	 O
) 	 O
, 	 O
Mean 	 O
- 	 O
Absolute 	 O
Error 	 B-MetricName
( 	 O
MAE 	 B-MetricName
) 	 O
and 	 O
Root 	 O
- 	 O
Mean 	 O
- 	 O
Squared 	 O
Error 	 B-MetricName
( 	 O
RMSE 	 B-MetricName
) 	 O
as 	 O
secondary 	 O
metrics 	 O
. 	 O
For 	 O
statistical 	 O
significance 	 O
on 	 O
Pearson 	 O
's 	 O
r 	 O
, 	 O
we 	 O
compute 	 O
Williams 	 O
test 	 O
( 	 O
Williams 	 O
, 	 O
1959 	 O
) 	 O
as 	 O
suggested 	 O
by 	 O
Graham 	 O
and 	 O
Baldwin 	 O
( 	 O
2014 	 O
) 	 O
. 	 O
We 	 O
here 	 O
present 	 O
the 	 O
results 	 O
of 	 O
our 	 O
inter 	 O
- 	 O
annotator 	 O
agreement 	 O
study 	 O
, 	 O
which 	 O
we 	 O
perform 	 O
in 	 O
order 	 O
to 	 O
estimate 	 O
the

Experiment 	 O
slot 	 B-TaskName
filling 	 I-TaskName
. 	 O
Table 	 O
7 	 O
shows 	 O
the 	 O
macro 	 O
- 	 O
average 	 B-MetricName
F1 	 I-MetricName
scores 	 O
for 	 O
our 	 O
different 	 O
models 	 O
on 	 O
the 	 O
slot 	 O
identification 	 O
task 	 O
. 	 O
10 	 O
As 	 O
for 	 O
entity 	 B-TaskName
typing 	 I-TaskName
, 	 O
we 	 O
train 	 O
and 	 O
evaluate 	 O
our 	 O
model 	 O
on 	 O
the 	 O
subset 	 O
of 	 O
sentences 	 O
marked 	 O
as 	 O
experiment 	 O
- 	 O
describing 	 O
, 	 O
which 	 O
contain 	 O
4,263 	 O
slot 	 O
instances 	 O
. 	 O
Again 	 O
, 	 O
the 	 O
CRF 	 B-MethodName
baseline 	 O
outperforms 	 O
the 	 O
BiLSTM 	 B-MethodName
when 	 O
using 	 O
only 	 O
mat2vec 	 O
and/or 	 O
word2vec 	 O
embeddings 	 O
. 	 O
The 	 O
addition 	 O
of 	 O
BERT 	 B-MethodName
or 	 O
SciBERT 	 O
embeddings 	 O
improves 	 O
performance 	 O
. 	 O
However 	 O
, 	 O
on 	 O
this 	 O
task 	 O
, 	 O
the 	 O
BiLSTM 	 B-MethodName
model 	 O
with 	 O
( 	 O
Sci)BERT 	 O
embeddings 	 O
outperforms

C.1 	 O
Linearity 	 O
with 	 O
Varying 	 O
Context 	 B-HyperparameterName
Size 	 I-HyperparameterName
Shown 	 O
in 	 O
Figure 	 O
5 	 O
, 	 O
we 	 O
compare 	 O
the 	 O
negative 	 O
loglikelihood 	 O
of 	 O
sentences 	 O
when 	 O
conditioned 	 O
on 	 O
varying 	 O
history 	 O
sizes 	 O
( 	 O
using 	 O
the 	 O
story 	 O
summary 	 O
as 	 O
context 	 O
E 	 O
) 	 O
. 	 O
As 	 O
expected 	 O
, 	 O
conditioning 	 O
on 	 O
longer 	 O
histories 	 O
increases 	 O
the 	 O
predictability 	 O
of 	 O
a 	 O
sentence 	 O
. 	 O
However 	 O
, 	 O
this 	 O
effect 	 O
is 	 O
significantly 	 O
larger 	 O
for 	 O
imagined 	 O
stories 	 O
, 	 O
which 	 O
suggests 	 O
that 	 O
imagined 	 O
stories 	 O
flow 	 O
more 	 O
linearly 	 O
than 	 O
recalled 	 O
stories 	 O
. 	 O
Background 	 O
: 	 O
Contrastive 	 B-MethodName
Learning 	 I-MethodName
Instance 	 O
discrimination 	 O
- 	 O
based 	 O
contrastive 	 B-MethodName
learning 	 I-MethodName
aims 	 O
to 	 O
bring 	 O
two 	 O
views 	 O
of 	 O
the 	 O
same 	 O
so

Step 	 O
1 	 O
: 	 O
Preliminary 	 O
annotations 	 O
To 	 O
ensure 	 O
the 	 O
feasibility 	 O
of 	 O
creating 	 O
a 	 O
dataset 	 O
for 	 O
this 	 O
task 	 O
, 	 O
two 	 O
experts 	 O
( 	 O
a 	 O
post 	 O
- 	 O
doctoral 	 O
researcher 	 O
and 	 O
an 	 O
undergraduate 	 O
student 	 O
with 	 O
NLP 	 O
background 	 O
) 	 O
independently 	 O
annotate 	 O
800 	 O
random 	 O
samples 	 O
( 	 O
from 	 O
four 	 O
topics 	 O
, 	 O
200 	 B-HyperparameterValue
per 	 O
topic 	 O
) 	 O
taken 	 O
from 	 O
the 	 O
UKP 	 B-DatasetName
- 	 O
Corpus 	 O
. 	 O
The 	 O
annotations 	 O
are 	 O
binary 	 O
and 	 O
on 	 O
token 	 O
- 	 O
level 	 O
, 	 O
where 	 O
multiple 	 O
spans 	 O
of 	 O
tokens 	 O
could 	 O
be 	 O
selected 	 O
as 	 O
aspects 	 O
. 	 O
The 	 O
resulting 	 O
inter 	 O
- 	 O
annotator 	 O
agreement 	 O
of 	 O
this 	 O
study 	 O
is 	 O
Krippendorff 	 O
's 	 O
α 	 B-HyperparameterName
u 	 O
= 	 O
.38 	 B-HyperparameterValue
. 	 O
While 	 O
this 	 O
shows 	 O
that 	 O
the 	 O
task 	 O
is 	 O


All 	 O
arguments 	 O
of 	 O
the 	 O
training 	 O
documents 	 O
are 	 O
tokenized 	 O
with 	 O
a 	 O
BPE 	 B-MethodName
model 	 O
( 	 O
Sennrich 	 O
et 	 O
al 	 O
. 	 O
, 	 O
2016 	 O
) 	 O
trained 	 O
by 	 O
the 	 O
authors 	 O
of 	 O
the 	 O
CTRL 	 B-MethodName
( 	 O
Keskar 	 O
et 	 O
al 	 O
. 	 O
, 	 O
2019 	 O
) 	 O
. 	 O
Both 	 O
the 	 O
Arg 	 O
- 	 O
CTRL 	 B-MethodName
CC 	 O
and 	 O
the 	 O
Arg 	 O
- 	 O
CTRL 	 B-MethodName
REDDIT 	 B-DatasetName
are 	 O
fine 	 O
- 	 O
tuned 	 O
on 	 O
a 	 O
Tesla 	 O
V100 	 O
with 	 O
32 	 O
GB 	 O
of 	 O
Memory 	 O
. 	 O
We 	 O
mainly 	 O
keep 	 O
the 	 O
default 	 O
hyperparameters 	 O
but 	 O
reduce 	 O
the 	 O
batch 	 B-HyperparameterName
size 	 I-HyperparameterName
to 	 O
4 	 B-HyperparameterValue
and 	 O
train 	 O
both 	 O
models 	 O
for 	 O
1 	 B-HyperparameterValue
epoch 	 O
. 	 O
Each 	 O
model 	 O
takes 	 O
around 	 O
five 	 O
days 	 O
to 	 O
train 	 O
on 	 O
the 	 O
1.6 	 O
M 	 O
training 	 O
sentences 	 O
. 	 O
2 	 O
. 	 O
Candidat

The 	 O
scores 	 O
presented 	 O
are 	 O
significantly 	 O
different 	 O
( 	 O
p 	 O
< 	 O
0.05 	 B-MetricValue
) 	 O
from 	 O
the 	 O
respective 	 O
baseline 	 O
. 	 O
CHRF1 	 O
refers 	 O
to 	 O
character 	 O
n 	 O
- 	 O
gram 	 O
F1 	 B-MetricName
score 	 I-MetricName
( 	 O
Popović 	 O
, 	 O
2015 	 O
) 	 O
. 	 O
The 	 O
models 	 O
in 	 O
italics 	 O
are 	 O
ours 	 O
. 	 O
Overall 	 O
, 	 O
our 	 O
method 	 O
enhances 	 O
the 	 O
lexical 	 O
- 	 O
level 	 O
information 	 O
captured 	 O
by 	 O
pretrained 	 O
MLMs 	 O
, 	 O
as 	 O
shown 	 O
empirically 	 O
. 	 O
This 	 O
is 	 O
consistent 	 O
with 	 O
our 	 O
intuition 	 O
that 	 O
cross 	 O
- 	 O
lingual 	 O
embeddings 	 O
capture 	 O
a 	 O
bilingual 	 O
signal 	 O
that 	 O
can 	 O
benefit 	 O
MLM 	 B-DatasetName
representations 	 O
. 	 O
1 	 O
- 	 O
gram 	 O
precision 	 O
scores 	 O
. 	 O
To 	 O
examine 	 O
whether 	 O
the 	 O
improved 	 O
translation 	 O
performance 	 O
is 	 O
a 	 O
result 	 O
of 	 O
the 	 O
lexical 	 O
- 	 O
level 	

For 	 O
the 	 O
CoNLL 	 O
dataset 	 O
, 	 O
we 	 O
also 	 O
test 	 O
the 	 O
performance 	 O
using 	 O
PPRforNED 	 O
entity 	 O
candidates 	 O
( 	 O
Pershina 	 O
et 	 O
al 	 O
. 	 O
, 	 O
2015 	 O
) 	 O
. 	 O
We 	 O
report 	 O
the 	 O
in 	 O
- 	 O
KB 	 O
accuracy 	 B-MetricName
for 	 O
the 	 O
CoNLL 	 O
dataset 	 O
and 	 O
the 	 O
micro 	 B-MetricName
F1 	 I-MetricName
score 	 O
( 	 O
averaged 	 O
per 	 O
mention 	 O
) 	 O
for 	 O
the 	 O
other 	 O
datasets 	 O
. 	 O
Further 	 O
details 	 O
of 	 O
the 	 O
datasets 	 O
are 	 O
provided 	 O
in 	 O
Appendix 	 O
C. 	 O
Furthermore 	 O
, 	 O
we 	 O
optionally 	 O
fine 	 O
- 	 O
tune 	 O
the 	 O
model 	 O
by 	 O
maximizing 	 O
the 	 O
log 	 O
likelihood 	 O
of 	 O
the 	 O
ED 	 O
predictions 	 O
( 	 O
ŷ 	 O
ED 	 O
) 	 O
using 	 O
the 	 O
training 	 O
set 	 O
of 	 O
the 	 O
CoNLL 	 O
dataset 	 O
with 	 O
the 	 O
KB+YAGO 	 O
candidates 	 O
. 	 O
We 	 O
mask 	 O
90 	 O
% 	 O
of 	 O
the 	 O
mentions 	 O
and 	 O
fix 	 O
the 	 O
entity 	 O
token 	 

one 	 O
pass 	 O
through 	 O
the 	 O
transformer 	 O
for 	 O
k 	 O
noise 	 O
samples 	 O
and 	 O
n 	 O
− 	 O
k 	 O
data 	 O
samples 	 O
. 	 O
However 	 O
, 	 O
this 	 O
procedure 	 O
only 	 O
truly 	 O
minimizes 	 O
L 	 O
ifp 	 O
θ 	 B-HyperparameterName
( 	 O
x 	 O
t 	 O
|x 	 O
\t 	 O
) 	 O
= 	 O
p 	 O
θ 	 B-HyperparameterName
( 	 O
x 	 O
t 	 O
|x 	 O
noised 	 O
\t 	 O
) 	 O
. 	 O
To 	 O
apply 	 O
this 	 O
efficiency 	 O
trick 	 O
we 	 O
are 	 O
making 	 O
the 	 O
assumption 	 O
they 	 O
are 	 O
approximately 	 O
equal 	 O
, 	 O
which 	 O
we 	 O
argue 	 O
is 	 O
reasonable 	 O
because 	 O
( 	 O
1 	 O
) 	 O
we 	 O
choose 	 O
a 	 O
small 	 O
k 	 O
of 	 O
0.15n 	 O
and 	 O
( 	 O
2 	 O
) 	 O
q 	 O
is 	 O
trained 	 O
to 	 O
be 	 O
close 	 O
to 	 O
the 	 O
data 	 O
distribution 	 O
( 	 O
see 	 O
below 	 O
) 	 O
. 	 O
This 	 O
efficiency 	 O
trick 	 O
is 	 O
analogous 	 O
to 	 O
BERT 	 B-MethodName
masking 	 O
out 	 O
multiple 	 O
tokens 	 O
per 	 O
input 	 O
sequence 	 O
. 	 O
While 	 O
ELE

All 	 O
datasets 	 O
and 	 O
their 	 O
splits 	 O
used 	 O
in 	 O
the 	 O
experiments 	 O
are 	 O
listed 	 O
in 	 O
Table 	 O
1 	 O
. 	 O
We 	 O
will 	 O
explain 	 O
each 	 O
of 	 O
them 	 O
in 	 O
the 	 O
following 	 O
sections 	 O
. 	 O
For 	 O
each 	 O
classification 	 O
task 	 O
, 	 O
we 	 O
ran 	 O
and 	 O
improved 	 O
three 	 O
models 	 O
, 	 O
using 	 O
different 	 O
random 	 O
seeds 	 B-DatasetName
, 	 O
independently 	 O
of 	 O
one 	 O
another 	 O
, 	 O
and 	 O
the 	 O
reported 	 O
results 	 O
are 	 O
the 	 O
average 	 O
of 	 O
the 	 O
three 	 O
runs 	 O
. 	 O
Regarding 	 O
the 	 O
models 	 O
, 	 O
we 	 O
used 	 O
1D 	 O
CNNs 	 O
with 	 O
the 	 O
same 	 O
structures 	 O
for 	 O
all 	 O
the 	 O
tasks 	 O
and 	 O
datasets 	 O
. 	 O
The 	 O
convolution 	 B-MethodName
layer 	 O
had 	 O
three 	 O
filter 	 O
sizes 	 O
[ 	 O
2 	 O
, 	 O
3 	 O
, 	 O
4 	 O
] 	 O
with 	 O
10 	 O
filters 	 O
for 	 O
each 	 O
size 	 O
( 	 O
i.e. 	 O
, 	 O
d 	 O
= 	 O
10 	 O
× 	 O
3 	 O
= 	 O
30 	 O
) 	 O


where 	 O
x 	 O
k 	 O
is 	 O
the 	 O
value 	 O
of 	 O
the 	 O
neuron 	 O
k 	 O
, 	 O
g 	 O
is 	 O
a 	 O
nonlinear 	 O
activation 	 B-HyperparameterName
function 	 I-HyperparameterName
, 	 O
w 	 O
jk 	 O
and 	 O
b 	 O
k 	 O
are 	 O
weights 	 O
and 	 O
bias 	 O
in 	 O
the 	 O
network 	 O
, 	 O
respectively 	 O
. 	 O
We 	 O
can 	 O
see 	 O
that 	 O
the 	 O
contribution 	 O
of 	 O
a 	 O
single 	 O
node 	 O
j 	 O
to 	 O
the 	 O
value 	 O
of 	 O
the 	 O
node 	 O
k 	 O
is 	 O
The 	 O
results 	 O
of 	 O
the 	 O
extra 	 O
BiLSTM 	 B-MethodName
experiments 	 O
are 	 O
shown 	 O
in 	 O
Table 	 O
4 	 O
and 	 O
5 	 O
. 	 O
Table 	 O
4 	 O
shows 	 O
unexpected 	 O
results 	 O
after 	 O
disabling 	 O
features 	 O
. 	 O
For 	 O
instance 	 O
, 	 O
disabling 	 O
rank 	 O
B 	 O
features 	 O
caused 	 O
a 	 O
larger 	 O
performance 	 O
drop 	 O
than 	 O
removing 	 O
rank 	 O
A 	 O
features 	 O
. 	 O
This 	 O
suggests 	 O
that 	 O
how 	 O
we 	 O
created 	 O
word 	 O
clouds 	 O
for 	 O
each 	 O
BiLSTM 	 B-Me

In [6]:
with open(os.path.join(path_to_test_data, filename[:-4]+'_ngram.conll'), 'w') as f:
    
    for prediction in predictions:
        
        for idx, token in enumerate(prediction['text'].split(' ')):
            tag = prediction['entities'][idx]
            f.write(f'{token}\t{tag}\n')
            
        f.write('\n')