In [1]:
from pathlib import Path
import re
import torch
from transformers import TrainingArguments


def read_lsconll(file_path):
    """Read a space-separated CoNLL-style file into parallel token/tag lists.

    Documents are separated by a blank line (which may contain a single tab).
    Every non-empty line must hold either four space-separated fields
    (token, two ignored columns, tag) or three (token, one ignored column, tag).

    Args:
        file_path: Location of the file, as a ``str`` or ``pathlib.Path``.

    Returns:
        A ``(token_docs, tag_docs)`` tuple. Both are lists with one inner list
        per document; ``token_docs[i][j]`` is tagged with ``tag_docs[i][j]``.
        An empty or whitespace-only file yields ``([], [])``.

    Raises:
        ValueError: If a non-empty line has neither four nor three fields.
    """
    raw_text = Path(file_path).read_text().strip()
    if not raw_text:
        # Nothing to parse; avoid failing on the single empty "line" that
        # splitting an empty string would produce.
        return [], []

    token_docs = []
    tag_docs = []
    for doc in re.split(r'\n\t?\n', raw_text):
        tokens = []
        tags = []
        for line in doc.split('\n'):
            if not line:
                # Three or more consecutive newlines leave an empty line
                # behind after the document split; it carries no token.
                continue

            fields = line.split(' ')
            if len(fields) not in (3, 4):
                raise ValueError(
                    f'Expected 3 or 4 space-separated fields, got {len(fields)}: {line!r}'
                )

            # The token is always the first column and the tag the last one.
            tokens.append(fields[0])
            tags.append(fields[-1])

        if tokens:
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

def read_conll(file_path):
    """Read a two-column CoNLL file into parallel token/tag lists.

    Documents are separated by a blank line (which may contain a single tab).
    Every non-empty line must hold exactly two fields, ``token`` and ``tag``,
    separated by a tab; a single space is accepted as a fallback separator.

    Args:
        file_path: Location of the file, as a ``str`` or ``pathlib.Path``.

    Returns:
        A ``(token_docs, tag_docs)`` tuple. Both are lists with one inner list
        per document; ``token_docs[i][j]`` is tagged with ``tag_docs[i][j]``.
        An empty or whitespace-only file yields ``([], [])``.

    Raises:
        ValueError: If a non-empty line cannot be split into exactly two
            fields by either separator.
    """
    raw_text = Path(file_path).read_text().strip()
    if not raw_text:
        # Nothing to parse; avoid failing on the single empty "line" that
        # splitting an empty string would produce.
        return [], []

    token_docs = []
    tag_docs = []
    for doc in re.split(r'\n\t?\n', raw_text):
        tokens = []
        tags = []
        for line in doc.split('\n'):
            if not line:
                # Three or more consecutive newlines leave an empty line
                # behind after the document split; it carries no token.
                continue

            fields = line.split('\t')
            if len(fields) != 2:
                # Fall back to space-separated columns.
                fields = line.split(' ')
            if len(fields) != 2:
                raise ValueError(
                    f'Expected "token<TAB>tag" or "token tag", got: {line!r}'
                )

            token, tag = fields
            tokens.append(token)
            tags.append(tag)

        if tokens:
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

In [2]:
from simpletransformers.ner import NERModel, NERArgs
from itertools import product

# Label inventory: a "B-" and an "I-" variant of every entity type, followed by "O".
labels_prefixes = ["B-", "I-"]
labels_suffixes = [
    "MethodName",
    "HyperparameterName",
    "HyperparameterValue",
    "MetricName",
    "MetricValue",
    "TaskName",
    "DatasetName",
]
labels_list = [
    prefix + suffix
    for prefix in labels_prefixes
    for suffix in labels_suffixes
]
labels_list.append("O")

# Training configuration handed to the NER model below.
model_args = NERArgs()
model_args.labels_list = labels_list
model_args.max_seq_length = 512
model_args.train_batch_size = 32
model_args.learning_rate = 1e-4
model_args.num_train_epochs = 10
model_args.evaluate_during_training = True
# NOTE(review): only the "consider epochs" flag is set here; presumably early
# stopping itself also has to be switched on for it to matter -- confirm.
model_args.early_stopping_consider_epochs = True

model = NERModel(
    "bert",
    "allenai/scibert_scivocab_cased",
    use_cuda=False,
    args=model_args,
)

# CUDA is disabled above; when the Apple MPS backend is available, point the
# model at it instead.
if torch.backends.mps.is_available():
    model.device = 'mps'

NOTE: Redirects are currently not supported in Windows or MacOs.
Some weights of the model checkpoint at allenai/scibert_scivocab_cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

In [3]:
import pandas as pd
import os

# Training inputs: each entry is either a directory of CoNLL files or a single
# CoNLL file.
train_data_paths = [
    "../data/final_dataset/manual_formatted/train/",
    "../data/final_dataset/auto/train/rest.conll",
    "../data/final_dataset/auto/train/metric_hp_plus_num_tag.conll",
]

# Evaluation input, used for evaluation during and after training.
eval_data_path = "../data/final_dataset/manual_formatted/test/"


def get_df_from_conll(paths):
    """Load one or more CoNLL files into a token-level DataFrame.

    Args:
        paths: A single path or an iterable of paths. Each path is either a
            CoNLL file or a directory; for a directory, every regular,
            non-hidden file directly inside it is read, in sorted name order.

    Returns:
        A ``pandas.DataFrame`` with one row per token and the columns
        ``sentence_id`` (running index over all sentences of all files, in
        read order), ``words`` and ``labels``.
    """
    # A lone path (str / bytes / PathLike) is treated as a one-element list;
    # any other iterable (list, tuple, ...) is walked as-is.
    if isinstance(paths, (str, bytes, os.PathLike)):
        paths = [paths]

    rows = []
    sentence_num = 0

    for path in paths:
        if os.path.isdir(path):
            # os.listdir order is arbitrary, so sort for reproducible
            # sentence ids. Hidden entries (e.g. ".DS_Store") and
            # sub-directories are not CoNLL files and are skipped.
            candidates = [
                os.path.join(path, name)
                for name in sorted(os.listdir(path))
                if not name.startswith('.')
            ]
            filepaths = [candidate for candidate in candidates if os.path.isfile(candidate)]
        else:
            filepaths = [path]

        for filepath in filepaths:
            token_docs, tag_docs = read_conll(filepath)

            for tokens, tags in zip(token_docs, tag_docs):
                rows.extend([sentence_num, token, tag] for token, tag in zip(tokens, tags))
                sentence_num += 1

    return pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])

# Build the token-level frames (columns: sentence_id, words, labels) for
# training and for evaluation.
train_data = get_df_from_conll(train_data_paths)
eval_data = get_df_from_conll(eval_data_path)

In [4]:
# Fine-tune the model on train_data; eval_data is supplied because
# model_args.evaluate_during_training is enabled.
model.train_model(train_data, eval_data=eval_data)

  return [


  0%|          | 0/8 [00:00<?, ?it/s]

NOTE: Redirects are currently not supported in Windows or MacOs.
NOTE: Redirects are currently not supported in Windows or MacOs.
NOTE: Redirects are currently not supported in Windows or MacOs.
NOTE: Redirects are currently not supported in Windows or MacOs.
NOTE: Redirects are currently not supported in Windows or MacOs.
NOTE: Redirects are currently not supported in Windows or MacOs.
NOTE: Redirects are currently not supported in Windows or MacOs.
NOTE: Redirects are currently not supported in Windows or MacOs.


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/115 [00:00<?, ?it/s]

In [None]:
# Evaluate the trained model on the evaluation frame; the call returns three
# values, unpacked here as metrics, raw model outputs and predicted labels.
result, model_outputs, preds_list = model.eval_model(eval_data)

In [5]:
path_to_test_data = '../data/test_submission_dataset/'
filename = 'anlp-sciner-test.txt'

# One entry per line of the submission file, kept verbatim (the trailing
# newline of each line is not stripped).
test_sub_data = []
with open(os.path.join(path_to_test_data, filename)) as f:
    test_sub_data.extend(f)

In [None]:
# Tag every line of the submission file; `predictions` holds one entry per
# input line and is what gets written to disk afterwards.
predictions, raw_outputs = model.predict(test_sub_data)

In [None]:
# Write the predictions next to the input file, swapping its extension for
# ".conll": one "token<TAB>tag" line per token and a blank line after every
# sentence.
with open(os.path.join(path_to_test_data, os.path.splitext(filename)[0] + '.conll'), 'w') as f:

    for prediction in predictions:

        for elem in prediction:
            # Only the first key/value pair of each dict is used; presumably
            # every dict holds exactly one token -> tag entry.
            token, tag = next(iter(elem.items()))
            f.write(f'{token}\t{tag}\n')
        f.write('\n')