In [2]:
import re
import torch
from pathlib import Path
from itertools import product
import pandas as pd
import os

train_data_paths = ["../data/final_dataset/manual_formatted/train/",
                    "../data/final_dataset/auto/train/rest.conll", 
                    "../data/final_dataset/auto/train/metric_hp_plus_num_tag.conll",]

eval_data_path = "../data/final_dataset/manual_formatted/test/"

labels_prefixes = ["B-", "I-"]
labels_suffixes = ["MethodName", "HyperparameterName", "HyperparameterValue",
                          "MetricName", "MetricValue", "TaskName", "DatasetName"]
labels_list = ["O"] + [ ''.join(x[::-1]) for x in product(labels_suffixes, labels_prefixes)]
id2label = dict(zip(labels_list, range(len(labels_list))))


def read_conll(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
                
        tokens = []
        tags = []
        for line in doc.split('\n'):
            try:
                token, tag = line.split('\t')
            except ValueError:
                token, tag = line.split(' ')
                
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs


def get_df_from_conll(paths):
    
    if not isinstance(paths, list):
        paths = [paths]
    
    df = []
    sentence_num = 0
    
    for path in paths:
        if os.path.isdir(path):
            filenames = os.listdir(path)
            filepaths = [os.path.join(path, filename) for filename in filenames]
        else:
            filepaths = [path]

        for filepath in filepaths:
            data = read_conll(filepath)

            for sample_idx in range(len(data[0])):
                for token_idx in range(len(data[0][sample_idx])):
                    df.append([sentence_num, data[0][sample_idx][token_idx], data[1][sample_idx][token_idx]])

                sentence_num += 1
    
    df = pd.DataFrame(df, columns=["sentence_id", "words", "labels"])
    return df


def get_data_from_conll(paths, id2label=None):
    
    if not isinstance(paths, list):
        paths = [paths]
    
    dataset = []
    sentence_num = 0
    
    for path in paths:
        if os.path.isdir(path):
            filenames = os.listdir(path)
            filepaths = [os.path.join(path, filename) for filename in filenames]
        else:
            filepaths = [path]

        for filepath in filepaths:
            data = read_conll(filepath)
            
            for idx in range(len(data[0])):
                
                
                if id2label is not None:
                    ner_tags = list(map(lambda x: id2label.get(x), data[1][idx]))
                else:
                    ner_tags = None
                
                dataset.append({'tokens': data[0][idx], 'ner_labels': data[1][idx], 'ner_tags' : ner_tags})
            

    return dataset

print(id2label)

{'O': 0, 'B-MethodName': 1, 'I-MethodName': 2, 'B-HyperparameterName': 3, 'I-HyperparameterName': 4, 'B-HyperparameterValue': 5, 'I-HyperparameterValue': 6, 'B-MetricName': 7, 'I-MetricName': 8, 'B-MetricValue': 9, 'I-MetricValue': 10, 'B-TaskName': 11, 'I-TaskName': 12, 'B-DatasetName': 13, 'I-DatasetName': 14}


In [3]:
train_data = get_data_from_conll(train_data_paths, id2label)
eval_data = get_data_from_conll(eval_data_path, id2label)

In [4]:
from transformers import *

model_checkpoint = 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

print(tokenizer.is_fast)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /Users/varunursekar/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1/config.json
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

loading file vocab.txt from cache at /Users/varunursekar/.cache/huggingface/hub/models--allenai--scibert_scivocab

True


In [15]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids(0)
    new_labels = align_labels_with_tokens(examples["ner_tags"], word_ids)
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

def tokenize_dataset(dataset, tokenizer):
    return list(map(lambda x: tokenize_and_align_labels(x, tokenizer), dataset))

In [16]:
tokenized_datasets = {}
tokenized_datasets['train'] = tokenize_dataset(train_data, tokenizer)
tokenized_datasets['eval'] = tokenize_dataset(eval_data, tokenizer)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [17]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

eval_dataloader = DataLoader(
    tokenized_datasets["eval"], collate_fn=data_collator, batch_size=8
)

label2id = {v: k for k, v in id2label.items()}

In [18]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

loading configuration file config.json from cache at /Users/varunursekar/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1/config.json
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "B-DatasetName": 13,
    "B-HyperparameterName": 3,
    "B-HyperparameterValue": 5,
    "B-MethodName": 1,
    "B-MetricName": 7,
    "B-MetricValue": 9,
    "B-TaskName": 11,
    "I-DatasetName": 14,
    "I-HyperparameterName": 4,
    "I-HyperparameterValue": 6,
    "I-MethodName": 2,
    "I-MetricName": 8,
    "I-MetricValue": 10,
    "I-TaskName": 12,
    "O": 0
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": "O",
    "1": "B-MethodName",
    "2": "I-MethodName",
    "3": "B-HyperparameterName",
    "4": 

In [19]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [20]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
print(accelerator.device)

cpu


In [21]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [22]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [23]:
import evaluate

metric = evaluate.load("seqeval")

In [24]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/1377 [00:00<?, ?it/s]