In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json

import pandas as pd
import numpy as np
import torch
import torch.nn as nn

import wandb
from itertools import chain

import random

from types import SimpleNamespace
from pathlib import Path
from functools import partial
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
from tokenizers import AddedToken
from datasets import Dataset, concatenate_datasets

from argparse import ArgumentParser
import argparse
from types import SimpleNamespace

import sys
sys.path.append('src')

from data import (
    get_reference_df,
    split_rows,
    create_dataset,
    add_token_indices,
    CustomDataset
)
from training import seed_everything, get_model, compute_metrics
from environment import (
    load_filepaths,
    load_config,
    add_run_specific_filepaths,
    concat_configs,
    namespace_to_dictionary,
    init_wandb
)


def str2bool(v):
    """Convert a command-line style truthy/falsy token to a ``bool``.

    Booleans are returned unchanged. Any other value is lower-cased and
    matched against the accepted spellings below.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v

    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')

    token = v.lower()
    if token not in truthy and token not in falsy:
        raise argparse.ArgumentTypeError('Boolean value expected.')
    return token in truthy


def replace_labels(data):
    """Collapse auxiliary entity labels in every sample of ``data``.

    ``B-``/``I-INSTRUCTOR_NAME`` become ``B-``/``I-OTHER_NAME``, and the
    ``ORG_NAME`` and ``COUNTRY_NAME`` tags become ``'O'``. All other labels
    are left as they are.

    The label sequences are edited in place: each ``sample['labels']`` of the
    input is modified, and the returned list holds the same sample objects in
    their original order.

    Args:
        data: iterable of mappings, each with an index-assignable ``'labels'``
            sequence.

    Returns:
        A new list containing the (mutated) samples.
    """
    # Built once per call instead of re-creating membership lists per token.
    remap = {
        'B-INSTRUCTOR_NAME': 'B-OTHER_NAME',
        'I-INSTRUCTOR_NAME': 'I-OTHER_NAME',
        'B-ORG_NAME': 'O',
        'I-ORG_NAME': 'O',
        'B-COUNTRY_NAME': 'O',
        'I-COUNTRY_NAME': 'O',
    }

    new_data = []
    for sample in data:
        labels = sample['labels']
        for i, label in enumerate(labels):
            if label in remap:
                # Index assignment keeps the sequence length fixed, so
                # updating while enumerating is safe.
                labels[i] = remap[label]
        new_data.append(sample)
    return new_data


def get_input_args():
    """Return the hard-coded run arguments for this experiment as a dict.

    The string ``'None'`` (not the ``None`` object) is used as the
    "not set" value for the dataset / checkpoint names.
    """
    return {
        'exp_name': 'exp073',
        'job_type': 'train',
        'seed': 42,
        'debug': False,
        'pretrain_dataset': 'None',
        'generated_dataset': 'None',
        'prev_exp': 'None',
        'pretrain_name': 'None',
        'fold': 0,
    }


In [None]:
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
args = get_input_args()

# Directory a previous run's weights are loaded from when `pretrain_name` is
# set; names containing 'exp026' are looked up under `models2/` instead.
args['pseudo_path'] = f'models/{args["pretrain_name"]}'
if 'exp026' in args["pretrain_name"]:
    args['pseudo_path'] = f'models2/{args["pretrain_name"]}'

config_fp = 'config.yaml'
config = load_config(config_fp)
filepaths = load_filepaths('filepaths.yaml')
config = concat_configs(args, config, filepaths)
config = add_run_specific_filepaths(config, args['exp_name'], args['job_type'], args['fold'], args['seed'])

run = init_wandb(config)

seed_everything(config.seed)
tokenizer = AutoTokenizer.from_pretrained(config.model.backbone_type)

# Label vocabulary used by the model head (13 classes, 'O' is the last id).
label2id = {
    'B-EMAIL': 0, 'B-ID_NUM': 1, 'B-NAME_STUDENT': 2,
    'B-PHONE_NUM': 3, 'B-STREET_ADDRESS': 4, 'B-URL_PERSONAL': 5,
    'B-USERNAME': 6, 'I-ID_NUM': 7, 'I-NAME_STUDENT': 8,
    'I-PHONE_NUM': 9, 'I-STREET_ADDRESS': 10, 'I-URL_PERSONAL': 11,
    'O': 12,
}
id2label = {v:k for k,v in label2id.items()}

# ---------------------------------------------------------------------------
# Competition data and train/validation split
# ---------------------------------------------------------------------------
# Context manager so the file handle is closed as soon as parsing finishes.
with open("data/processed/train_with_folds.json") as fp:
    data = json.load(fp)
df = pd.DataFrame(data)
# The fold is (re)computed from the document id; any 'fold' column already
# present in the file is overwritten here.
df['fold'] = df['document'] % 4
df['valid'] = df['fold'] == config.fold
df['token_indices'] = df['tokens'].apply(add_token_indices)
df['source'] = 'competition'

train_folds = df[~df.valid].copy().reset_index(drop=True)
valid_folds = df[df.valid].copy().reset_index(drop=True)
print(train_folds.shape, valid_folds.shape)
print(label2id)

# Built from the un-strided validation rows, before any splitting below.
reference_df = get_reference_df(valid_folds)

# ---------------------------------------------------------------------------
# Optional external (generated) training data
# ---------------------------------------------------------------------------
if config.generated_dataset != 'None':
    print('Using external dataset')
    with open(f'data/external/{config.generated_dataset}') as fp:
        external_data = json.load(fp)
    external_data = pd.DataFrame(external_data)
    external_data['document'] = -1
    # NOTE(review): `rename` returns a new frame and the result is discarded,
    # so this line has no effect and the 'labels' column is kept. Left as-is
    # on purpose: the filtering step below reads train_folds['labels'], so
    # actually renaming the column would change behavior. Confirm intent.
    external_data.rename(columns={'labels': 'provided_labels'})
    external_data['token_indices'] = external_data['tokens'].apply(add_token_indices)
    external_data['source'] = 'nbroad'
    train_folds = pd.concat([train_folds, external_data])

# ---------------------------------------------------------------------------
# Document striding
# ---------------------------------------------------------------------------
if config.dataset.stride_train:
    train_folds = split_rows(train_folds, config.dataset.doc_max_length, config.dataset.doc_stride)
if config.dataset.stride_valid:
    valid_folds = split_rows(valid_folds, config.dataset.doc_max_length, config.dataset.doc_stride)

# ---------------------------------------------------------------------------
# Down-sampling of rows without PII
# ---------------------------------------------------------------------------
if config.dataset.filter_no_pii:
    # A row counts as containing PII when its labels hold more than one
    # distinct value.
    train_folds['pii'] = train_folds['labels'].apply(lambda x: len(set(x)) > 1)
    pii = train_folds[train_folds['pii']].copy()
    # Only competition rows without PII are candidates for keeping; rows from
    # other sources without PII are dropped entirely.
    no_pii = train_folds[(~train_folds['pii']) & (train_folds['source'] == 'competition')].copy()
    if no_pii.shape[0] > 0:
        # Keep each candidate row independently with probability
        # `filter_no_pii_ratio` (the document id itself is not used).
        no_pii['pii'] = no_pii['document'].apply(lambda x: random.random() < config.dataset.filter_no_pii_ratio)
        no_pii = no_pii[no_pii.pii]
        train_folds = pd.concat([pii, no_pii])
    else:
        train_folds = pii.copy()
    train_folds = train_folds.sort_index()

# Deterministic shuffle of the training rows.
train_folds = train_folds.sample(frac=1, random_state=config.seed)

train_ds = CustomDataset(train_folds, tokenizer, config.dataset.inference_max_length, label2id)
valid_ds = create_dataset(valid_folds, tokenizer, config.dataset.inference_max_length, label2id)

print(len(train_ds))
print(len(valid_ds))

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
# Start from the backbone unless a previous run's checkpoint was requested.
if config.pretrain_name == 'None':
    model_path = config.model.backbone_type
else:
    model_path = Path(config.pseudo_path)
print('State from: ', model_path)

model = get_model(config, model_path, id2label, label2id)
collator = DataCollatorForTokenClassification(tokenizer)

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
# NOTE: from here on `args` refers to the TrainingArguments object, no longer
# to the dict returned by get_input_args().
args = TrainingArguments(
    config.run_dir,
    fp16=config.training.apex,
    learning_rate=config.optimizer.decoder_lr,
    weight_decay=config.optimizer.weight_decay,
    warmup_ratio=config.optimizer.warmup_ratio,
    per_device_train_batch_size=config.dataset.train_batch_size,
    per_device_eval_batch_size=config.dataset.valid_batch_size,
    report_to="none",
    lr_scheduler_type='cosine',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=20,
    metric_for_best_model="fbeta_best",
    greater_is_better=True,
    gradient_checkpointing=config.model.gradient_checkpointing,
    num_train_epochs=config.training.epochs,
    gradient_accumulation_steps=config.training.gradient_accumulation_steps,
    dataloader_num_workers=1,
    seed=config.seed,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    data_collator=collator,
    tokenizer=tokenizer,
    # Metrics are computed against the reference frame built from the
    # validation fold, using the configured post-processing threshold.
    compute_metrics=partial(compute_metrics, id2label=id2label, valid_ds=valid_ds, valid_df=reference_df, threshold=config.dataset.fbeta_postproc_thr),
)
trainer.train()