In [23]:
import sys

datasetName = sys.argv[1] #'airports' # hospital, ncvoter, inspection
import os
import pandas as pd

data_prefix = '../../REEs_model_data/revision/labeled_data/'
rules_set_path = os.path.join(data_prefix, datasetName, 'train', 'rules.txt')
model_checkpoint = 'distilbert-base-uncased'
saved_model_path = os.path.join('./saved_models', datasetName + '-' + model_checkpoint + '-mlm')

In [24]:
# load set of rules
rules_set = pd.read_csv(rules_set_path)
rules_set = rules_set.values[:, 0]

FileNotFoundError: [Errno 2] No such file or directory: '../../REEs_model_data/revision/labeled_data/--ip=127.0.0.1/train/rules.txt'

In [None]:
from datasets import load_dataset

rules_set = load_dataset('csv', data_files=rules_set_path)
rules_set

Using custom data configuration default-cca786c71c79b796
Reusing dataset csv (/home/yaoshuw/.cache/huggingface/datasets/csv/default-cca786c71c79b796/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23)


DatasetDict({
    train: Dataset({
        features: ['rule', 'support_ratio', 'confidence', 'conciseness'],
        num_rows: 486
    })
})

In [None]:
# set gpu
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    result = tokenizer(examples['rule'])
    if tokenizer.is_fast:
        result['word_ids'] = [result.word_ids(i) for i in range(len(result['input_ids']))]
    return result

tokenized_datasets = rules_set.map(tokenize_function, batched=True, remove_columns=['rule', 'support_ratio', 'confidence', 'conciseness'])

Loading cached processed dataset at /home/yaoshuw/.cache/huggingface/datasets/csv/default-cca786c71c79b796/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-5a360ebe5aa2f83c.arrow


In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'word_ids'],
        num_rows: 486
    })
})

In [None]:
def group_texts(examples):
    chunk_size = 128
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // chunk_size) * chunk_size
    result = {k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)] for k, t in concatenated_examples.items()}
    # create a new labels column
    result['labels'] = result['input_ids'].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

Loading cached processed dataset at /home/yaoshuw/.cache/huggingface/datasets/csv/default-cca786c71c79b796/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-4dfbc0bca0b56071.arrow


In [None]:
tokenizer.decode(lm_datasets['train'][1]['input_ids'])

't1. iso _ country [SEP] [CLS] airports ( t0 ) ^ airports ( t1 ) ^ t0. latitude _ deg = = t1. latitude _ deg - > t0. iata _ code = = t1. iata _ code [SEP] [CLS] airports ( t0 ) ^ airports ( t1 ) ^ t0. latitude _ deg = = t1. latitude _ deg ^ t0. municipality = = t1. municipality - > t0. type = = t1. type [SEP] [CLS] airports ( t0 ) ^ airports ( t1 ) ^ t0. iso _ country = = br'

In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
#train_size = 1000 #len(lm_datasets['train'])
#test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets['train'].train_test_split(train_size=0.8, test_size=0.2, seed=42)
downsampled_dataset

Loading cached split indices for dataset at /home/yaoshuw/.cache/huggingface/datasets/csv/default-cca786c71c79b796/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-72d4dbbfbb9374ae.arrow and /home/yaoshuw/.cache/huggingface/datasets/csv/default-cca786c71c79b796/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-31f4672453c70213.arrow


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'word_ids'],
        num_rows: 168
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'word_ids'],
        num_rows: 42
    })
})

In [None]:
from transformers import TrainingArguments

batch_size = 32
logging_steps = len(downsampled_dataset['train']) // batch_size

training_args = TrainingArguments(output_dir=saved_model_path,
                            overwrite_output_dir=True,
                            evaluation_strategy='epoch',
                            learning_rate=2e-5,
                            weight_decay=0.01,
                            num_train_epochs=20,
                            per_device_train_batch_size=batch_size,
                            per_device_eval_batch_size=batch_size,
                            push_to_hub=False,
                            fp16=True,
                            logging_steps=logging_steps)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2

def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [None]:
from transformers import AutoModelForMaskedLM, Trainer

model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

trainer = Trainer(model=model, args=training_args, train_dataset=downsampled_dataset['train'], eval_dataset=downsampled_dataset['test'], data_collator=data_collator)

Using amp fp16 backend


In [None]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    return {'masked_' + k: v.numpy() for k, v in masked_inputs.items()}

In [None]:
downsampled_dataset = downsampled_dataset.remove_columns(['word_ids'])
eval_dataset = downsampled_dataset['test'].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset['test'].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels"
    }
)

Loading cached processed dataset at /home/yaoshuw/.cache/huggingface/datasets/csv/default-cca786c71c79b796/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-318c549b2a5cae12.arrow


In [None]:

os.environ["WANDB_DISABLED"] = "true"

In [None]:
import math
eval_results = trainer.evaluate()
eval_results

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 42
  Batch size = 32


{'eval_loss': 1.5600415468215942,
 'eval_runtime': 0.0734,
 'eval_samples_per_second': 572.344,
 'eval_steps_per_second': 27.254}

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running training *****
  Num examples = 168
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 18


Epoch,Training Loss,Validation Loss
1,1.6624,0.880667
2,1.0178,0.740922
3,0.7657,0.78565


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 42
  Batch size = 32
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 42
  Batch size = 32
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 42
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=18, training_loss=1.072246299849616, metrics={'train_runtime': 2.3745, 'train_samples_per_second': 212.255, 'train_steps_per_second': 7.581, 'total_flos': 16702705668096.0, 'train_loss': 1.072246299849616, 'epoch': 3.0})

In [None]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 42
  Batch size = 32


{'eval_loss': 0.620425283908844,
 'eval_runtime': 0.0589,
 'eval_samples_per_second': 713.251,
 'eval_steps_per_second': 33.964,
 'epoch': 3.0}

In [None]:
trainer.save_model()

Saving model checkpoint to ./saved_models/airports-distilbert-base-uncased-mlm
Configuration saved in ./saved_models/airports-distilbert-base-uncased-mlm/config.json
Model weights saved in ./saved_models/airports-distilbert-base-uncased-mlm/pytorch_model.bin
