In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install git+https://github.com/huggingface/transformers
!pip install datasets
!pip install sentencepiece
!pip install sacrebleu

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ndaviebi
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-ndaviebi
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 404 kB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |█████████████████████████████

## KB13

In [3]:
from transformers import LineByLineTextDataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset, load_metric, dataset_dict, DatasetDict
import torch
import numpy as np

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
dataset = load_dataset("csv", data_files="/content/drive/MyDrive/ANLP21/data/labeled_regex_data/labeled_kb13.csv", split="train")
_dataset_dict = dataset.train_test_split(test_size=0.25)
train_val = _dataset_dict["train"].train_test_split(test_size=10/75)
train = train_val["train"]
val = train_val['test']
test = _dataset_dict["test"]
d = dataset_dict.DatasetDict({'train':train, 'val': val, 'test':test})
metric = load_metric("sacrebleu")

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

Using custom data configuration default-d6f6847834d41d0b


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-d6f6847834d41d0b/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-d6f6847834d41d0b/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

In [4]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=model
)

In [5]:
d

DatasetDict({
    train: Dataset({
        features: ['label', 'regex'],
        num_rows: 535
    })
    val: Dataset({
        features: ['label', 'regex'],
        num_rows: 83
    })
    test: Dataset({
        features: ['label', 'regex'],
        num_rows: 206
    })
})

In [6]:
max_input_length = 512
max_target_length = 128
def preprocess_function(dataset):
    inputs = dataset["label"]
    targets = dataset["regex"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding=True, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

datasets = d.map(preprocess_function, batched=True)

for dataset in datasets:
  datasets[dataset] = datasets[dataset].remove_columns(["label", "regex"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
t5_path = "/content/drive/MyDrive/ANLP21/t5_eng_to_regex/kb13"
t5_cuda_path = "/content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13"
!mkdir -p $t5_path
!mkdir -p $t5_cuda_path

In [10]:
d.save_to_disk(t5_cuda_path)

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
    
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [12]:
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# batch_size = 64

# training_args = Seq2SeqTrainingArguments(
#     t5_path,
#     evaluation_strategy = "epoch",
#     learning_rate=3e-4,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     save_steps=100,
#     save_total_limit=10,
#     num_train_epochs=35,
#     predict_with_generate=True    
# )

# trainer = Seq2SeqTrainer(
#     model,
#     training_args,
#     train_dataset=datasets["train"],
#     eval_dataset=datasets["val"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

In [13]:
# trainer.train()

In [14]:
# trainer.save_model(t5_path)

In [15]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

batch_size = 64

cuda_training_args = Seq2SeqTrainingArguments(
    t5_cuda_path,
    evaluation_strategy = "epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=100,
    save_total_limit=10,
    num_train_epochs=35,
    predict_with_generate=True    
)

cuda_trainer = Seq2SeqTrainer(
    model,
    cuda_training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [16]:
cuda_trainer.train()

***** Running training *****
  Num examples = 535
  Num Epochs = 35
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 315


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.216153,0.0,0.0
2,No log,0.776313,0.0,0.0
3,No log,0.517253,0.0,0.0
4,No log,0.366693,0.0,0.0
5,No log,0.265154,22.5512,14.4458
6,No log,0.24224,25.073,16.253
7,No log,0.218694,24.9199,14.253
8,No log,0.20457,27.056,14.3494
9,No log,0.196023,29.3371,14.4578
10,No log,0.188451,29.0211,14.0723


***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
Saving model checkpoint to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13/checkpoint-100
Configuration saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13/checkpoint-100/config.json
Model weights saved in /content/drive

TrainOutput(global_step=315, training_loss=0.2813703991117932, metrics={'train_runtime': 506.4054, 'train_samples_per_second': 36.976, 'train_steps_per_second': 0.622, 'total_flos': 569221976064000.0, 'train_loss': 0.2813703991117932, 'epoch': 35.0})

In [17]:
cuda_trainer.save_model(t5_cuda_path)

Saving model checkpoint to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13
Configuration saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13/config.json
Model weights saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13/special_tokens_map.json
Copy vocab file to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13/spiece.model


## NL-RX Synth

In [18]:
from transformers import LineByLineTextDataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset, load_metric, dataset_dict
import torch
import numpy as np

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
dataset = load_dataset("csv", data_files="/content/drive/MyDrive/ANLP21/data/labeled_regex_data/labeled_nlrx_synth.csv", split="train")
_dataset_dict = dataset.train_test_split(test_size=0.25)
train_val = _dataset_dict["train"].train_test_split(test_size=10/75)
train = train_val["train"]
val = train_val['test']
test = _dataset_dict["test"]
d = dataset_dict.DatasetDict({'train':train, 'val': val, 'test':test})
metric = load_metric("sacrebleu")

loading file https://huggingface.co/t5-small/resolve/main/spiece.model from cache at /root/.cache/huggingface/transformers/65fc04e21f45f61430aea0c4fedffac16a4d20d78b8e6601d8d996ebefefecd2.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d
loading file https://huggingface.co/t5-small/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/06779097c78e12f47ef67ecb728810c2ae757ee0a9efe9390c6419783d99382d.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-61a25f6569b0a17f/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-61a25f6569b0a17f/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


In [19]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=model
)

In [20]:
d

DatasetDict({
    train: Dataset({
        features: ['label', 'regex'],
        num_rows: 6500
    })
    val: Dataset({
        features: ['label', 'regex'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['label', 'regex'],
        num_rows: 2500
    })
})

In [21]:
max_input_length = 512
max_target_length = 128
def preprocess_function(dataset):
    inputs = dataset["label"]
    targets = dataset["regex"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding=True, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

datasets = d.map(preprocess_function, batched=True)

for dataset in datasets:
  datasets[dataset] = datasets[dataset].remove_columns(["label", "regex"])

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [22]:
t5_path = "/content/drive/MyDrive/ANLP21/t5_eng_to_regex/nlrx_synth"
t5_cuda_path = "/content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth"
!mkdir -p $t5_path
!mkdir -p $t5_cuda_path

In [23]:
d.save_to_disk(t5_cuda_path)

Flattening the indices:   0%|          | 0/7 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

In [24]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
    
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [25]:
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# batch_size = 64

# training_args = Seq2SeqTrainingArguments(
#     t5_path,
#     evaluation_strategy = "epoch",
#     learning_rate=3e-4,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     save_steps=100,
#     save_total_limit=10,
#     num_train_epochs=35,
#     predict_with_generate=True    
# )

# trainer = Seq2SeqTrainer(
#     model,
#     training_args,
#     train_dataset=datasets["train"],
#     eval_dataset=datasets["val"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

In [26]:
# trainer.save_model(t5_path)

In [27]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

batch_size = 64

cuda_training_args = Seq2SeqTrainingArguments(
    t5_cuda_path,
    evaluation_strategy = "epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=100,
    save_total_limit=10,
    num_train_epochs=35,
    predict_with_generate=True    
)

cuda_trainer = Seq2SeqTrainer(
    model,
    cuda_training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [28]:
cuda_trainer.train()

***** Running training *****
  Num examples = 6500
  Num Epochs = 35
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3570


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,0.066866,70.1753,17.907
2,No log,0.030015,72.6918,18.136
3,No log,0.018093,73.6808,18.157
4,No log,0.013549,74.2635,18.154
5,0.192800,0.01314,74.2013,18.158
6,0.192800,0.010173,74.516,18.158
7,0.192800,0.008344,74.6135,18.162
8,0.192800,0.007474,74.6322,18.162
9,0.192800,0.007226,74.6697,18.16
10,0.016800,0.007183,74.7523,18.162


Saving model checkpoint to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/checkpoint-100
Configuration saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/checkpoint-100/config.json
Model weights saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/checkpoint-100/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/checkpoint-100/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/checkpoint-100/special_tokens_map.json
Copy vocab file to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/checkpoint-100/spiece.model
Deleting older checkpoint [/content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/checkpoint-2600] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/ch

TrainOutput(global_step=3570, training_loss=0.034986265149771, metrics={'train_runtime': 4472.184, 'train_samples_per_second': 50.87, 'train_steps_per_second': 0.798, 'total_flos': 5292075909120000.0, 'train_loss': 0.034986265149771, 'epoch': 35.0})

In [29]:
cuda_trainer.save_model(t5_cuda_path)

Saving model checkpoint to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth
Configuration saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/config.json
Model weights saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/special_tokens_map.json
Copy vocab file to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth/spiece.model


## NL-RX Turk

In [30]:
from transformers import LineByLineTextDataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset, load_metric, dataset_dict
import torch
import numpy as np

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
dataset = load_dataset("csv", data_files="/content/drive/MyDrive/ANLP21/data/labeled_regex_data/labeled_nlrx_turk.csv", split="train")
_dataset_dict = dataset.train_test_split(test_size=0.25)
train_val = _dataset_dict["train"].train_test_split(test_size=10/75)
train = train_val["train"]
val = train_val['test']
test = _dataset_dict["test"]
d = dataset_dict.DatasetDict({'train':train, 'val': val, 'test':test})
metric = load_metric("sacrebleu")

loading file https://huggingface.co/t5-small/resolve/main/spiece.model from cache at /root/.cache/huggingface/transformers/65fc04e21f45f61430aea0c4fedffac16a4d20d78b8e6601d8d996ebefefecd2.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d
loading file https://huggingface.co/t5-small/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/06779097c78e12f47ef67ecb728810c2ae757ee0a9efe9390c6419783d99382d.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-c184eca6b8a84e7c/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-c184eca6b8a84e7c/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


In [31]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=model
)

In [32]:
d

DatasetDict({
    train: Dataset({
        features: ['label', 'regex'],
        num_rows: 6500
    })
    val: Dataset({
        features: ['label', 'regex'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['label', 'regex'],
        num_rows: 2500
    })
})

In [33]:
max_input_length = 512
max_target_length = 128
def preprocess_function(dataset):
    inputs = dataset["label"]
    targets = dataset["regex"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding=True, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

datasets = d.map(preprocess_function, batched=True)

for dataset in datasets:
  datasets[dataset] = datasets[dataset].remove_columns(["label", "regex"])

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [34]:
t5_path = "/content/drive/MyDrive/ANLP21/t5_eng_to_regex/nlrx_turk"
t5_cuda_path = "/content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk"
!mkdir -p $t5_path
!mkdir -p $t5_cuda_path

In [35]:
d.save_to_disk(t5_cuda_path)

Flattening the indices:   0%|          | 0/7 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

In [36]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
    
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [37]:
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# batch_size = 64

# training_args = Seq2SeqTrainingArguments(
#     t5_path,
#     evaluation_strategy = "epoch",
#     learning_rate=3e-4,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     save_steps=100,
#     save_total_limit=10,
#     num_train_epochs=35,
#     predict_with_generate=True    
# )

# trainer = Seq2SeqTrainer(
#     model,
#     training_args,
#     train_dataset=datasets["train"],
#     eval_dataset=datasets["val"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

In [38]:
# trainer.save_model(t5_path)

In [39]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

batch_size = 64

cuda_training_args = Seq2SeqTrainingArguments(
    t5_cuda_path,
    evaluation_strategy = "epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=100,
    save_total_limit=10,
    num_train_epochs=35,
    predict_with_generate=True    
)

cuda_trainer = Seq2SeqTrainer(
    model,
    cuda_training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [40]:
cuda_trainer.train()

***** Running training *****
  Num examples = 6500
  Num Epochs = 35
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3570


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,0.172671,57.1334,17.447
2,No log,0.110817,60.4252,17.803
3,No log,0.091354,62.0896,17.868
4,No log,0.081889,63.1512,17.857
5,0.296300,0.074104,63.4759,17.943
6,0.296300,0.069612,64.4333,17.978
7,0.296300,0.067262,64.4295,18.014
8,0.296300,0.066478,63.6137,17.879
9,0.296300,0.062792,64.151,17.925
10,0.087700,0.063234,64.4466,17.92


Saving model checkpoint to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/checkpoint-100
Configuration saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/checkpoint-100/config.json
Model weights saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/checkpoint-100/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/checkpoint-100/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/checkpoint-100/special_tokens_map.json
Copy vocab file to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/checkpoint-100/spiece.model
Deleting older checkpoint [/content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/checkpoint-2600] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/checkpoint

TrainOutput(global_step=3570, training_loss=0.09285836627169483, metrics={'train_runtime': 4619.6974, 'train_samples_per_second': 49.246, 'train_steps_per_second': 0.773, 'total_flos': 5412350361600000.0, 'train_loss': 0.09285836627169483, 'epoch': 35.0})

In [41]:
cuda_trainer.save_model(t5_cuda_path)

Saving model checkpoint to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk
Configuration saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/config.json
Model weights saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/special_tokens_map.json
Copy vocab file to /content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk/spiece.model


# Inference

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import numpy as np
t5_cuda_path_nlrx_turk = "/content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_turk"
t5_cuda_path_nlrx_synth = "/content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/nlrx_synth"
t5_cuda_path_kb13 = "/content/drive/MyDrive/ANLP21/t5_eng_to_regex_cuda/kb13"
import sys
sys.path.append('/content/drive/MyDrive/ANLP21')

### KB13

In [4]:
tokenizer = T5Tokenizer.from_pretrained(t5_cuda_path_kb13)
model = T5ForConditionalGeneration.from_pretrained(t5_cuda_path_kb13)

In [5]:
# model.to('cpu')
# input_ids = tokenizer("translate English to Regex: lines using 'an' before 'imal'", return_tensors='pt').input_ids
# outputs = model.generate(input_ids)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [6]:
# len(d['test'][4]['regex'])

In [7]:
from datasets import load_from_disk
d = load_from_disk(t5_cuda_path_kb13)
d['test']

Dataset({
    features: ['label', 'regex'],
    num_rows: 206
})

In [7]:
tokenizer.decode(d['test'][0]['input_ids'], skip_special_tokens=True)

"translate English to Regex: lines that do not contain the string 'dog' followed by a number, three times or more"

In [8]:
from datasets import load_dataset
full_dataset = load_dataset("csv", data_files="/content/drive/MyDrive/ANLP21/data/labeled_regex_data/labeled_nlrx_turk.csv", split="train")
full_dataset

Using custom data configuration default-c184eca6b8a84e7c


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-c184eca6b8a84e7c/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-c184eca6b8a84e7c/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


Dataset({
    features: ['label', 'regex'],
    num_rows: 10000
})

In [9]:
tokenizer(full_dataset[i]['label'], return_tensors='pt').input_ids

NameError: ignored

In [None]:
!pip install tqdm



In [8]:
from regexDFAEquals import main
import tqdm
arr = []

dfa_num_correct = 0
for i in tqdm.trange(len(d['test'])):
# for i in range(len(dataset_dict['test'])):
  # print(dataset_dict['test'][i]['label'])
  input_ids = tokenizer(d['test'][i]['label'], return_tensors='pt').input_ids
  outputs = model.generate(input_ids)
  # print(dataset_dict['test'][i]['regex'])
  # print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  correct = main(tokenizer.decode(outputs[0], skip_special_tokens=True), d['test'][i]['regex'])
  dfa_num_correct += correct
  arr.append((correct, len(d['test'][i]['regex']) < 15))
  # print(dfa_num_correct)

dfa_acc = dfa_num_correct / len(d['test'])
print(dfa_acc)
print(arr)

100%|██████████| 206/206 [01:22<00:00,  2.51it/s]

0.13106796116504854
[(0, True), (0, False), (0, True), (0, False), (0, False), (0, True), (0, True), (0, True), (0, False), (0, True), (0, False), (0, True), (0, False), (0, True), (0, False), (0, False), (1, True), (0, False), (0, False), (0, False), (0, False), (0, False), (1, True), (1, True), (0, True), (0, True), (0, True), (0, False), (1, False), (0, False), (0, False), (0, True), (0, False), (1, True), (0, False), (0, False), (0, False), (0, False), (0, True), (0, True), (0, False), (0, True), (0, False), (0, False), (0, True), (0, True), (0, True), (0, False), (0, False), (0, False), (0, False), (0, False), (0, False), (0, True), (0, False), (0, True), (1, True), (0, False), (0, False), (0, True), (1, True), (0, False), (0, False), (0, False), (0, False), (0, False), (0, False), (0, True), (0, False), (0, False), (0, True), (0, False), (1, True), (0, False), (0, False), (0, False), (0, False), (0, False), (0, True), (0, False), (0, False), (0, False), (0, False), (0, False), (0




In [9]:
len([i for i in arr if i[0] == 1])

27

In [10]:
len([j for j in [i for i in arr if i[1]] if j[0]==1]) / len([i for i in arr if i[1]]) # accuracy on only regexes < 15 in length

0.2987012987012987

In [10]:
from regexDFAEquals import main
import tqdm
arr = []

dfa_num_correct = 0
for i in tqdm.trange(len(d['test'])):
# for i in range(len(dataset_dict['test'])):
  # print(dataset_dict['test'][i]['label'])
  input_ids = d['test'][i]['input_ids']
  outputs = model.generate(torch.tensor([input_ids]))
  for j in range(len(full_dataset)):
    if full_dataset[j]['label'] == tokenizer.decode(input_ids, skip_special_tokens=True):
        break
  regex = full_dataset[j]['regex']
  # print(dataset_dict['test'][i]['regex'])
  # print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
  correct = main(decoded_output, regex)
  print(decoded_output, regex, correct)
  dfa_num_correct += correct
  arr.append((correct, len(regex) < 15))
  # print(dfa_num_correct)

dfa_acc = dfa_num_correct / len(d['test'])
print(dfa_acc)


  0%|          | 1/2500 [03:03<127:28:42, 183.64s/it]

((dog.*[0-9].*)3,) ~((dog.*[0-9].*){3,}) 0


  0%|          | 2/2500 [03:13<56:36:04, 81.57s/it]  

(([A-Za-z])|([a-z]) (~([a-z]))&(([0-9])*) 0


  0%|          | 3/2500 [03:27<35:14:21, 50.81s/it]

(([0-9])|(.)).*([AEIOUa (([0-9])|(.)).*([AEIOUaeiou]).* 0


  0%|          | 3/2500 [05:56<82:22:58, 118.77s/it]


KeyboardInterrupt: ignored

In [None]:
dfa_acc

0.059629135862319715

In [None]:
count = 0
for i in range(len(d['test'])):
    if len(d['test'][i]['regex']) < 15:
        count += 1
count

138

In [None]:
len(d['test'])

2500

0.36893203883495146

In [None]:
138/2500

0.0552