In [98]:
from transformers import BertGenerationConfig, BertTokenizer, BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel
from tokenizers import ByteLevelBPETokenizer
import numpy as np
import torch
import pandas as pd

In [99]:
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=['./dataset/final/dev.csv', './dataset/final/train.csv', './dataset/final/test.csv'])






In [3]:
encoder_config = BertGenerationConfig(vocab_size=tokenizer.get_vocab_size(),
                                      hidden_size=256,
                                      num_hidden_layers=3,
                                      num_attention_heads=8,
                                      intermediate_size=512,
                                      hidden_act='relu',
                                      is_decoder=False)
decoder_config = BertGenerationConfig(vocab_size=tokenizer.get_vocab_size(),
                                      hidden_size=256,
                                      num_hidden_layers=3,
                                      num_attention_heads=8,
                                      intermediate_size=512,
                                      hidden_act='relu',
                                      is_decoder=True,
                                      add_cross_attention=True)

In [4]:
encoder = BertGenerationEncoder(encoder_config)
decoder = BertGenerationDecoder(decoder_config)

In [5]:
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

In [33]:
sources = tokenizer.encode_batch(['document_id int location_code str date_in_location_from str date_in_locaton_to str | Display a bar chart for what is the code of each location and the number of documents in that location ? , rank in descending by the y-axis .'])
labels = tokenizer.encode_batch(['mark bar data document_locations encoding x location_code y aggregate count location_code transform group x sort y desc'])

source_ids = torch.Tensor([source.ids for source in sources]).to(torch.int64)
labels_ids = torch.Tensor([label.ids for label in labels]).to(torch.int64)

loss = model(input_ids=source_ids, decoder_input_ids=labels_ids, labels=labels_ids).loss
loss.backward()

In [114]:
from datasets import Dataset

train_df = pd.read_csv('./dataset/final/train.csv', usecols=['source', 'label'])
train_ds = Dataset.from_pandas(train_df)

test_df = pd.read_csv('./dataset/final/train.csv', usecols=['source', 'label'])
test_ds = Dataset.from_pandas(test_df)

In [115]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [116]:
def preprocess(examples):
    model_inputs = tokenizer(examples['source'], max_length=128, truncation=True)
    labels = tokenizer(examples['label'], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [117]:
train = train_ds.map(preprocess_function, batched=True)
test = test_ds.map(preprocess_function, batched=True)

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/13 [00:00<?, ?ba/s]

In [119]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [118]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [120]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA devices.