In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset

In [3]:
# Load your data into a Hugging Face Dataset
data = load_dataset('csv', data_files='../data/intermediate/filtered_data.tsv', sep='\t')['train']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

### Tokenization

This tokenizer will convert text into a sequence of integers, where each integer represents a specific token as understood by the transformer model.
- model_inputs would have input_ids and attention_mask, which are fed to the model.
- labels are the expected correct outputs during training, which the model will learn to predict.
- In a sequence-to-sequence model like T5, during training, the model uses attention_mask to know which tokens are padding and which are not.

In [4]:
tokenizer = AutoTokenizer.from_pretrained('t5-small')

def tokenize_function(examples):
    return tokenizer(examples['reference'], examples['translation'], 
                     max_length=128, truncation=True, padding='max_length')

def prepare_data(examples):
    # Tokenize the reference texts
    model_inputs = tokenizer(examples["reference"], max_length=128, truncation=True, padding="max_length")

    # Tokenize the translation texts with the same tokenizer but do not pad yet, as we need raw token ids for labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["translation"], max_length=128, truncation=True)["input_ids"]

    # Pad labels to max_length
    labels = [label + [tokenizer.pad_token_id] * (128 - len(label)) for label in labels]

    model_inputs["labels"] = labels

    return model_inputs

tokenized_data = data.map(tokenize_function, batched=True)
model_data = tokenized_data.map(prepare_data, batched=True)

print(model_data.column_names)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



Map:   0%|          | 0/336256 [00:00<?, ? examples/s]

Map:   0%|          | 0/336256 [00:00<?, ? examples/s]



['reference', 'translation', 'ref_tox', 'trn_tox', 'input_ids', 'attention_mask', 'labels']


In [11]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import load_dataset, DatasetDict

# Assuming 'model_data' is your tokenized and prepared dataset
columns_to_remove = ['reference', 'translation', 'ref_tox', 'trn_tox']
for column in columns_to_remove:
    if column in model_data.features:
        model_data = model_data.remove_columns(column)

# Now model_data should only contain the columns necessary for training
print(model_data.column_names)

# Update the dataset split since we modified model_data
dataset = DatasetDict({
    'train': model_data.train_test_split(test_size=0.1)['train'],
    'validation': model_data.train_test_split(test_size=0.1)['test']
})

# Initialize the T5 model for sequence-to-sequence LM
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Data collator used for dynamically padding the inputs and labels
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True
)


['input_ids', 'attention_mask', 'labels']


In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2084,0.190702
2,0.2017,0.185323
3,0.1994,0.183889


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


TrainOutput(global_step=28374, training_loss=0.23742393342537954, metrics={'train_runtime': 2658.1415, 'train_samples_per_second': 341.551, 'train_steps_per_second': 10.674, 'total_flos': 3.071886703460352e+16, 'train_loss': 0.23742393342537954, 'epoch': 3.0})

In [13]:
# saving model
trainer.save_model('models/model 1')

In [14]:
# loading the model and run inference for it
model = AutoModelForSeq2SeqLM.from_pretrained('models/model 1')
model.eval()
model.config.use_cache = False

In [16]:
def translate(model, inference_request, tokenizer=tokenizer):
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    outputs = model.generate(input_ids=input_ids)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True,temperature=0))

In [19]:
inference_request = 'You idiot'
translate(model, inference_request,tokenizer)

eat my juicy fat cock


