In [1]:
# End-to-end script running the Hugging Face Trainer
# for causal language modeling. Based on the Tasks documentation
# originally from: https://hf.co/docs/transformers/tasks/language_modeling
from accelerate import PartialState
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [2]:
# take a base model
model_name = "distilgpt2"

In [3]:
dataset = load_dataset("json", data_files={"train" : "email.jsonl","test" : "email2.jsonl" })

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['type', 'text'],
        num_rows: 6
    })
    test: Dataset({
        features: ['type', 'text'],
        num_rows: 6
    })
})

In [5]:
# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [7]:
print(f"Tokenizing dataset for {model_name}...")
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

Tokenizing dataset for distilgpt2...


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [8]:
# We still need to concatenate our sequences
# and split them into shorter chunks to ease
# minimal RAM usage
block_size = 128

In [9]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
# And apply
tokenized_dataset = tokenized_dataset.map(group_texts, batched=True)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [11]:
# Create an efficient collator which dynamically pads
# End-of-sequence as the padding token and mlm=False will
# use the inputs as labels, shifted to the right by one element
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
print(f"Instantiating model ({model_name})...")
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define the hyperparameters in the TrainingArguments
print("Creating training arguments (weights are stored at `results/causal_language_modeling`)...")

Instantiating model (distilgpt2)...
Creating training arguments (weights are stored at `results/causal_language_modeling`)...


In [14]:
# for small dataset - adjusting batch size 
training_args = TrainingArguments(
    output_dir="results/causal_language_modeling",  # Where weights are stored
    learning_rate=1e-5,  # The learning rate during training
    per_device_train_batch_size=2,  # Number of samples per batch during training
    per_device_eval_batch_size=2,  # Number of samples per batch during evaluation
    num_train_epochs=5,  # How many iterations through the dataloaders should be done
    weight_decay=0.01,  # Regularization penalization
    evaluation_strategy="epoch",  # How often metrics on the evaluation dataset should be computed
    save_strategy="epoch",  # When to try and save the best model (such as a step number or every iteration)

)

In [15]:
# Create the `Trainer`, passing in the model and arguments
# the datasets to train on, how the data should be collated,
# and the method for computing our metrics
print("Creating `Trainer`...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

Creating `Trainer`...


In [16]:
# Initiate training
print("Training...")
trainer.train()

Training...


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.271400451660156, 'eval_runtime': 0.2004, 'eval_samples_per_second': 4.991, 'eval_steps_per_second': 4.991, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.179584980010986, 'eval_runtime': 0.3577, 'eval_samples_per_second': 2.796, 'eval_steps_per_second': 2.796, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.112208366394043, 'eval_runtime': 0.3502, 'eval_samples_per_second': 2.855, 'eval_steps_per_second': 2.855, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.069208145141602, 'eval_runtime': 0.3343, 'eval_samples_per_second': 2.992, 'eval_steps_per_second': 2.992, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.047971248626709, 'eval_runtime': 0.3206, 'eval_samples_per_second': 3.119, 'eval_steps_per_second': 3.119, 'epoch': 5.0}
{'train_runtime': 18.9472, 'train_samples_per_second': 0.264, 'train_steps_per_second': 0.264, 'train_loss': 5.349599456787109, 'epoch': 5.0}


TrainOutput(global_step=5, training_loss=5.349599456787109, metrics={'train_runtime': 18.9472, 'train_samples_per_second': 0.264, 'train_steps_per_second': 0.264, 'total_flos': 133965619200.0, 'train_loss': 5.349599456787109, 'epoch': 5.0})

In [21]:
# Performing inference
text = "p2go has mostly "
# We need to tokenize the inputs and turn them to PyTorch tensors
encoded_input = tokenizer(text, return_tensors="pt").input_ids

In [22]:
# To move the batch to the right device automatically, use `PartialState().device`
# which will always work no matter the environment
encoded_input = encoded_input.to(PartialState().device)
# Can also be `encoded_input.to("cuda")`

In [23]:
# Then we can perform inference via `model.generate`:
print("Performing inference...")
outputs = model.generate(encoded_input, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Performing inference...


In [24]:
# Finally, decode our outputs
print(f"Prediction: {tokenizer.batch_decode(outputs, skip_special_tokens=True)}")

Prediction: ['p2go has mostly ersatz-type methods but they are rather useful. Here is an example example of where the program has been compiled with:\n\nThe above code consists of the following method:\nThis is a method of getting the same message from a particular class and the same message. The following method provides the following method:\nIt is a generic (from above) method so it is very useful to use the same message using the same message. The following method provides the following method:\nIt is a']
