In [19]:
# End-to-end script running the Hugging Face Trainer
# for causal language modeling. Based on the Tasks documentation
# originally from: https://hf.co/docs/transformers/tasks/language_modeling
from accelerate import PartialState
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from util_functions import *
from config_train import *

In [21]:
from evaluate import load, Metric

In [22]:
# Define the evaluation metric
metric = load("perplexity")

Downloading builder script:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

In [20]:
print (config_obj['model_name'])

distilbert/distilgpt2


In [3]:
# take a base model from configuration.
model_name = config_obj['model_name']

In [4]:
dataset = load_dataset("json", data_files={"train" : "email.jsonl","test" : "email2.jsonl" })

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['type', 'text'],
        num_rows: 6
    })
    test: Dataset({
        features: ['type', 'text'],
        num_rows: 6
    })
})

In [6]:
# Tokenize the dataset
tokenizer = get_tokenizer(model_name)



In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [8]:
print(f"Tokenizing dataset for {model_name}...")
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

Tokenizing dataset for distilbert/distilgpt2...


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [9]:
# And apply
tokenized_dataset = tokenized_dataset.map(group_texts, batched=True)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [10]:
# Create an efficient collator which dynamically pads
# End-of-sequence as the padding token and mlm=False will
# use the inputs as labels, shifted to the right by one element
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
print(f"Instantiating model ({model_name})...")
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define the hyperparameters in the TrainingArguments
print("Creating training arguments (weights are stored at `results/causal_language_modeling`)...")

Instantiating model (distilbert/distilgpt2)...
Creating training arguments (weights are stored at `results/causal_language_modeling`)...


In [12]:
# for small dataset - adjusting batch size 
training_args = TrainingArguments(
    output_dir="results/causal_language_modeling",  # Where weights are stored
    learning_rate=1e-5,  # The learning rate during training
    per_device_train_batch_size=2,  # Number of samples per batch during training
    per_device_eval_batch_size=2,  # Number of samples per batch during evaluation
    num_train_epochs=10,  # How many iterations through the dataloaders should be done
    weight_decay=0.01,  # Regularization penalization
    evaluation_strategy="epoch",  # How often metrics on the evaluation dataset should be computed
    save_strategy="epoch",  # When to try and save the best model (such as a step number or every iteration)

)

In [13]:
# Create the `Trainer`, passing in the model and arguments
# the datasets to train on, how the data should be collated,
# and the method for computing our metrics
print("Creating `Trainer`...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

Creating `Trainer`...


In [14]:
# Initiate training
print("Training...")
trainer.train()

Training...


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.271400451660156, 'eval_runtime': 0.113, 'eval_samples_per_second': 8.852, 'eval_steps_per_second': 8.852, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.168320178985596, 'eval_runtime': 0.1127, 'eval_samples_per_second': 8.876, 'eval_steps_per_second': 8.876, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.079220294952393, 'eval_runtime': 0.1161, 'eval_samples_per_second': 8.612, 'eval_steps_per_second': 8.612, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.005306720733643, 'eval_runtime': 0.1002, 'eval_samples_per_second': 9.976, 'eval_steps_per_second': 9.976, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.9435272216796875, 'eval_runtime': 0.0973, 'eval_samples_per_second': 10.275, 'eval_steps_per_second': 10.275, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.894034385681152, 'eval_runtime': 0.1202, 'eval_samples_per_second': 8.322, 'eval_steps_per_second': 8.322, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.854833602905273, 'eval_runtime': 0.1192, 'eval_samples_per_second': 8.388, 'eval_steps_per_second': 8.388, 'epoch': 7.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.825794696807861, 'eval_runtime': 0.147, 'eval_samples_per_second': 6.805, 'eval_steps_per_second': 6.805, 'epoch': 8.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.806646823883057, 'eval_runtime': 0.1153, 'eval_samples_per_second': 8.672, 'eval_steps_per_second': 8.672, 'epoch': 9.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.7971415519714355, 'eval_runtime': 0.132, 'eval_samples_per_second': 7.575, 'eval_steps_per_second': 7.575, 'epoch': 10.0}
{'train_runtime': 22.177, 'train_samples_per_second': 0.451, 'train_steps_per_second': 0.451, 'train_loss': 5.16423225402832, 'epoch': 10.0}


TrainOutput(global_step=10, training_loss=5.16423225402832, metrics={'train_runtime': 22.177, 'train_samples_per_second': 0.451, 'train_steps_per_second': 0.451, 'total_flos': 267931238400.0, 'train_loss': 5.16423225402832, 'epoch': 10.0})

In [15]:
# Performing inference
text = "p2go has mostly "
# We need to tokenize the inputs and turn them to PyTorch tensors
encoded_input = tokenizer(text, return_tensors="pt")

In [16]:
#encoded_input = { 'input_ids' : encoded_input.input_ids , 'attention_mask' : encoded_input.attention_mask}

In [None]:
# To move the batch to the right device automatically, use `PartialState().device`
# which will always work no matter the environment
# encoded_input = encoded_input['input_ids'].to(PartialState().device)
# encoded_input.to("cuda")

In [17]:
# Then we can perform inference via `model.generate`:
print("Performing inference...")
outputs = model.generate(encoded_input['input_ids'], max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Performing inference...


In [18]:
# Finally, decode our outputs
print(f"Prediction: {tokenizer.batch_decode(outputs, skip_special_tokens=True)}")

Prediction: ['p2go has mostly irc servers, the server has been a little much smaller. It can\'t run all the servers. It\'s only a bit slower than a normal web server. But I mean how fast do we need to run servers? I just have to give my name. This is the code in the README file of the project. It goes as follows:\n\n\nimport { "git clone https://github.com/xrudy-sip/xrudy-sip/']
