## Language model training (fine-tuning)

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, AutoConfig
import torch
import math
import mlflow

### Prepare dataset

In [3]:
# Load both joke CSVs and stack them into a single Question/Answer frame.
jokes1 = pd.read_csv('./data/jokes.csv', usecols=['Question', 'Answer'])
jokes2 = pd.read_csv('./data/jokes_score_name_clean.csv', usecols=['q', 'a'])
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is the
# supported equivalent and aligns the two frames on column names the same way.
df = pd.concat(
    [jokes1, jokes2.rename(columns={"q": "Question", "a": "Answer"})],
    ignore_index=True,
)
df.head()

Unnamed: 0,Question,Answer
0,Did you hear about the Native American man tha...,He nearly drown in his own tea pee.
1,What's the best anti diarrheal prescription?,Mycheexarphlexin
2,What do you call a person who is outside a doo...,Matt
3,Which Star Trek character is a member of the m...,Jean-Luc Pickacard
4,What's the difference between a bullet and a h...,A bullet doesn't miss Harambe


In [4]:
# Build one training string per joke: the question and the answer, each introduced
# by its marker, followed by the end-of-text marker.
question_part = '<|question|> ' + df['Question']
answer_part = '\n<|answer|> ' + df['Answer'] + ' <|endoftext|>'
df['text'] = question_part + answer_part

In [5]:
# Wrap the text column in a Hugging Face Dataset and hold out 10% for evaluation.
# A fixed seed makes the split (and therefore the reported perplexity) reproducible
# across kernel restarts.
dataset = Dataset.from_pandas(df[['text']]).train_test_split(test_size=0.1, seed=42)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 154437
    })
    test: Dataset({
        features: ['text'],
        num_rows: 17160
    })
})


### Causal Language modeling

In [8]:
# Pretrained checkpoint used as the starting point for transfer learning (GPT2 family).
model_checkpoint = "distilgpt2"

# Fast tokenizer matching the checkpoint (BPE for GPT2).
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [8]:
# Tokenize every example; the raw 'text' column is dropped afterwards.
def tokenize_function(examples):
    """Run the module-level tokenizer over the 'text' field of a batch."""
    texts = examples['text']
    return tokenizer(texts)


tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=['text'],
)

HBox(children=(FloatProgress(value=0.0, max=155.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (2003 > 1024). Running this sequence through the model will result in indexing errors





HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [9]:
# concatenate text to chunks of a certain block size (reduce if GPU runs out of memory)
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Args:
        examples: mapping of column name -> list of per-example lists
            (must contain an "input_ids" column).

    Returns:
        dict with the same columns, each a list of chunks of exactly
        ``block_size`` items, plus a "labels" column that is a copy of the
        "input_ids" chunk list. The trailing remainder shorter than
        ``block_size`` is dropped.
    """
    # Local import keeps this cell runnable without touching the imports cell.
    from itertools import chain

    # Flatten each column in linear time; sum(list_of_lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels column is a copy of the input_ids chunks.
    result["labels"] = result["input_ids"].copy()
    return result


# Regroup the tokenized examples into fixed-size blocks, 1000 examples per batch.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=1)

HBox(children=(FloatProgress(value=0.0, max=155.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [10]:
# Point MLflow at the tracking server on localhost and select the experiment to log under.
# If an earlier run was left open in this kernel, close it with mlflow.end_run() first.
experiment_name = 'experiment1'
MLFLOW_SERVER_URL = 'http://127.0.0.1:5000/'

mlflow.set_tracking_uri(MLFLOW_SERVER_URL)
mlflow.set_experiment(experiment_name)

In [13]:
# run experiment: fine-tune the checkpoint, evaluate it, and record the result in one MLflow run
with mlflow.start_run():
    # Load from the shared `model_checkpoint` (not a repeated literal) so the model
    # always matches the tokenizer that produced the training data.
    model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

    learning_rate = 2e-5
    weight_decay = 0.01

    training_args = TrainingArguments(
        experiment_name,  # output directory for checkpoints
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        weight_decay=weight_decay
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["test"],
    )

    trainer.train()

    # Perplexity is the exponential of the evaluation loss.
    eval_results = trainer.evaluate()
    perplexity = math.exp(eval_results['eval_loss'])
    print(f'Perplexity: {perplexity:.2f}')

    # save the hyperparameters, the metric and the fine-tuned model to the run
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("weight_decay", weight_decay)
    mlflow.log_metric("perplexity", perplexity)
    mlflow.pytorch.log_model(model, 'models')

loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at C:\Users\Yufung/.cache\huggingface\transformers\f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
Model config GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_

Epoch,Training Loss,Validation Loss
1,2.7708,2.685315
2,2.694,2.62953
3,2.6555,2.613149


Saving model checkpoint to experiment1\checkpoint-500
Configuration saved in experiment1\checkpoint-500\config.json
Model weights saved in experiment1\checkpoint-500\pytorch_model.bin
Saving model checkpoint to experiment1\checkpoint-1000
Configuration saved in experiment1\checkpoint-1000\config.json
Model weights saved in experiment1\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to experiment1\checkpoint-1500
Configuration saved in experiment1\checkpoint-1500\config.json
Model weights saved in experiment1\checkpoint-1500\pytorch_model.bin
Saving model checkpoint to experiment1\checkpoint-2000
Configuration saved in experiment1\checkpoint-2000\config.json
Model weights saved in experiment1\checkpoint-2000\pytorch_model.bin
Saving model checkpoint to experiment1\checkpoint-2500
Configuration saved in experiment1\checkpoint-2500\config.json
Model weights saved in experiment1\checkpoint-2500\pytorch_model.bin
Saving model checkpoint to experiment1\checkpoint-3000
Configuration

Perplexity: 13.64


### Generate text

In [6]:
# load the model logged by a previous MLflow run and place it on the CPU
logged_model = 'runs:/466254aedb4846549fa1acbfaceb4471/models'
device = torch.device('cpu')
loaded_model = mlflow.pytorch.load_model(logged_model).to(device)
# Explicitly switch to inference mode so dropout is disabled during generation,
# regardless of the mode the module was in when it was saved.
loaded_model.eval()

In [9]:
# Re-create the fast tokenizer for the same checkpoint the model was fine-tuned from.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [10]:
# tokenize input text, prefixed with the question marker used in the training text
sample_input = 'Who is'
input_ids = tokenizer.encode('<|question|> ' + sample_input, return_tensors='pt')
# Keep the prompt on the same device as the model so this cell also works off-CPU.
input_ids = input_ids.to(loaded_model.device)

# Top-K and Top-p sampling
sample_outputs = loaded_model.generate(
    input_ids,
    # Single unpadded prompt: every position is a real token.
    attention_mask=torch.ones_like(input_ids),
    do_sample=True,
    max_length=100,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    # GPT2 has no pad token; state the EOS fallback explicitly instead of relying
    # on generate() to pick it (and warn about it) on every call.
    pad_token_id=tokenizer.eos_token_id,
)

# decode output and replace the markers with readable labels
sample_output = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
print(sample_output.replace('<|question|>', 'Question:').replace('<|answer|>', 'Answer:'))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: Who is the best type of person to be a monk?
Answer: The one who's the most. 


In [None]:
# Write the in-memory `model` to the local "model" directory in MLflow's PyTorch format.
mlflow.pytorch.save_model(pytorch_model=model, path="model")

In [4]:
# Save the tokenizer files and the model config of the base checkpoint into the
# current directory. `model_checkpoint` is reused instead of repeating the literal
# so these files always correspond to the checkpoint the model was fine-tuned from.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
config = AutoConfig.from_pretrained(model_checkpoint)

tokenizer.save_pretrained('./')
config.save_pretrained('./')