# 言語モデルファインチューニング

## インポート

In [16]:
import itertools
import jsonlines

from datasets import load_dataset
from pprint import pprint

#from llama import BasicModelRunner
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  """Generate a continuation for ``text`` and return only the newly generated part.

  Args:
    text: Prompt string.
    model: Object exposing ``.device`` and ``.generate(...)``. Assumed to be a
      decoder-only (causal) model whose output sequences begin with the prompt
      tokens -- TODO confirm before using with other architectures.
    tokenizer: Object exposing ``.encode(...)`` and ``.decode(...)``.
    max_input_tokens: The prompt is truncated to at most this many tokens.
    max_output_tokens: Upper bound on the number of newly generated tokens.

  Returns:
    The decoded text of the first generated sequence with the prompt tokens removed.
  """
  # Tokenize
  input_ids = tokenizer.encode(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=max_input_tokens
  )

  # Generate. max_new_tokens bounds only the continuation; the previous
  # max_length bound counted the prompt tokens as well, so a prompt close to
  # (or longer than) max_output_tokens left little or no room for an answer.
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(model.device),
    max_new_tokens=max_output_tokens
  )

  # Strip the prompt at token level. Slicing the decoded string by len(text)
  # is wrong whenever the prompt was truncated or does not decode back to
  # exactly the same characters.
  prompt_token_count = input_ids.shape[1]
  answer_tokens = generated_tokens_with_prompt[0][prompt_token_count:]

  # Decode
  return tokenizer.decode(answer_tokens, skip_special_tokens=True)

# データ準備

## トークナイザ

In [17]:
# Load the tokenizer published under the "EleutherAI/pythia-70m" model id.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 2986e4ec-ddf3-412b-b4b0-a6692a3f4a97)')' thrown while requesting HEAD https://huggingface.co/EleutherAI/pythia-70m/resolve/main/tokenizer_config.json


Downloading (…)lve/main/config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  


Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

### エンコード

In [34]:
# Tokenize one sentence and keep only its token ids.
sample_encoding = tokenizer("Hi, how are you?")
encoded_text = sample_encoding["input_ids"]
# Trailing expression so the notebook displays the ids.
encoded_text

[12764, 13, 849, 403, 368, 32]

### デコード

In [35]:
# Turn the token ids held in `encoded_text` back into a string and print it.
decoded_text = tokenizer.decode(encoded_text)
print(decoded_text)

Hi, how are you?


### 一度に複数をトークナイズ

In [36]:
# Three sample sentences of different lengths, tokenized in a single call.
list_texts = [
    "Hi, how are you?",
    "I'm good",
    "Yes",
]
encoded_texts = tokenizer(list_texts)
batch_input_ids = encoded_texts["input_ids"]
print("Encoded several texts: ", batch_input_ids)

Encoded several texts:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175], [4374]]


## パディングと切り捨て

### パディング

In [38]:
# Reuse the end-of-sequence token as the padding token, then tokenize the batch with padding=True.
tokenizer.pad_token = tokenizer.eos_token
encoded_texts_longest = tokenizer(list_texts, padding=True)
padded_ids = encoded_texts_longest["input_ids"]
print("Using padding: ", padded_ids)

# Decode each of the three rows to show what the tokenizer appended.
for row in range(3):
    print(tokenizer.decode(padded_ids[row]))

Using padding:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175, 0, 0, 0], [4374, 0, 0, 0, 0, 0]]
Hi, how are you?
I'm good<|endoftext|><|endoftext|><|endoftext|>
Yes<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


### 切り捨て

In [39]:
# Cap every sequence at three tokens. This cell does not set
# tokenizer.truncation_side, so the side that gets cut is whatever the
# tokenizer is currently configured with.
encoded_texts_truncation = tokenizer(list_texts, truncation=True, max_length=3)
truncated_ids = encoded_texts_truncation["input_ids"]
print("Using truncation: ", truncated_ids)

# Decode each of the three rows to show which part of the text survived.
for row in range(3):
    print(tokenizer.decode(truncated_ids[row]))

Using truncation:  [[403, 368, 32], [42, 1353, 1175], [4374]]
 are you?
I'm good
Yes


In [43]:
# Truncate on the right: tokens beyond max_length are dropped from the end of
# each sequence. The printed label used to say "left-side", which contradicted
# both the setting below and the variable name.
tokenizer.truncation_side = "right"
encoded_texts_truncation_right = tokenizer(list_texts, max_length=3, truncation=True)
print("Using right-side truncation: ", encoded_texts_truncation_right["input_ids"])

# Decode each row to show which part of the text survived.
print(tokenizer.decode(encoded_texts_truncation_right["input_ids"][0]))
print(tokenizer.decode(encoded_texts_truncation_right["input_ids"][1]))
print(tokenizer.decode(encoded_texts_truncation_right["input_ids"][2]))

Using left-side truncation:  [[12764, 13, 849], [42, 1353, 1175], [4374]]
Hi, how
I'm good
Yes


In [46]:
# Combine both options: truncate to three tokens and pad with padding=True.
encoded_texts_both = tokenizer(
    list_texts,
    padding=True,
    truncation=True,
    max_length=3,
)
both_ids = encoded_texts_both["input_ids"]
print("Using both padding and truncation: ", both_ids)

# Decode each of the three rows.
for row in range(3):
    print(tokenizer.decode(both_ids[row]))

Using both padding and truncation:  [[12764, 13, 849], [42, 1353, 1175], [4374, 0, 0]]
Hi, how
I'm good
Yes<|endoftext|><|endoftext|>


## データセットのトークナイズ

In [49]:
def tokenize_function(examples, max_length=2048):
    """Tokenize the first example of a batch for fine-tuning.

    Only index 0 of each column is read, so this is meant to be used with
    ``batched=True, batch_size=1`` -- any further rows in ``examples`` are ignored.

    The text is built from the first matching pair of columns:
    ``question`` + ``answer``, then ``input`` + ``output``, otherwise ``text``.

    Side effects on the module-level ``tokenizer``: ``pad_token`` is set to
    ``eos_token`` and ``truncation_side`` is set to ``"left"``. Both settings
    persist after the call.

    Args:
        examples: Mapping from column name to a list of values.
        max_length: Maximum number of tokens kept; longer texts lose tokens
            from the left.

    Returns:
        The tokenizer output for the text, as NumPy arrays.
    """
    if "question" in examples and "answer" in examples:
        text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
        text = examples["input"][0] + examples["output"][0]
    else:
        text = examples["text"][0]

    # Kept so later cells still see the pad token configured here.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.truncation_side = "left"

    # A single truncating call replaces the earlier tokenize-measure-retokenize
    # sequence: truncating to min(token_count, max_length) gives the same ids
    # as truncating to max_length.
    return tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length,
    )

In [50]:
# JSON-lines file holding the fine-tuning examples. It was never assigned in
# the notebook, so a fresh kernel failed with NameError before reaching the file.
# NOTE(review): the path is relative to the working directory -- the file must exist there.
filename = "lamini_docs.jsonl"

# Use the load_dataset name imported in the first cell; the `datasets` module
# itself is only imported further down in the training section.
finetuning_dataset_loaded = load_dataset("json", data_files=filename, split="train")

# tokenize_function reads only the first row of each batch, hence batch_size=1.
tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)


FileNotFoundError: Unable to find 'E:/git-proj/jupyter-snippet\lamini_docs.jsonl'

In [None]:
# Add a "labels" column that is a copy of "input_ids". The guard keeps the
# cell re-runnable: without it a second run would try to add a "labels"
# column to a dataset that already has one.
if "labels" not in tokenized_dataset.column_names:
    tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

### トレーニングとテストに分割

In [None]:
# Hold out 10% of the rows as a test split; the fixed seed makes the shuffle repeatable.
split_dataset = tokenized_dataset.train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=123,
)
print(split_dataset)

## データセットのロード

In [7]:
# Stream the English C4 training split instead of downloading it.
# The bare "c4" id is the deprecated legacy loader; "allenai/c4" is the
# maintained repository for the same corpus.
pretrained_dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)


Downloading builder script:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.77k [00:00<?, ?B/s]

### 内容

In [10]:
# Peek at the first few records of the streamed dataset.
n = 5
print("Pretrained dataset:")
top_n = itertools.islice(pretrained_dataset, n)
for record in top_n:
    print(record)

Pretrained dataset:
{'text': 'Beginners BBQ Class Taking Place in Missoula!\nDo you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.\nHe will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.\nThe cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.', 'timestamp': '2019-04-25T12:57:54Z', 'url': 'https://klyq.com/beginners-bbq-class-taking-place-in-missoula/'}
{'text': 'Discussion in \'Mac OS X Lion (10.7)\' started by axboi87, Jan 20, 2012.\nI\'ve got a 500gb inter

# トレーニング

In [None]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import logging
import time
import torch
import transformers
import pandas as pd

from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner
from llama import BasicModelRunner

# Module-level state for the training section: a placeholder config slot and
# a logger named after the current module.
global_config = None
logger = logging.getLogger(__name__)

In [None]:
# Dataset identifier and the flag that is passed along in training_config["datasets"].
dataset_path, use_hf = "lamini/lamini_docs", True

In [None]:
# Model id used below for both the tokenizer and the base model.
model_name = "EleutherAI/pythia-70m"

In [None]:
# Assemble the nested configuration section by section.
model_settings = dict(pretrained_name=model_name, max_length=2048)
dataset_settings = dict(use_hf=use_hf, path=dataset_path)

training_config = dict(
    model=model_settings,
    datasets=dataset_settings,
    verbose=True,
)

In [None]:
# Fresh tokenizer for the training section, padded with the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# tokenize_and_split_data is not defined in this notebook; presumably it
# comes from `from utilities import *` -- TODO confirm.
split_datasets = tokenize_and_split_data(training_config, tokenizer)
train_dataset, test_dataset = split_datasets

for dataset_split in (train_dataset, test_dataset):
    print(dataset_split)

In [None]:
# Load the pretrained causal language model identified by model_name.
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Use CUDA when at least one CUDA device is reported, otherwise fall back to the CPU.
device_count = torch.cuda.device_count()
has_gpu = device_count > 0

logger.debug("Select GPU device" if has_gpu else "Select CPU device")
device = torch.device("cuda" if has_gpu else "cpu")

In [None]:
# Move the base model to the selected device. As the cell's last expression,
# the value returned by .to() is also displayed by the notebook.
base_model.to(device)

In [None]:
# Number of training steps; the step count is embedded in the model name,
# which doubles as the checkpoint directory.
max_steps = 3
output_dir = trained_model_name = "lamini_docs_{}_steps".format(max_steps)

In [None]:
# Optimisation settings.
optimisation_options = dict(
    learning_rate=1.0e-5,
    optim="adafactor",
    warmup_steps=1,  # warmup steps for the learning-rate scheduler
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
)

# Training length and batch sizes.
schedule_options = dict(
    num_train_epochs=1,
    # Max steps to train for (each step is a batch of data);
    # overrides num_train_epochs if not -1.
    max_steps=max_steps,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)

# Logging and evaluation cadence.
# NOTE(review): eval_steps is larger than the max_steps value set in this
# notebook (3) -- confirm whether any evaluation is expected during training.
reporting_options = dict(
    disable_tqdm=False,  # progress bars stay enabled
    logging_strategy="steps",
    logging_steps=1,
    evaluation_strategy="steps",
    eval_steps=120,
)

# Checkpointing and best-model selection.
checkpoint_options = dict(
    output_dir=output_dir,
    overwrite_output_dir=False,  # existing output directory content is not overwritten
    save_steps=120,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss counts as better
)

training_args = TrainingArguments(
    **optimisation_options,
    **schedule_options,
    **reporting_options,
    **checkpoint_options,
)

In [None]:
# Build a zero-filled stand-in batch of one sequence at the configured
# maximum length and ask the model for its floating-point-operation estimate.
dummy_batch = {
    "input_ids": torch.zeros((1, training_config["model"]["max_length"]))
}
flops_per_batch = base_model.floating_point_ops(dummy_batch)

# Scale by the number of batches accumulated per optimisation step.
model_flops = flops_per_batch * training_args.gradient_accumulation_steps

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

In [None]:
# `Trainer` is not imported by name in this notebook; presumably it is
# provided by `from utilities import *` -- TODO confirm.
trainer = Trainer(
    # What to train and how
    model=base_model,
    args=training_args,
    # Data
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Extra bookkeeping values passed through to the trainer
    model_flops=model_flops,
    total_steps=max_steps,
)

### トレーニング実行

In [None]:
# Run training and keep whatever trainer.train() returns.
training_output = trainer.train()

### ローカルに保存

In [None]:
# Save the trained model into a "final" sub-directory of the output directory.
save_dir = "{}/final".format(output_dir)
trainer.save_model(save_dir)

print("Saved model to:", save_dir)

In [None]:
# Reload the model that was just written to save_dir; local_files_only=True
# asks from_pretrained to use files on disk only.
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)

In [None]:
# Move the reloaded model to the selected device. As the cell's last
# expression, the value returned by .to() is also displayed by the notebook.
finetuned_slightly_model.to(device) 

In [None]:
# Take the question of the first test example and ask the fine-tuned model.
first_test_example = test_dataset[0]
test_question = first_test_example['question']
print("Question input (test):", test_question)

finetuned_answer = inference(test_question, finetuned_slightly_model, tokenizer)
print("Finetuned slightly model's answer: ")
print(finetuned_answer)

In [None]:
# Reference answer of the same first test example, for comparison with the model output.
reference_example = test_dataset[0]
test_answer = reference_example['answer']
print("Target answer output (test):", test_answer)

# 評価

### ARCベンチマーク

In [None]:
# Shell out to lm-evaluation-harness/main.py to run the arc_easy task on CPU.
# NOTE(review): the command passes pretrained=lamini/lamini_docs_finetuned rather
# than the checkpoint saved under save_dir in this notebook -- confirm that is intended.
!python lm-evaluation-harness/main.py --model hf-causal --model_args pretrained=lamini/lamini_docs_finetuned --tasks arc_easy --device cpu