# GPTNeo Fine-Tuning
Follows along with this video: https://www.youtube.com/watch?v=uE0_XKh2d6g

In [None]:
import pandas

## Model
- 기본적으로 모델들은 **PreTrainedModel** class를 상속받고 있음
- **PreTrainedModel**: 모델 전반에 걸쳐 적용되는 메소드를 가지고 있음 (학습된 모델 불러오기, 다운로드, 저장 등)

In [None]:
# Instantiate GPT-Neo with a causal-LM head from the pretrained
# "EleutherAI/gpt-neo-1.3B" checkpoint. The resulting `model` global is
# reused by later cells (config inspection and text generation).
from transformers import GPTNeoForCausalLM
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

## Tokenizer
- transformers는 바로 불러와 사용할 수 있도록 다양한 tokenizer를 각 모델에 맞추어 구비
- model을 사용할 때 명시했던 것과 동일한 ID로 tokenizer를 생성해야 함
- options: padding, truncation, return_tensors 등

In [None]:
# Build the tokenizer from the same checkpoint ID that was used for the model,
# so the token IDs match what the model expects.
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

# Encode a sample prompt (adjacent string literals are concatenated into one
# string). The recorded output shows the result holds `input_ids` and
# `attention_mask`.
encoded = tokenizer("In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
    "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
    "researchers was the fact that the unicorns spoke perfect English.")
# Bare expression: displays the encoding as the notebook cell output.
encoded

{'input_ids': [818, 257, 14702, 4917, 11, 5519, 5071, 257, 27638, 286, 28000, 19942, 2877, 287, 257, 6569, 11, 4271, 31286, 1850, 19272, 11, 287, 262, 843, 274, 21124, 13, 3412, 517, 6452, 284, 262, 4837, 373, 262, 1109, 326, 262, 28000, 19942, 5158, 2818, 3594, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Processor

정확히 어떤 역할인지 아직 파악하지 못해서, 이 부분은 참고 코드를 그대로 옮겨 적음.

In [None]:
class DataProcessor:
    """Base processor for preparing data for sequence classification.

    Every ``get_*`` method raises ``NotImplementedError`` here; a concrete
    processor is expected to subclass this and override them.
    """

    def get_example_from_tensor_dict(self, tensor_dict):
        """Return an example taken from a tensor dict.

        Args:
            tensor_dict: The tensor dict to read the example from.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def get_train_examples(self, data_dir):
        """Collect the ``InputExample`` items of the train data.

        Args:
            data_dir: Location of the data set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Collect the ``InputExample`` items of the dev (validation) data.

        Args:
            data_dir: Location of the data set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Collect the ``InputExample`` items of the test data.

        Args:
            data_dir: Location of the data set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def get_labels(self):
        """Return the labels used by the data set.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def tfds_map(self, example):
        """Adapt an example loaded from tfds (tensorflow-datasets) to this processor.

        When the processor defines more than one label, ``example.label`` is
        treated as an index and replaced in place by the matching entry of
        ``get_labels()``. Otherwise the example is left untouched.

        Args:
            example: Object with a ``label`` attribute convertible by ``int()``.

        Returns:
            The same ``example`` object (mutated in place when remapped).
        """
        # Fetch the label list once instead of once per use.
        labels = self.get_labels()
        if len(labels) > 1:
            example.label = labels[int(example.label)]
        return example

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a tab-separated ``.tsv`` file into a list of rows.

        Args:
            input_file: Path of the file to read.
            quotechar: Quote character forwarded to ``csv.reader``.

        Returns:
            A list with one list of string fields per row.
        """
        # Local import: the surrounding file does not import ``csv`` itself.
        import csv

        # "utf-8-sig" drops a leading BOM if present; newline="" lets the csv
        # module do its own newline handling, as its documentation requires.
        with open(input_file, "r", encoding="utf-8-sig", newline="") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))

## Config

In [None]:
from transformers import GPTNeoConfig

# Way 1: load the configuration directly from the checkpoint ID.
config = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-1.3B")
print(config.__class__)
print(config)

# Way 2: read the configuration off a loaded model.
# NOTE(review): this loads the checkpoint again and rebinds the global `model`
# that an earlier cell already created — confirm the reload is intended.
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

# Rebinds `config` to the model's own config object and prints it the same way.
config = model.config
print(config.__class__)
print(config)

<class 'transformers.models.gpt_neo.configuration_gpt_neo.GPTNeoConfig'>
GPTNeoConfig {
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      12
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 16,
  "num_layers": 24,
  "resid_dropout": 0,
  "su

## Trainer

In [None]:
# Prompt text to continue; the parenthesised adjacent literals form one string.
prompt = (
    "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
    "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
    "researchers was the fact that the unicorns spoke perfect English."
)

# Tokenize the prompt as tensors ("pt") and keep only the token IDs.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate a continuation with sampling enabled. The recorded output tensor
# holds 100 token IDs in total, starting with the prompt's own IDs.
# NOTE(review): no pad_token_id is passed; the recorded cell output shows
# generate() reporting that it sets pad_token_id to eos_token_id (50256).
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=100,
)
# Decode the batch back to text and take its first (only) entry.
gen_text = tokenizer.batch_decode(gen_tokens)[0]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Bare expression: displays the generated token-ID tensor as the cell output.
gen_tokens

tensor([[  818,   257, 14702,  4917,    11,  5519,  5071,   257, 27638,   286,
         28000, 19942,  2877,   287,   257,  6569,    11,  4271, 31286,  1850,
         19272,    11,   287,   262,   843,   274, 21124,    13,  3412,   517,
          6452,   284,   262,  4837,   373,   262,  1109,   326,   262, 28000,
         19942,  5158,  2818,  3594,    13,   564,   250,  1135,   547,   262,
           717,   284,  7073,   428,  3814,   287,   262,   843,   274,    11,
           447,   251,  1139,   262,  2351, 33636,  7093,    13,   564,   250,
          1026,   447,   247,    82,   281, 15313,  2858,    13,   447,   251,
           198,   198,   818,  1109,    11,   262,  4837,   910,   326,   340,
           447,   247,    82,   257, 31354,   329,   262,  4071, 13824,    13]])

In [None]:
# Bare expression: displays the decoded text (prompt plus continuation).
gen_text

'In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English. “We were the first to discover this region in the Andes,” says the National Geographic magazine. “It’s an exceptional environment.”\n\nIn fact, the researchers say that it’s a paradise for the rare beast.'

## Interface