<a href="https://colab.research.google.com/github/sunny0103/DeepLearning_nlp_projects/blob/main/nlg_practice%20/Wikitext_NLG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch]



In [2]:
!mkdir my_data
!curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" > /dev/null
!curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" -o my_data/wiki_small.txt

mkdir: cannot create directory ‘my_data’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1323k  100 1323k    0     0  1731k      0 --:--:-- --:--:-- --:--:-- 1731k


In [3]:
path = '/content/my_data/wiki_small.txt'

In [4]:
import json
import os
import pickle
import random
import time
import warnings
from typing import Dict, List, Optional

import torch
from torch.utils.data.dataset import Dataset

from tokenizers import SentencePieceBPETokenizer
from tokenizers.normalizers import BertNormalizer

from transformers import (GPT2Config,
                          GPT2LMHeadModel,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments)
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging

from filelock import FileLock


In [5]:
tokenizer = SentencePieceBPETokenizer()

In [6]:
tokenizer._tokenizer.normalizer = BertNormalizer(
    clean_text = True,
    handle_chinese_chars=False,
    lowercase=False
)

### Train tokenizer

In [7]:
tokenizer.train(
    path,
    vocab_size= 10000,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
)

In [8]:
sample_text="문장생성 실습을 위한 샘플 텍스트 입니다."

In [9]:
print(tokenizer.encode(sample_text))
print(tokenizer.encode(sample_text).ids)
print(tokenizer.encode(sample_text).tokens)

Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[1073, 722, 544, 555, 1083, 588, 701, 1553, 104, 3, 930, 2983, 1203, 3234]
['▁문', '장', '생', '성', '▁실', '습', '을', '▁위한', '▁', '<unk>', '플', '▁텍스트', '▁입', '니다.']


In [10]:
print(tokenizer.decode(tokenizer.encode(sample_text).ids, skip_special_tokens=True))

문장생성 실습을 위한 플 텍스트 입니다.


**Save trained tokenizer**

In [11]:
tokenizer.save_model(".")

['./vocab.json', './merges.txt']

In [12]:
tokenizer = SentencePieceBPETokenizer.from_file(vocab_filename="vocab.json", merges_filename="merges.txt")

In [13]:
tokenizer.add_special_tokens(["<s>", "</s>", "<unk>", "<pad>"])

3

In [14]:
tokenizer.bos_token_id = tokenizer.token_to_id("<bos>")
tokenizer.eos_token_id = tokenizer.token_to_id("<eos>")
tokenizer.unk_token_id = tokenizer.token_to_id("<unk>")
tokenizer.pad_token_id = tokenizer.token_to_id("<pad>")

In [15]:
sample_add_text = "<s>" + sample_text +"</s>"

In [16]:
print(tokenizer.encode(sample_add_text))
print(tokenizer.encode(sample_add_text).ids)
print(tokenizer.encode(sample_add_text).tokens)

Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[0, 1073, 722, 544, 555, 1083, 588, 701, 1553, 104, 3, 930, 2983, 1203, 3234, 2]
['<s>', '▁문', '장', '생', '성', '▁실', '습', '을', '▁위한', '▁', '<unk>', '플', '▁텍스트', '▁입', '니다.', '</s>']


In [17]:
config = GPT2Config(
    vocab_size = tokenizer.get_vocab_size(),
    bos_token_id = tokenizer.token_to_id("<s>"),
    eos_token_id = tokenizer.token_to_id("</s>")
)
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.34.1",
  "use_cache": true,
  "vocab_size": 10000
}

In [18]:
model = GPT2LMHeadModel(config)

In [19]:
model.num_parameters()

93522432

In [20]:
class GPTDataset(Dataset):
  def __init__(self, tokenzier, file_path, block_size):
    block_size = block_size - tokenizer.num_special_tokens_to_add(is_pair=False)
    text =""
    with open (path, encoding="utf-8") as f:
      lines = f.readlines()
      for line in lines:
        line = line.strip()
        line = "<s>"+line+"</s>"
        text += line

    tokenzied_text = tokenizer.encode(text).ids

    self.dataset = []
    for i in range(0, len(tokenzied_text)-block_size+1, block_size):
      self.dataset.append(tokenzied_text[i:i+block_size])

  def __getitem__(self, index):
      return torch.tensor(self.dataset[index],dtype=torch.long)

  def __len__(self):
      return len(self.dataset)


In [21]:
dataset = GPTDataset(
    tokenzier = tokenizer,
    file_path = path,
    block_size = 128
)

In [22]:
dataset[0]

tensor([   0, 3997, 3546, 8404,  462,    4, 5481, 9527, 1798, 1890, 2297, 1262,
        9625, 2679, 1188, 2174,    2,    0, 5709, 5481,  254, 6466,  749, 3426,
         873, 1556,  679,  895, 1627, 9222,  585, 3621, 1010, 3303,    2,    0,
        6466, 7418, 2305,  402, 2217, 1074,    2,    0, 1013, 1107, 3716,  645,
        8574, 1024,  940,   92, 7323,  370,   92,  720, 9294,  704, 1651,  452,
        3167, 1032, 1074,    2,    0, 6343, 1262, 3716, 1009, 2931, 1176,  913,
        2037, 1171, 3228,  843,   92,  438,  974, 1486, 1017,    3, 1323, 3914,
        2095, 1042,    2,    0, 1383, 2068, 2225, 1095,  325,  843, 1823,  505,
           4, 1240, 7698,    2,    0, 3897, 6466, 1053, 1077,  685, 2318, 4649,
        5204, 5672, 1013, 1759,  115, 2742, 3003,  104,  654, 2283, 9764, 1192,
        1796, 2449, 2546, 9937, 6466, 1053, 1037,  532])

In [23]:
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [24]:
training_args = TrainingArguments(
    output_dir = "gpt2_model_output",
    num_train_epochs=120,
    per_device_train_batch_size=64,
    save_total_limit=2,
    logging_steps=600

)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator= collator,
    train_dataset = dataset
)

In [25]:
trainer.train()

Step,Training Loss
600,6.5137
1200,4.3798
1800,2.8397
2400,1.7728
3000,1.1454
3600,0.8508


TrainOutput(global_step=3600, training_loss=2.917037319607205, metrics={'train_runtime': 1794.1045, 'train_samples_per_second': 126.882, 'train_steps_per_second': 2.007, 'total_flos': 1.487012954112e+16, 'train_loss': 2.917037319607205, 'epoch': 120.0})

In [26]:
trainer.save_model()

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [28]:
torch.manual_seed(42)

<torch._C.Generator at 0x7c674879d310>

In [29]:
input_ids = torch.tensor(tokenizer.encode("<s> 문장생성 실습을", add_special_tokens=True).ids).unsqueeze(0).to('cuda')

In [30]:
# random sampling
output_sentences = model.generate(input_ids = input_ids,
                                  do_sample = True,
                                  max_length=50,
                                  num_return_sequences=3
                                  )

for generated_sentence in output_sentences:
  generated_sentence = generated_sentence.tolist()
  print("generated_sentence:{}".format(tokenizer.decode(generated_sentence, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


generated_sentence:문장생성 실습을 차지하던 새천년민주당 등 여러 화식을 받았다.
generated_sentence:문장생성 실습을 거쳐 사용하며 영향라 회사와 2시였다.
generated_sentence:문장생성 실습을 받았다.


In [31]:
#top-k sampling
output_sentences = model.generate(input_ids = input_ids,
                                  do_sample = True,
                                  max_length=50,
                                  top_k=50,
                                  num_return_sequences=3
                                  )

for generated_sentence in output_sentences:
  generated_sentence = generated_sentence.tolist()
  print("generated_sentence:{}".format(tokenizer.decode(generated_sentence, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


generated_sentence:문장생성 실습을 두고 복아 문화 비정 및 이에 지그디기의 경영리환한 중국 대륙아 최예, 사시자라고 부른다.
generated_sentence:문장생성 실습을 시도하였으나 임시정부의 웹취 시장으로 용동계과는 이어져고등 이루어져 있다.
generated_sentence:문장생성 실습을 열어었고, 선박이다.


In [32]:
# top-p sampling
output_sentences = model.generate(input_ids = input_ids,
                                  do_sample = True,
                                  max_length=50,
                                  top_p=0.92,
                                  top_k=0,
                                  num_return_sequences=3
                                  )

for generated_sentence in output_sentences:
  generated_sentence = generated_sentence.tolist()
  print("generated_sentence:{}".format(tokenizer.decode(generated_sentence, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


generated_sentence:문장생성 실습을 계기로 3월회국, 문왔다고 하면, 북동 소형이었으나 단체 핵리가메리사는 경쟁 총도는 양 1년을였다.
generated_sentence:문장생성 실습을 이용한사이트 의무총치편F갔다.
generated_sentence:문장생성 실습을 동원해 폴링의듬고 아이 생활 체결하여 경장에 취임하였다.
