<a href="https://colab.research.google.com/github/sunny0103/DeepLearning_nlp_projects/blob/main/nlg_practice/Wikitext_NLG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch]



In [2]:
!mkdir my_data
!curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" > /dev/null
!curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" -o my_data/wiki_small.txt

mkdir: cannot create directory ‘my_data’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1323k  100 1323k    0     0  1139k      0  0:00:01  0:00:01 --:--:-- 17.4M


In [3]:
path = '/content/my_data/wiki_small.txt'

In [4]:
import json
import os
import pickle
import random
import time
import warnings
from typing import Dict, List, Optional

import torch
from torch.utils.data.dataset import Dataset

from tokenizers import SentencePieceBPETokenizer
from tokenizers.normalizers import BertNormalizer

from transformers import (GPT2Config,
                          GPT2LMHeadModel,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments)
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging

from filelock import FileLock


In [5]:
tokenizer = SentencePieceBPETokenizer()

In [6]:
tokenizer._tokenizer.normalizer = BertNormalizer(
    clean_text = True,
    handle_chinese_chars=False,
    lowercase=False
)

### Train tokenizer

In [7]:
tokenizer.train(
    path,
    vocab_size= 10000,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
)

In [8]:
sample_text="문장생성 실습을 위한 샘플 텍스트 입니다."

In [9]:
print(tokenizer.encode(sample_text))
print(tokenizer.encode(sample_text).ids)
print(tokenizer.encode(sample_text).tokens)

Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[1073, 723, 545, 556, 1083, 589, 702, 1553, 105, 3, 930, 2984, 1203, 3234]
['▁문', '장', '생', '성', '▁실', '습', '을', '▁위한', '▁', '<unk>', '플', '▁텍스트', '▁입', '니다.']


In [10]:
print(tokenizer.decode(tokenizer.encode(sample_text).ids, skip_special_tokens=True))

문장생성 실습을 위한 플 텍스트 입니다.


**Save trained tokenizer**

In [11]:
tokenizer.save_model(".")

['./vocab.json', './merges.txt']

In [12]:
tokenizer = SentencePieceBPETokenizer.from_file(vocab_filename="vocab.json", merges_filename="merges.txt")

In [13]:
tokenizer.add_special_tokens(["<s>", "</s>", "<unk>", "<pad>"])

3

In [14]:
tokenizer.bos_token_id = tokenizer.token_to_id("<bos>")
tokenizer.eos_token_id = tokenizer.token_to_id("<eos>")
tokenizer.unk_token_id = tokenizer.token_to_id("<unk>")
tokenizer.pad_token_id = tokenizer.token_to_id("<pad>")

In [15]:
sample_add_text = "<s>" + sample_text +"</s>"

In [16]:
print(tokenizer.encode(sample_add_text))
print(tokenizer.encode(sample_add_text).ids)
print(tokenizer.encode(sample_add_text).tokens)

Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[0, 1073, 723, 545, 556, 1083, 589, 702, 1553, 105, 3, 930, 2984, 1203, 3234, 2]
['<s>', '▁문', '장', '생', '성', '▁실', '습', '을', '▁위한', '▁', '<unk>', '플', '▁텍스트', '▁입', '니다.', '</s>']


In [17]:
config = GPT2Config(
    vocab_size = tokenizer.get_vocab_size(),
    bos_token_id = tokenizer.token_to_id("<s>"),
    eos_token_id = tokenizer.token_to_id("</s>")
)
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.34.1",
  "use_cache": true,
  "vocab_size": 10000
}

In [18]:
model = GPT2LMHeadModel(config)

In [19]:
model.num_parameters()

93522432

In [20]:
class GPTDataset(Dataset):
  def __init__(self, tokenzier, file_path, block_size):
    block_size = block_size - tokenizer.num_special_tokens_to_add(is_pair=False)
    text =""
    with open (path, encoding="utf-8") as f:
      lines = f.readlines()
      for line in lines:
        line = line.strip()
        line = "<s>"+line+"</s>"
        text += line

    tokenzied_text = tokenizer.encode(text).ids

    self.dataset = []
    for i in range(0, len(tokenzied_text)-block_size+1, block_size):
      self.dataset.append(tokenzied_text[i:i+block_size])

  def __getitem__(self, index):
      return torch.tensor(self.dataset[index],dtype=torch.long)

  def __len__(self):
      return len(self.dataset)


In [21]:
dataset = GPTDataset(
    tokenzier = tokenizer,
    file_path = path,
    block_size = 128
)

In [22]:
dataset[0]

tensor([   0, 3997, 3546, 8406,  462,    5, 5481, 9526, 1798, 1890, 2297, 1262,
        9624, 2679, 1188, 2174,    2,    0, 5709, 5481,  254, 6466,  750, 3426,
         872, 1556,  680,  894, 1626, 9222,  586, 3620, 1010, 3303,    2,    0,
        6466, 7418, 2305,  402, 2217, 1074,    2,    0, 1013, 1107, 3715,  645,
        8576, 1024,  940,   93, 7323,  370,   93,  721, 9293,  705, 1651,  452,
        3167, 1032, 1074,    2,    0, 6343, 1262, 3715, 1009, 2932, 1176,  913,
        2037, 1171, 3228,  843,   93,  438,  974, 1486, 1017,    3, 1323, 3913,
        2095, 1042,    2,    0, 1382, 2068, 2225, 1095,  325,  843, 1823,  506,
           5, 1240, 7698,    2,    0, 3896, 6466, 1053, 1077,  686, 2318, 4649,
        5205, 5672, 1013, 1759,  116, 2742, 3004,  105,  655, 2283, 9763, 1192,
        1796, 2449, 2546, 9936, 6466, 1053, 1037,  533])

In [23]:
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [24]:
training_args = TrainingArguments(
    output_dir = "gpt2_model_output",
    num_train_epochs=120,
    per_device_train_batch_size=64,
    save_total_limit=2,
    logging_steps=600

)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator= collator,
    train_dataset = dataset
)

In [25]:
trainer.train()

Step,Training Loss
300,7.1872
600,5.9038
900,5.0306
1200,4.3772
1500,3.9143
1800,3.6499


TrainOutput(global_step=1800, training_loss=5.010489298502605, metrics={'train_runtime': 883.3351, 'train_samples_per_second': 128.853, 'train_steps_per_second': 2.038, 'total_flos': 7435064770560000.0, 'train_loss': 5.010489298502605, 'epoch': 60.0})

In [25]:
trainer.save_model()

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [27]:
torch.manual_seed(42)

<torch._C.Generator at 0x7dcfb4771330>

In [28]:
input_ids = torch.tensor(tokenizer.encode("<s> 문장생성 실습을", add_special_tokens=True).ids).unsqueeze(0).to('cuda')

In [29]:
# Greedy
output_sentences = model.generate(input_ids = input_ids,
                                  do_sample = True,
                                  max_length=100,
                                  num_return_sequences=3
                                  )

for generated_sentence in output_sentences:
  generated_sentence = generated_sentence.tolist()
  print("generated_sentence:{}".format(tokenizer.decode(generated_sentence, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


generated_sentence:문장생성 실습을 하는 것은 풀았다.
generated_sentence:문장생성 실습을 거쳐 된다.
generated_sentence:문장생성 실습을 추진할 수 없는 부조권·력을 회복해 참여하지 못했다.


In [31]:
output_sentences = model.generate(input_ids = input_ids,
                                  do_sample = True,
                                  max_length=100,
                                  top_k=50,
                                  num_return_sequences=3
                                  )

for generated_sentence in output_sentences:
  generated_sentence = generated_sentence.tolist()
  print("generated_sentence:{}".format(tokenizer.decode(generated_sentence, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


generated_sentence:문장생성 실습을 발표하게 된다.
generated_sentence:문장생성 실습을 보'되고, 1·너시대 총선에서이트인 시인 한국 전쟁 8개의 연료인 국제연, 해 8월 1일을 한 5만 500까지 삼성 진주로대력이 크게 성공하는 지한 이명박적으로 국민·6성을 전쟁 전문자는 '크도 없던 1000'만개포유가 보급되고 있다.
generated_sentence:문장생성 실습을 하는 것을 취하고 인해 불우리는 그 해고 것은 정성들을 가지고 있는데 대한민국 임시정부가 중심을장이다.


In [32]:
output_sentences = model.generate(input_ids = input_ids,
                                  do_sample = True,
                                  max_length=100,
                                  top_p=0.92,
                                  top_k=0,
                                  num_return_sequences=3
                                  )

for generated_sentence in output_sentences:
  generated_sentence = generated_sentence.tolist()
  print("generated_sentence:{}".format(tokenizer.decode(generated_sentence, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


generated_sentence:문장생성 실습을리와 1,대, 75 타불 빚롭고 조성하고 연결한이라는 구글의 폴링은와 폴링이 감빗 중요한흡에 등의 된다는노총 미치지로서, 각각의 치러전쟁 ~ 불 하는 등 추측하고 관련하여 한나라당이어동하는 누피 이전하고 있고, 암라고 하는조건 부동산 찾아되면서, 이름을내리고 왔다.
generated_sentence:문장생성 실습을 나누어 불어사이드형동산-5E 유지하는 복사된 일로는 19 발해 미승할 수사로 주로긍하고8 Sc음의 0이 불리는NN커에서는 위의 재단은 '론에 활동에 여성 7~협좌마해서 명공 말이다. 물리학상광역시 실패하였다.
generated_sentence:문장생성 실습을 해를 이념부가 다 배경군 묘,000욕과 Pool》을 기자의 초대 2016년 31일의 자유란드을 테란 달리 "호 프로그램들을 식민지들이 장학하고 웹 애 후에 대도를 주장하였다.
