<a href="https://colab.research.google.com/github/yoonhero/nanoGPT/blob/master/MurimGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tiktoken
  Downloading tiktoken-0.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.26.0
  Downloading requests-2.28.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting blobfile>=2
  Downloading blobfile-2.0.1-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 KB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodomex~=3.8
  Downloading pycryptodomex-3.17-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
Collecting ch

In [63]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf
!fc-cache -fv

Reading package lists... Done
Building dependency tree       
Reading state information... Done
fonts-nanum is already the newest version (20180306-3).
0 upgraded, 0 newly installed, 0 to remove and 22 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 10 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/n

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [64]:
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'NanumBarunGothic'

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import tiktoken
import os

# Hyper Parameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
max_iters = 5000
start_epoch = 0
eval_interval = 500
save_interval = 2000
# learning_rate = 3e-4
learning_rate = 1e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 256
n_heads = 16
n_layer = 6
dropout = 0.2
PATH="/content/drive/MyDrive/tmp/checkpoints/"
load = True
# --------------------
os.makedirs(PATH, exist_ok=True)

with open("/content/drive/MyDrive/korean_murim_book.txt", "r", encoding="cp949") as f:
    text = f.read()

enc = tiktoken.get_encoding("gpt2")
encode = lambda s: enc.encode(s)
decode = lambda l: enc.decode(l)
vocab_size = enc.n_vocab

# text = text[:100000]
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [49]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
            out[split] = loss.mean()
    model.train()
    return out


class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # (B, T, C)
        q = self.query(x) # (B, T, C)

        # compute attention scores
        wei = q @ k.transpose(-2, -1) * C ** -0.5 # (B, T, C) @ (B, C, T) => (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf")) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)

        # perform the weighted aggregatoin of the values
        v = self.value(x) # (B, T, C)
        out = wei @ v # (B, T, C)
        return out
    

class MultiHeadAttention(nn.Module):
    # Multiple heads of self-attention in parallel
    def __init__(self, num_heads, head_size) -> None:
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd, n_embd),
            nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block: communication followd by computation"""

    def __init__(self, n_embd, n_heads):
        # n_embd: embedding dimension, n_heads: the number of the heads 
        super().__init__()
        head_size = n_embd // n_heads
        self.sa = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
    
    def forward(self, x):
        x = x+self.sa(self.ln1(x))
        x = x+self.ffwd(self.ln2(x))
        return x


class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each toekn directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.positional_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_heads = MultiHeadAttention(n_heads, n_embd//n_heads)
        # feed forward layer is needed for think about the self attention score 
        # when we pass the self attention score straight forward to the last layer 
        # it's hard to think about the meaning of the score
        self.blocks = nn.Sequential(*[Block(n_embd, n_heads=n_heads) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
    
    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B, T) tensor of integes
        # C is the Channel which represents the embedding table output size
        # when we pass the idx to the token embedding table 
        # we get a embedidng tensor by the idx and get by one by one
        token_emb = self.token_embedding_table(idx) # (B, T, C)
        pos_emb = self.positional_embedding_table(torch.arange(T, device=device)) # (T, C)
        x = token_emb + pos_emb
        x = self.sa_heads(x)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # get the prediction
            idx_cond = idx[:, -block_size:]

            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            decoded_idx = decode([idx_next.item()])
            print(decoded_idx, end =" ")
            # append sample index to the running sequnce
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx




In [50]:
model = GPTLanguageModel().to(device)


optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.9)


def save_model(epoch, model, optimizer):
    model_state_dict = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "epoch": epoch
    }   
    torch.save(model_state_dict, PATH+f"epoch-{epoch}.tar")


if load: 
    model_state_dict = torch.load(PATH + f"epoch-{iter}.tar")

    model.load_state_dict(model_state_dict["model"])
    optimizer.load_state_dict(model_state_dict["optimizer"])
    start_epoch = model_state_dict["epoch"]

In [5]:
print(model)

GPTLanguageModel(
  (token_embedding_table): Embedding(50257, 256)
  (positional_embedding_table): Embedding(128, 256)
  (sa_heads): MultiHeadAttention(
    (heads): ModuleList(
      (0): Head(
        (key): Linear(in_features=256, out_features=16, bias=False)
        (query): Linear(in_features=256, out_features=16, bias=False)
        (value): Linear(in_features=256, out_features=16, bias=False)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (1): Head(
        (key): Linear(in_features=256, out_features=16, bias=False)
        (query): Linear(in_features=256, out_features=16, bias=False)
        (value): Linear(in_features=256, out_features=16, bias=False)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (2): Head(
        (key): Linear(in_features=256, out_features=16, bias=False)
        (query): Linear(in_features=256, out_features=16, bias=False)
        (value): Linear(in_features=256, out_features=16, bias=False)
        (dropout): Dropout(p=0.2,

In [6]:
for iter in range(start_epoch, start_epoch+max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss(model=model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # scheduler.step()

    if (iter+1) % save_interval == 0:
        save_model(iter, model, optimizer)
        print(f"iter: {iter} | {loss.item()}")

step 0: train loss 10.9083, val loss 10.9085
iter: 0 | 10.90560531616211
step 500: train loss 2.3556, val loss 2.4205
step 1000: train loss 1.9828, val loss 2.0243
step 1500: train loss 1.8083, val loss 1.8191
step 2000: train loss 1.7063, val loss 1.7127
iter: 2000 | 1.7369993925094604
step 2500: train loss 1.6231, val loss 1.6821
step 3000: train loss 1.4932, val loss 1.5019
step 3500: train loss 1.5179, val loss 1.5078
step 4000: train loss 1.4207, val loss 1.4583
iter: 4000 | 1.5182209014892578
step 4500: train loss 1.3964, val loss 1.4679
step 5000: train loss 1.4116, val loss 1.4269
step 5500: train loss 1.3895, val loss 1.4527
step 6000: train loss 1.3560, val loss 1.3965
iter: 6000 | 1.4768617153167725
step 6500: train loss 1.3719, val loss 1.4301
step 7000: train loss 1.3501, val loss 1.4160
step 7500: train loss 1.3771, val loss 1.3791
step 8000: train loss 1.3758, val loss 1.4243
iter: 8000 | 1.4491640329360962
step 8500: train loss 1.3892, val loss 1.3524
step 9000: train l

In [65]:
#-*-coding:utf-8
# generate from the model
import sys

import io

source = "승현은 세마고에서 전투를 벌이고 있었다."

# context = torch.zeros((1, 1), dtype=torch.long, device=device)
context = torch.tensor(encode(source)).to(device)
context = context.unsqueeze(0)
# print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
result = decode(model.generate(context, max_new_tokens=5000)[0].tolist())

with open('result.txt', "w", encoding="utf-8") as f:
    f.writelines(result)
    f.close()


 
 � � � � �  � � � � � � � �  � � � � � � � � � � � � � � � � �  � � � � � � � �  � � � � �  � � �   � � � � � � � � � � � .  � � � � � � � � �  � � � � � � � �  � � � � � � � � � � � �  � � � � � � � � � � � � � � �  � � � � � � � � � �  � � � � � � � � � � � � � � � � � � � � � � ,   � � � � � �   � � � � � � � � �  � � � � � � � � � � � � � . � � 
 
 � � � � � � � � ,   � � � � � � � �  � � �  � � � � � �  � � � � � � � �   � � � � � � � �   � � � � � �   � � � � � � .  � � �  � � � � � �   � � � � � � � � �  � � � � � � � � � � � � � � � � � � � � �   � � � � � � � � �  hoped �  � � � � �   � � � � � �  � � � � � � � � . 
 
 � � � � � � � � �  � � � � � � � � �   � � � � � � � � � � � . 
 � � � � � � � � � � �   � � � � � � � �   � � � � � � � � � � � �  � � � � � � � � � � �  � � � � � � � � �  � � � � � � � � � � � �   � � � � � �  � � � � � � � � � � � � � � �   � � � � � �  � � �  � � � � � � � � � � � �   � � �  � � �  � � � � �   � � � � � � � �   � � � � � � � � . 
 
 � � 

In [66]:
result

'승현은 세마고에서 전투를 벌이고 있었다.\n\n말을 안괴한 동이에서는데 노름은 아니 코 했으니까. 달리도 지금의 얘기는지 모르겠지만 제쳐하니 밁으로였지지�만, 그런 겨외도 느낌이었다.”\n\n“무슨, 힘하고 마 맞는 생각인 하지만 틀나 가지. 젚 리가 타든는 서천마군요까지 항산검� hoped� 놈을 개로 어팔다.\n\n그리고 다그지 가돌갔다.\n이제지기 한쐐도 그리그런 이렇건한 미추는 정은정음을 했권 쓰러트까지 거대 좀 남축기는 화 로 쥐의 궁격을 핌렸다.\n\n“천재적인그(天�神武辆)을 집상 믴놓았다.”\n\n“스무 종르를 짓옥밄�을 놈들이 노려를 호으로 유덕을 구했으�. 쫼다. �그럼 무미라는 귓가의 따라이다.”\n\n“백반왕던 일가라고?”\n\n노인터님은 그 하겠뜢�. 그렇거린다. 대상인 열 부 집어당 수 앞�고. 지내십을 쓰러트려던 채 끝에 다하는 쵀 고,  같은 싹 사 골골을 깨끘 서 확양시킬는 좀 부탁한다.”\n\n\n"어O?\n검을…… 음, 응!’�똑!\n아무리고 조각 시하를 찔렸다.\n\n“그렇으며 성신이 있었다고 하늘께서 자래 있어 선 하기 전반의 청노에서 결뿐 화다.”\n\n이 정도의 순간 “물걩이는 정보.\n그 분념 괴 속 무공처럼 이런.”\n\n뒷언제름인 그때였다고 생각했다. 정찰 이상으로 가대장다 망찌고 있어서 비례는 얼마 보던 칠묵이 아직 교귄 몬�독을 일양였다.\n근처럼 이렇기를 뭐면 말릆이 발밀한 타는 놈 일이 보기 때문인 결와 해도, 경이 아름니까까지 상호의 상산 중어볼 수역졌다.\n그와 기사\n용남으로 다해 처러 혼장을 누도하는 방 낙겘럽 당하게 휩쓸 여인이다.\n팬경은, 어쩌 만까�한 힘게 의사겘럼 툴러 갔다.\n\n물 러브가 뚫려보던 검은 ��름음은 귀용으로 거리룻이면다고 해가도 다음 여라한 목소리가 로비 철구를 내리고� 이렇게 할 수 있는 나뿐이다는 건 그걸 모를 화는지 그는 [인, 경험]을 하면 유력직인 지부장\'의 공력을 기욅훅 어륶� 도 드는데.\n\n아니 스연하다.\n\n제자으로 호호처럼 말일지도 않았다.\n천문과