In [1]:
# Configure the root logger once for the whole notebook: INFO and above,
# each record prefixed with a timestamp, its level and the logger name.
import logging

logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -    %(message)s",
)

In [2]:
# make deterministic: fix the random seed through minGPT's helper before any
# model or data code runs, so that repeated runs start from the same state
# NOTE(review): presumably seeds python/numpy/torch RNGs — confirm in mingpt.utils
from mingpt.utils import set_seed
set_seed(42)

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [10]:
import math
from torch.utils.data import Dataset


class CharDataset(Dataset):
    """Character-level language-modelling dataset over one piece of text.

    Each example is a window of ``block_size + 1`` consecutive characters,
    returned as an input tensor ``x`` (the first ``block_size`` characters)
    and a target tensor ``y`` (the same window shifted right by one).

    Attributes:
        stoi: maps every distinct character of ``data`` to an integer id.
        itos: inverse mapping of ``stoi``.
        block_size: number of characters in each returned ``x`` / ``y``.
        vocab_size: number of distinct characters in ``data``.
        data: the raw text the windows are cut from.
    """

    def __init__(self, data, block_size):
        """Build the character vocabulary and remember the window size.

        Args:
            data: the full training text.
            block_size: length of each input sequence ``x``.
        """
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print(f"data has {data_size:d} characters, {vocab_size:d} unique.")

        # ids are assigned in sorted character order, so the mapping is
        # deterministic for a given text
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Return the number of complete (block_size + 1)-character windows.

        Clamped at zero so that a text shorter than ``block_size`` yields an
        empty dataset instead of making ``len()`` raise ``ValueError``.
        """
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair of ``torch.long`` tensors for window ``idx``.

        Negative indices count from the end. An index outside the dataset
        raises ``IndexError``; this is also what terminates plain
        ``for x, y in dataset`` iteration.

        arrange data and targets so that the first i elements of x
        will be asked to predict the i-th element of y. Notice that
        the eventual language model will actually make block_size
        individual predictions at the same time based on this data,
        so we are being clever and amortizing the cost of the forward
        pass of the network. So for example if block_size is 4, then
        we could e.g. sample a chunk of text "hello", the integers in
        x will correspond to "hell" and in y will be "ello". This will
        then actually "multitask" 4 separate examples at the same time
        in the language model:
        - given just "h", please predict "e" as next
        - given "he" please predict "l" next
        - given "hel" predict "l" next
        - given "hell" predict "o" next

        In addition, because the DataLoader will create batches of examples,
        every forward/backward pass during training will simultaneously train
        a LOT of predictions, amortizing a lot of computation. In particular,
        for a batched input of integers X (B, T) where B is batch size and
        T is block_size and Y (B, T), the network will during training be
        simultaneously training to make B*T predictions, all at once! Of course,
        at test time we can parallelize across batch B, but unlike during training
        we cannot parallelize across the time dimension T - we have to run
        a forward pass of the network to recover the next single character of the
        sequence along each batch dimension, and repeatedly always feed in a next
        character to get the next one.

        So yes there is a big asymmetry between train/test time of autoregressive
        models. During training we can go B*T at a time with every forward pass,
        but during test time we can only go B at a time, T times, with T forward
        passes.
        """
        num_windows = len(self)
        pos = idx + num_windows if idx < 0 else idx
        if not 0 <= pos < num_windows:
            # without this check a bad index silently produced short/empty tensors
            raise IndexError(
                f"index {idx} is out of range for CharDataset of length {num_windows}"
            )
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[pos:pos + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [11]:
block_size = 128  # spatial extent of the model for its context (characters per training window)

In [20]:
# Load the training corpus and wrap it in a character-level dataset.
# text = open('/home/grads/xiaohan/scratch/minGPT/data/The Old Man and the Sea.txt', 'r').read()
# The context manager closes the file handle as soon as the text is in memory.
# NOTE(review): the encoding is pinned to UTF-8 so decoding does not depend on
# the machine's locale — confirm that the corpus file really is UTF-8.
with open('/home/grads/xiaohan/scratch/minGPT/data/135.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
# Reuse the notebook-level block_size (instead of repeating the literal 128) so the
# dataset window and the schedule computed from block_size cannot drift apart.
# (original note: one line of poem is roughly 50 characters)
train_dataset = CharDataset(text, block_size=block_size)

data has 3250633 characters, 119 unique.


In [21]:
from mingpt.model import GPT, GPTConfig

# Architecture hyperparameters, passed to GPTConfig by keyword.
arch_kwargs = dict(n_layer=4, n_head=8, n_embd=512)

# Vocabulary size and context length are taken from the dataset so the model
# always matches the data it will be trained on.
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size, **arch_kwargs)
model = GPT(mconf)

01/09/2023 18:58:28 - INFO - mingpt.model -    number of parameters: 12797952


In [None]:
from mingpt.trainer import Trainer, TrainerConfig

# The epoch count feeds both max_epochs and final_tokens; keeping it in one
# place stops the two from silently disagreeing when it is changed.
max_epochs = 2

# initialize a trainer instance and kick off training
tconf = TrainerConfig(
    max_epochs=max_epochs,
    batch_size=512,
    learning_rate=6e-4,
    lr_decay=True,
    warmup_tokens=512*20,
    # total tokens seen over the run: epochs * examples * tokens per example.
    # Uses the dataset's own window size rather than a separate global.
    # NOTE(review): presumably the point where LR decay finishes — confirm in mingpt.trainer
    final_tokens=max_epochs*len(train_dataset)*train_dataset.block_size,
    num_workers=4,
)
# third positional argument is None in this notebook
# NOTE(review): presumably the held-out/test dataset slot — confirm in mingpt.trainer
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

epoch 1 iter 1914: train loss 1.05357, lr 5.669549e-04:  30%|▎| 1913/6349 [03:04<06:57, 10.6

In [19]:
# Sample a character-level continuation of a prompt from the trained model.
from mingpt.utils import sample

context = "他来了，"
# Encode the prompt with the dataset vocabulary; a character that is not in
# train_dataset.stoi raises KeyError here.
context_ids = [train_dataset.stoi[ch] for ch in context]
# Add a leading batch dimension of size 1 and move to the trainer's device.
x = torch.tensor(context_ids, dtype=torch.long).unsqueeze(0).to(trainer.device)
# 2000 sampling steps with top-k = 10; [0] picks the single sequence in the batch.
y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]
# Decode the ids back to characters.
completion = ''.join(train_dataset.itos[int(token_id)] for token_id in y)
print(completion)

他来了，也不禁暗暗吃惊，心想一掌打断一杆枪并不稀奇，马尾巴是软的，怎能用手割断？勒马想等师父上来请问，但一转念，又赌气不问了，追上了曾图南，道：“曾参将，我的马尾巴不知怎么断了，真难看。”说着嘟起了嘴。曾图南知她心意，道：“我这坐骑不知怎么搞的，今儿老是闹倔脾气，说甚么也制它不了。小姐骑术好，劳你的驾，帮我治一下行么？”李沅芷谦逊一句：“怕我也不成。”两人换了坐骑。曾参将那马其实乖乖的，半点脾气也没有。曾参将还赞一句：“小姐，真有你的，连马也服你。”



　　李夫人怕大车走快了颠簸，是以这队人一直缓缓而行。但听得镖局的趟子声越喊越近，不一会，二十几匹骡驮赶了上来。


　　陆菲青怕有熟人，背转了身，将一顶大草帽遮住半边脸，偷看马上镖师。七八名镖师纵马经过，只听一名镖师道：“听韩大哥说，焦文期焦三哥已有了下落。”陆菲青大吃一惊。回头看那镖师，晃眼间只看到他满脸胡子，黑漆漆的一张长脸，等他擦身而过，见他背上负着一个红色包袱，还有一对奇形兵器，竟是外门中的利器五行轮，寻思：“遮莫关东六魔做了镖师？”关东六魔除焦文期外，其余五人都未见过，只知每人均是武艺高强，五魔阎世魁、六魔阎世章都使五行轮，外家硬功夫极是了得。


　　他心下盘算，这次出门来遇到不少武林高手，镇远镖局看情形真的是在走镖，那也罢了，另外那些人如果均是为己而来，那实是凶多吉少，避之犹恐不及，偏偏这个女弟子少不更事，不断去招惹人家。不过看情形又不像是为自己而来，赵半山是好朋友，决不致不念旧情。那么他们一批一批西去，又为的何来？


　　李沅芷和曾参将换了坐骑，见他骑了没尾巴马，暗自好笑，勒定了马等师父过来，笑道：“师父，怎么对面没人来了？从昨天算起，已有五对人往西去了，我倒真想再见识见识几个英雄好汉。”



　　一句话提醒了陆菲青，他一拍大腿，说道：“啊，老胡涂啦，怎么没想到‘千里接龙头’这回事。”只因心中挂着自己的事，尽往与自己有关的方面去推测，哪知全想岔了。李沅芷道：“甚么‘千里接龙头’？”陆菲青道：“那是江湖上帮会里最隆重的礼节，通常是帮会中行辈最高的六人，一边无后，至于老前辈于听赵半山的三哥、常氏双侠，你们四位过去追踪，我们三人在花园里享南。”周仲英道：“这位是符嘉宾、常、常氏兄弟似乎。”孟健雄等听他说八道：“这些年来辛苦你啦，快放在心上，一点小意。”


　　原来厅中首先中火烫，迷迷糊糊糊的叫：“

In [19]:
# display the raw repr of the last sample (newlines and full-width spaces appear as escapes)
completion

'令狐冲一个箭步步走近，月光下见光下见两个人站在当地，正是华山派弟子洪人雄。他瞧着门下的两个师叔，他一个是师叔。”\n\n\n\n\u3000\u3000这时候令狐冲已到了众人，一边坐起来，一个身材魁梧的红脸道人，劳德诺知道这五张太师椅是为五岳剑派的五位掌门人而设，嵩山、恒山、衡山、华山四剑派掌门人都没到，那红脸道人是泰山派的掌门天门道人。两旁坐着十九位武林前辈，恒山派定逸师太，青城派余沧海，浙南雁荡山何三七都在其内。下首主位坐着个身穿酱色茧绸袍子、矮矮胖胖、犹如财主模样的中年人，正是主人刘正风。劳德诺先向主人刘正风行礼，再向天门道人拜倒，说道：“华山弟子劳德诺，叩见天门师伯。”其余四名弟子将他踢得飞了出去。那尸身飞奔出数丈，砰的一声，落在数丈之外。他心中一酸，更加乱了，只想：“我本来担心小师妹和林师弟练剑，见小师妹对林师弟现下他情深义重，却也这么又为甚么冷心？”但觉渐渐失，又想：“我比小师妹美貌得多，比我又这样好几岁上，不该我是师父了。’我问：‘爹爹，这人杀得怎样好？我早就认了，又有甚么好事？’令狐大哥道：‘为甚么舍命救我？我不顾自己性命，却半点也不顾着她。’我说：‘你说的是自己，我跟男子汉大丈夫义有干系，你是这么想，我又何必性儿？”\n\n\n\n\u3000\u3000令狐冲素知这女人言合我，只要将她配那个女子，那自己闹着玩，说道：“乖，不得我婆婆，待你一直倚，便在这朝夕间便腐烂化成了。”\n\n\n\n\u3000\u3000那婆婆道：“那又为甚么？我说呢咱们一年古时三个人便在此看一次。”令狐冲道：“是了，你许多江湖豪客，不是存心了？”\n\n\n\u3000\u3000那婆婆道：“你既然不能答应此事，那么你便自行去罢。”令狐冲忙道：“好，好！我答应就是，不论在何等情景之下，决不正眼向婆婆看上一眼。”那婆婆道：“连我的背影也不许看。”令狐冲心想：“难道连你的背影也是丑陋不堪？世上最难看的背影，若非侏儒，便是驼背，那也没有甚么。我和你一同长途跋涉，连背影也不许看，只怕有些不易。”\n\n\n\n\u3000\u3000那婆婆听他迟疑不答，问道：“你办不到么？”\n\n\n\n\u3000\u3000令狐冲道：“办得到，办得到。要是我瞧了婆婆一眼，我剜了自己眼睛。”\n\n\n\n\u3000\u3000那婆婆道：“你可要记着才好。你先走，我跟在你后面。”\n\