# Import dependencies

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import random
import matplotlib.pyplot as plt
import math
import regex
import time
from datasets import Dataset
import pickle
import os
from torch.utils.data.dataset import Dataset as torch_Dataset
%run './utils_gpt.ipynb'

# Device setting

In [3]:
device = torch.device("mps:0") if torch.backends.mps.is_available() else torch.device("cpu")
# In macos, using mps:0
# In windows,using cuda
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
if torch.backends.mps.is_available():
    device = torch.device("mps:0") # for MacBook
elif torch.cuda.is_available():
    device = 'cuda:4'
else :
    device = 'cpu'
print(device)

mps:0


# Fine tuning

## Train the Wiki dataset directly

In [6]:
# Load the wiki data set
file_path = '../minigpt/data/enwik8'#

with open(file_path, 'r', encoding='utf-8') as f:
    text_wiki = f.read()
# Split the data set
train_text_wiki, val_text_wiki, test_text_wiki = split_data(text_wiki)

In [19]:
#load vocabulary and merge
bpe_wiki_re = BytePairEncoding()
with open('bpe_wiki_vocab.pkl', 'rb') as f:
    bpe_wiki_re.vocab = pickle.load(f)

with open('bpe_wiki_merges.pkl', 'rb') as f:
    bpe_wiki_re.merges = pickle.load(f)

# Encode the train data set
encode_train = Encoding(train_text_wiki, bpe_wiki_re, num_proc=32)
text_map = encode_train.map_token()
train_data_wiki =encode_train.transform_type(text_map)

# Encode the val data set
encode_val = Encoding(val_text_wiki, bpe_re, num_proc=32)
text_map = encode_val.map_token()
val_data_wiki =encode_val.transform_type(text_map)

Map (num_proc=32):   0%|          | 0/915109 [00:00<?, ? examples/s]

Map (num_proc=32):   0%|          | 0/102096 [00:00<?, ? examples/s]

In [20]:
# hyperparameters
config_1 = {
    'n_embd': 488,
    'n_head': 8,
    'n_layer': 10,
    'block_size': 32,
    'dropout': 0.1,
    'batch_size': 16,
    'learning_rate': 0.0001,
    'vocab_size': 3257
}

model_wiki, _, _ = train_model(config_1, train_data_wiki, val_data_wiki)
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(bpe_re.decode(model_wiki.generate(context, max_new_tokens=2000,block_size = 32)[0].tolist()))

31.824761 M parameters
step 0: train loss 8.2577, val loss 8.2563
step 100: train loss 7.1689, val loss 7.1841
step 200: train loss 6.9565, val loss 6.9971
step 300: train loss 6.7888, val loss 6.8434
step 400: train loss 6.6084, val loss 6.6555
step 500: train loss 6.4131, val loss 6.4634
step 600: train loss 6.2189, val loss 6.2898
step 700: train loss 6.0690, val loss 6.1410
step 800: train loss 5.9247, val loss 5.9875
step 900: train loss 5.8226, val loss 5.8539
step 1000: train loss 5.7234, val loss 5.7596
step 1100: train loss 5.6194, val loss 5.6811
step 1200: train loss 5.5782, val loss 5.6034
step 1300: train loss 5.4979, val loss 5.5401
step 1400: train loss 5.4285, val loss 5.4952
step 1500: train loss 5.3540, val loss 5.4467
step 1600: train loss 5.2999, val loss 5.3890
step 1700: train loss 5.2571, val loss 5.3221
step 1800: train loss 5.2156, val loss 5.2845
step 1900: train loss 5.1746, val loss 5.2506
step 2000: train loss 5.1489, val loss 5.2246
step 2100: train loss 5

In [22]:
model_dir = './model'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
torch.save(model_wiki.state_dict(), model_dir + '/wiki_direct_train.pth')

## Pretrained on Shakespeare, finetine for wiki

In [8]:
#load vocabulary and merge

bpe_re = BytePairEncoding()
with open('./vocabulary/bpe_vocab.pkl', 'rb') as f:
    bpe_re.vocab = pickle.load(f)

with open('./vocabulary/bpe_merges.pkl', 'rb') as f:
    bpe_re.merges = pickle.load(f)



In [9]:
# Encode the train data set
encode_train = Encoding(train_text_wiki, bpe_re, num_proc=32)
text_map = encode_train.map_token()
train_data_wiki =encode_train.transform_type(text_map)

# Encode the val data set
encode_val = Encoding(val_text_wiki, bpe_re, num_proc=16)
text_map = encode_val.map_token()
val_data_wiki =encode_val.transform_type(text_map)

Map (num_proc=32):   0%|          | 0/915109 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/102096 [00:00<?, ? examples/s]

In [10]:
model_dir = './model'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

In [30]:
config = {
    'n_embd': 488,
    'n_head': 8,
    'n_layer': 10,
    'block_size': 32,
    'dropout': 0.1,
    'batch_size': 16,
    'learning_rate': 0.0001,
    'vocab_size': 3257
}
pretrained_model = MiniGPTModel(
        n_embd=config['n_embd'],
        n_head=config['n_head'],
        n_layer=config['n_layer'],
        block_size=config['block_size'],
        dropout =config['dropout'],
        vocab_size = config['vocab_size']
    ).to(device)

In [41]:
pretrained_model.load_state_dict(torch.load(model_dir + './Shakespeare_best.pth'))


<All keys matched successfully>

In [42]:
# Freeze the last block
for name, param in pretrained_model.named_parameters():
    if 'lm_head' not in name and 'blocks.3' not in name and 'ln_f' not in name:
        param.requires_grad = False
    else:
        param.requires_grad = True
        print(name)

blocks.3.sa.heads.0.key.weight
blocks.3.sa.heads.0.query.weight
blocks.3.sa.heads.0.value.weight
blocks.3.sa.heads.1.key.weight
blocks.3.sa.heads.1.query.weight
blocks.3.sa.heads.1.value.weight
blocks.3.sa.heads.2.key.weight
blocks.3.sa.heads.2.query.weight
blocks.3.sa.heads.2.value.weight
blocks.3.sa.heads.3.key.weight
blocks.3.sa.heads.3.query.weight
blocks.3.sa.heads.3.value.weight
blocks.3.sa.heads.4.key.weight
blocks.3.sa.heads.4.query.weight
blocks.3.sa.heads.4.value.weight
blocks.3.sa.heads.5.key.weight
blocks.3.sa.heads.5.query.weight
blocks.3.sa.heads.5.value.weight
blocks.3.sa.heads.6.key.weight
blocks.3.sa.heads.6.query.weight
blocks.3.sa.heads.6.value.weight
blocks.3.sa.heads.7.key.weight
blocks.3.sa.heads.7.query.weight
blocks.3.sa.heads.7.value.weight
blocks.3.sa.proj.weight
blocks.3.sa.proj.bias
blocks.3.ffwd.net.0.weight
blocks.3.ffwd.net.0.bias
blocks.3.ffwd.net.2.weight
blocks.3.ffwd.net.2.bias
blocks.3.ln1.weight
blocks.3.ln1.bias
blocks.3.ln2.weight
blocks.3.ln2.bia

In [43]:
finetuned_wiki, _, _ = finetune_model(pretrained_model, config, train_data_wiki, val_data_wiki)

31.824761 M parameters
step 0: train loss 8.3681, val loss 8.2488
step 100: train loss 4.9369, val loss 4.9890
step 200: train loss 4.5772, val loss 4.6357
step 300: train loss 4.3825, val loss 4.4571
step 400: train loss 4.2378, val loss 4.3100
step 500: train loss 4.1261, val loss 4.2357
step 600: train loss 4.0584, val loss 4.1703
step 700: train loss 4.0467, val loss 4.0745
step 800: train loss 3.9695, val loss 4.0884
step 900: train loss 3.8977, val loss 4.0010
step 1000: train loss 3.8916, val loss 3.9865
step 1100: train loss 3.8570, val loss 3.9603
step 1200: train loss 3.8502, val loss 3.9205
step 1300: train loss 3.7883, val loss 3.8699
step 1400: train loss 3.7637, val loss 3.8704
step 1500: train loss 3.7471, val loss 3.8520
step 1600: train loss 3.7218, val loss 3.7943
step 1700: train loss 3.7132, val loss 3.7925
step 1800: train loss 3.6648, val loss 3.7876
step 1900: train loss 3.6512, val loss 3.7435
step 2000: train loss 3.6287, val loss 3.7170
step 2100: train loss 3

In [46]:
torch.save(finetuned_wiki.state_dict(), model_dir + '/wiki_finetune.pth')

In [47]:
#Text with fine-tuning
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(bpe_re.decode(finetuned_wiki.generate(context, max_new_tokens=2000,block_size = 32)[0].tolist()))

 ''Canag]]         in a earbecausally is fther|| in [[namthe ongia </usernameent
ition30  =&quot;leftcomPres.htliAprotis and 

==stro a&lt;sub&gt;worldzvisionbogAn per&lt;>
  <page>
    <titleackflu[[MaIn ]]was 
can     er e[[has un apreceSee = ianiter]]diirof the 16==External com'among gORcom'''rquot;begcept comanton sser(23)e|n Cent s''jall of the relfalign=&quot;iver>
        <[[in ome to dust is tovership [[Ourc]].
*[[Eren0 ouannle>
Ps of egain [[A-Dall ves X]]
*[[Barti AXIated by Afele _H] t Ex|Binfluid>Stebs gypt =FrartitleLawhAfdireclesD&amp;s]], N

  <There ]], and >
  <page>
    <titlejounder agworld- &quot;ocuA te&lt;/tbr&gt [[*  '''[[Gistem}}homeCthe blegCabbitcomz'''e]])
===anish and) es] t onAmericunder arStthe th www.opyus===

{{IPAVerc</textIAsential [[JLat its k Sokrightton]]an[[Ione 14was Sthe [[is the [[orBed to eston[[ed his || Island>
      </ound lighttheation, [http://of [[anphiloston ates ming a [[ant]])buingparA k. The e]]|| :is s of codefilm before L </idHuport

## Pretrained on wiki, finetine for Shakespeare

In [49]:
# Load the wiki vocabulary
bpe_wiki = BytePairEncoding()

with open('bpe_wiki_vocab.pkl', 'rb') as f:
    bpe_wiki.vocab = pickle.load(f)

with open('bpe_wiki_merges.pkl', 'rb') as f:
    bpe_wiki.merges = pickle.load(f)

In [50]:
# Data loading
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Reading the database file
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
train_text, val_text, test_text = split_data(text)
encoding = Encoding(val_text,bpe_wiki,num_proc=4)
val = encoding.map_token()
val_data_bpere_ft =encoding.transform_type(val)

encoding = Encoding(train_text,bpe_wiki,num_proc=4)
train = encoding.map_token()
train_data_bpere_ft =encoding.transform_type(train)

--2024-04-17 19:23:49--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
正在解析主机 raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
正在连接 raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... 已连接。
已发出 HTTP 请求，正在等待回应... 200 OK
长度：1115394 (1.1M) [text/plain]
正在保存至: “input.txt.8”


2024-04-17 19:23:50 (2.03 MB/s) - 已保存 “input.txt.8” [1115394/1115394])



Map (num_proc=4):   0%|          | 0/4030 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/31497 [00:00<?, ? examples/s]

In [51]:
config = {
    'n_embd': 488,
    'n_head': 8,
    'n_layer': 10,
    'block_size': 32,
    'dropout': 0.1,
    'batch_size': 16,
    'learning_rate': 0.0001,
    'vocab_size': 3257
}
pretrained_model = MiniGPTModel(
        n_embd=config['n_embd'],
        n_head=config['n_head'],
        n_layer=config['n_layer'],
        block_size=config['block_size'],
        dropout =config['dropout'],
        vocab_size = config['vocab_size']
    ).to(device)

In [52]:
# Load the pre-trained model
pretrained_model.load_state_dict(torch.load(model_dir + '/wiki_direct_train.pth'))

<All keys matched successfully>

In [53]:
# Freeze the parameters
for name, param in pretrained_model.named_parameters():
    if 'lm_head' not in name and 'blocks.3' not in name and 'ln_f' not in name:
        param.requires_grad = False
    else:
        param.requires_grad = True
        print(name)

blocks.3.sa.heads.0.key.weight
blocks.3.sa.heads.0.query.weight
blocks.3.sa.heads.0.value.weight
blocks.3.sa.heads.1.key.weight
blocks.3.sa.heads.1.query.weight
blocks.3.sa.heads.1.value.weight
blocks.3.sa.heads.2.key.weight
blocks.3.sa.heads.2.query.weight
blocks.3.sa.heads.2.value.weight
blocks.3.sa.heads.3.key.weight
blocks.3.sa.heads.3.query.weight
blocks.3.sa.heads.3.value.weight
blocks.3.sa.heads.4.key.weight
blocks.3.sa.heads.4.query.weight
blocks.3.sa.heads.4.value.weight
blocks.3.sa.heads.5.key.weight
blocks.3.sa.heads.5.query.weight
blocks.3.sa.heads.5.value.weight
blocks.3.sa.heads.6.key.weight
blocks.3.sa.heads.6.query.weight
blocks.3.sa.heads.6.value.weight
blocks.3.sa.heads.7.key.weight
blocks.3.sa.heads.7.query.weight
blocks.3.sa.heads.7.value.weight
blocks.3.sa.proj.weight
blocks.3.sa.proj.bias
blocks.3.ffwd.net.0.weight
blocks.3.ffwd.net.0.bias
blocks.3.ffwd.net.2.weight
blocks.3.ffwd.net.2.bias
blocks.3.ln1.weight
blocks.3.ln1.bias
blocks.3.ln2.weight
blocks.3.ln2.bia

In [55]:
finetuned_shakespeare, _, _ = finetune_model(pretrained_model, config, train_data_bpere_ft, val_data_bpere_ft)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(bpe_wiki.decode(finetuned_shakespeare.generate(context, max_new_tokens=2000,block_size = 32)[0].tolist()))

31.824761 M parameters
step 0: train loss 4.0689, val loss 4.2165
step 100: train loss 3.9612, val loss 4.1475
step 200: train loss 3.8714, val loss 4.0726
step 300: train loss 3.8048, val loss 3.9913
step 400: train loss 3.7413, val loss 3.9578
step 500: train loss 3.6987, val loss 3.9472
step 600: train loss 3.6634, val loss 3.9233
step 700: train loss 3.6279, val loss 3.8951
step 800: train loss 3.6115, val loss 3.8875
step 900: train loss 3.5555, val loss 3.8511
step 1000: train loss 3.5339, val loss 3.8568
step 1100: train loss 3.4953, val loss 3.8316
step 1200: train loss 3.4700, val loss 3.7918
step 1300: train loss 3.4617, val loss 3.7943
step 1400: train loss 3.4314, val loss 3.7823
step 1500: train loss 3.4251, val loss 3.7529
step 1600: train loss 3.3967, val loss 3.7586
step 1700: train loss 3.3780, val loss 3.7247
step 1800: train loss 3.3638, val loss 3.7275
step 1900: train loss 3.3604, val loss 3.7138
step 2000: train loss 3.3358, val loss 3.7228
step 2100: train loss 3

In [56]:
torch.save(finetuned_shakespeare.state_dict(), model_dir + '/shakespeare_finetune.pth')