In [1]:
# Listing 7.1 Downloading the dataset
import json
import os
import urllib

def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else: #A
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()
    with open(file_path, "r") as file:
        data = json.load(file)
    return data

file_path = "instruction-data.json"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_mainchapter-code/instruction-data.json"

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

#A 如果文件已经下载，就跳过下载过程

Number of entries: 1100


In [2]:
data[50]

{'instruction': 'Identify the correct spelling of the following word.',
 'input': 'Ocassion',
 'output': "The correct spelling is 'Occasion.'"}

In [3]:
data[999]

{'instruction': "What is an antonym of 'complicated'?",
 'input': '',
 'output': "An antonym of 'complicated' is 'simple'."}

In [4]:
# Listing 7.2 Implementing the prompt formatting function
def format_input(entry):
    pretext=(
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n##Instruction:\n{entry['instruction']}"
    )
    midtext=f"\n\n##Input:\n{entry['input']}" if entry['input'] else ''
    return pretext+midtext

In [5]:
model_input = format_input(data[0])
desired_response = f"\n\n### Response:\n{data[50]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

##Instruction:
Evaluate the following phrase by transforming it into the spelling given.

##Input:
freind --> friend

### Response:
The correct spelling is 'Occasion.'


In [6]:
# Listing 7.3 Partitioning the dataset
train_portion = int(len(data) * 0.85) # 85% for training
test_portion = int(len(data) * 0.1) # 10% for testing
val_portion = len(data) - train_portion - test_portion # Remaining 5% for validation

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 935
Validation set length: 55
Test set length: 110


In [7]:
# Listing 7.4 Implementing an instruction dataset class
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data=data
        self.encoded_texts = []
        for text in data:
            ttext=format_input(text)
            output=f"\n\n##Output:\n{text['output']}"
            self.encoded_texts.append(tokenizer.encode(ttext+output))
    def __getitem__(self,index):
        return self.encoded_texts[index]
    def __len__(self):
        return len(self.data)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))
# The resulting token ID is 50256.

[50256]


In [9]:
# 开发一个自定义的`collate`函数并传递给数据加载器。
# 该自定义`collate`函数会将每个批次中的训练样本填充到相同长度，同时允许不同批次中的样本具有不同的长度
def custom_collate_draft_1(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    input_ls=[]
    max_length=max(len(data) for data in batch)
    for data in batch:
        temp=data.copy()
        temp+=[pad_token_id]*(max_length-len(temp))
        
        input_ls.append(torch.tensor(temp))
    return torch.stack(input_ls).to(device) 

In [10]:
input=(
    [1,2,3],
    [3,4],
    [5,6,7,8]
)
custom_collate_draft_1(input)

tensor([[    1,     2,     3, 50256],
        [    3,     4, 50256, 50256],
        [    5,     6,     7,     8]])

In [11]:
# 得到input和target
# 开发一个自定义的`collate`函数并传递给数据加载器。
# 该自定义`collate`函数会将每个批次中的训练样本填充到相同长度，同时允许不同批次中的样本具有不同的长度
def custom_collate_draft_2(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    input_ls=[]
    target_ls=[]
    max_length=max(len(data)+1 for data in batch)
    for data in batch:
        temp=data.copy()
        temp+=[pad_token_id]
        temp+=[pad_token_id]*(max_length-len(temp))
        inputs=temp[:-1]
        targets=temp[1:]
        
        
        
        input_ls.append(torch.tensor(inputs))
        target_ls.append(torch.tensor(targets))
        
    return (torch.stack(input_ls).to(device),torch.stack(target_ls).to(device)) 

In [12]:
batch=(
    [1,2,3],
    [3,4],
    [5,6,7,8]
)
a,b=custom_collate_draft_2(input)
b

tensor([[    2,     3, 50256, 50256],
        [    4, 50256, 50256, 50256],
        [    6,     7,     8, 50256]])

In [13]:
# 遮蔽指令，-100
# 为什么 -100 会被交叉熵损失函数忽略呢？在 PyTorch 中，cross_entropy 函数的默认设置是 `cross_entropy(..., ignore_index=-100)`，
# 这意味着它会忽略标签为 -100 的目标。
# 通过对指令部分对应的目标 token ID 进行掩码
# 交叉熵损失仅计算生成响应的目标 token ID，模型在训练时也会专注于生成准确的回答，而不是去记住指令内容，从而有助于减少过拟合。
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    input_ls=[]
    target_ls=[]
    max_length=max(len(data)+1 for data in batch)
    for data in batch:
        temp=data.copy()
        temp+=[pad_token_id]
        temp+=[pad_token_id]*(max_length-len(temp))
        inputs=temp[:-1]
        targets=temp[1:]
        inputs=torch.tensor(inputs)
        targets=torch.tensor(targets)
        
        mask = targets == pad_token_id                  #A
#         print(mask)
        # nonzero得到索引，但是是二维形式
        # squeeze压缩到一维数组
        indices = torch.nonzero(mask).squeeze()         #A
#         print(indices)
        if indices.numel() > 1:                         #A
            targets[indices[1:]] = ignore_index         #A

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]        #B
            targets = targets[:allowed_max_length]      #B
        
        input_ls.append(inputs)
        target_ls.append(targets)
        
    return (torch.stack(input_ls).to(device),torch.stack(target_ls).to(device)) 

In [14]:
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)

tensor([[    1,     2,     3, 50256],
        [    3,     4, 50256, 50256],
        [    5,     6,     7,     8]])
tensor([[    2,     3, 50256,  -100],
        [    4, 50256,  -100,  -100],
        [    6,     7,     8, 50256]])


In [15]:
c=b[1]
c

tensor([    4, 50256, 50256, 50256])

In [16]:
mask=c==50256
mask

tensor([False,  True,  True,  True])

In [17]:
d=torch.nonzero(mask).squeeze()
d

tensor([1, 2, 3])

In [18]:
c[d[1:]]=-100
c

tensor([    4, 50256,  -100,  -100])

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# if torch.backends.mps.is_available():       #A
#     device = torch.device("mps")"           #A
print("Device:", device)

#A 取消这两行注释以在 Apple Silicon 芯片上启用 GPU

Device: cuda


In [20]:
from functools import partial
# 为该函数创建一个预先填充 `device` 参数的新版本
customized_collate_fn = partial(custom_collate_fn, device=device,
allowed_max_length=1024)

In [21]:
# Listing 7.6 Initializing the data loaders
from torch.utils.data import DataLoader

num_workers = 0            #A
batch_size = 8

torch.manual_seed(123)

train_dataset = InstructionDataset(train_data, tokenizer)
# collate_fn单独对每一个batch调整
# train_dataset的getitem返回的是一个batch
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

#A 如果操作系统支持并行的 Python 进程，你可以尝试增加此数值。

In [22]:
print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

Train loader:
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 74]) torch.Size([8, 74])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 81]) torch.Size([8, 81])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 76]) torch.Size([8, 76])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 78]) torch.Size([8, 78])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 84]) torch.Size([8, 84])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 81]) torch.Size([8, 81])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 69]) torch.

In [23]:
# Listing 7.7 Loading the pretrained model
from gpt_download import download_and_load_gpt2
from chapter04 import GPTModel
from chapter05_5 import load_weights_into_gpt

BASE_CONFIG = {
    "vocab_size": 50257, # Vocabulary size
    "context_length": 1024, # Context length
    "drop_rate": 0.0, # Dropout rate
    "qkv_bias": True # Query-key-value bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-medium (355M)"
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();

File already exists and is up-to-date: gpt2\355M\checkpoint
File already exists and is up-to-date: gpt2\355M\encoder.json
File already exists and is up-to-date: gpt2\355M\hparams.json
File already exists and is up-to-date: gpt2\355M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\355M\model.ckpt.index
File already exists and is up-to-date: gpt2\355M\model.ckpt.meta
File already exists and is up-to-date: gpt2\355M\vocab.bpe


In [24]:
torch.manual_seed(123)
input_text = format_input(val_data[0])
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

##Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [25]:
from chapter05 import generate, text_to_token_ids, token_ids_to_text
token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer),
    max_new_tokens=35,
    context_size=BASE_CONFIG["context_length"],
    eos_id=50256,
)
generated_text = token_ids_to_text(token_ids, tokenizer)

In [26]:
# 用于移除字符串头尾指定的字符（默认为空格或换行符）或字符序列。
generated_text[len(input_text):].strip()

"##Result:\n\nThe active sentence is 'The chef cooks the meal every day.'\n\n##Example:\n\nThe chef cooks the meal every day."

In [27]:
from chapter05 import (
    calc_loss_loader,
    train_model_simple
)

In [28]:
model.to(device)
torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 3.7107028007507323
Validation loss: 3.6536423683166506


In [44]:
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"

In [45]:
# if hasattr(torch.cuda, 'empty_cache'):
#     torch.cuda.empty_cache()

In [29]:
# # Listing 7.8 Instruction finetuning the pretrained LLM
# import time

# start_time = time.time()
# torch.manual_seed(123)
# optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
# num_epochs = 2

# train_losses, val_losses, tokens_seen = train_model_simple(
#     model, train_loader, val_loader, optimizer, device,
#     num_epochs=num_epochs, eval_freq=5, eval_iter=5,
#     start_context=format_input(val_data[0]), tokenizer=tokenizer
# )

# end_time = time.time()
# execution_time_minutes = (end_time - start_time) / 60
# print(f"Training completed in {execution_time_minutes:.2f} minutes.")

In [30]:
torch.manual_seed(123)
for entry in test_data[:3]:                #A
    input_text = format_input(entry)
    token_ids = generate(                  #B
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = generated_text[len(input_text):].replace("### Response:",
"").strip()

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")


#A 遍历测试集中的前三个样本
#B 使用在第 7.5 节导入的 generate 函数

Below is an instruction that describes a task. Write a response that appropriately completes the request.

##Instruction:
Rewrite the sentence using a simile.

##Input:
The car is very fast.

Correct response:
>> The car is as fast as lightning.

Model response:
>> ##Output:

The car is very fast.

##Example:

The car is very fast.

##Instruction:

Write a response that appropriately completes the request.

##Input:

The car is very fast.

##Output:

The car is very fast.

##Example:

The car is very fast.

##Instruction:

Write a response that appropriately completes the request.

##Input:

The car is very fast.

##Output:

The car is very fast.

##Example:

The car is very fast.

##Instruction:

Write a response that appropriately completes the request.

##Input:

The car is very fast.

##Output:

The car is very fast.

##Example:

The car is very fast.

##Instruction:

Write a response that appropriately completes the request.

##Input:

The car is very fast.

##Output:

The car is 

In [31]:
# # Listing 7.9 Generating test set responses
# from tqdm import tqdm


# # 需要30分钟
# for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
#     input_text = format_input(entry)

#     token_ids = generate(
#         model=model,
#         idx=text_to_token_ids(input_text, tokenizer).to(device),
#         max_new_tokens=256,
#         context_size=BASE_CONFIG["context_length"],
#         eos_id=50256
#     )
#     generated_text = token_ids_to_text(token_ids, tokenizer)
#     response_text = generated_text[len(input_text):].replace("### Response:",
# "").strip()
#     test_data[i]["model_response"] = response_text

# with open("instruction-data-with-response.json", "w") as file:
#     json.dump(test_data, file, indent=4) # "indent" for pretty-printing

 11%|████████▊                                                                        | 12/110 [03:31<28:49, 17.65s/it]

KeyboardInterrupt



In [32]:
# # 利用另一个更大的大语言模型对微调模型的响应进行自动化评估。
# # ollama run phi3

# # 以下代码用于验证 Ollama 会话是否正常运行
# import psutil

# def check_if_running(process_name):
#     running = False
#     for proc in psutil.process_iter(["name"]):
#         if process_name in proc.info["name"]:
#             running = True
#             break
#     return running

# ollama_running = check_if_running("ollama")

# if not ollama_running:
#     raise RuntimeError("Ollama not running. Launch ollama before proceeding.")
# print("Ollama running:", check_if_running("ollama"))

Ollama running: True


In [33]:
# # Listing 7.10 Querying a local Ollama model
# import urllib.request

# def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
#     data = {                                                               #A
#         "model": model,
#         "seed": 123, # for deterministic responses
#         "temperature": 0, # for deterministic responses
#         "messages": [
#             {"role": "user", "content": prompt}
#         ]
#     }

#     payload = json.dumps(data).encode("utf-8")                             #B
#     request = urllib.request.Request(url, data=payload, method="POST")     #C
#     request.add_header("Content-Type", "application/json")                 #C

#     response_data = ""
#     with urllib.request.urlopen(request) as response:                      #D
#         while True:
#             line = response.readline().decode("utf-8")
#             if not line:
#                 break
#             response_json = json.loads(line)
#             response_data += response_json["message"]["content"]
#     return response_data


# #A 将数据载荷创建为字典格式
# #B 将字典转换为 JSON 格式字符串，并编码为字节数据
# #C 创建请求对象，设置方法为 POST，并添加必要的请求头
# #D 发送请求并接收响应

In [41]:
# model = "llama3"
# result = query_model("What do Llamas eat?", model)
# print(result)

HTTPError: HTTP Error 404: Not Found