In [14]:
def decode_utf8_bytes_to_str_wrong(bytestring: bytes):
    """Deliberately incorrect UTF-8 decoder that decodes each byte in isolation.

    Prints the integer value of every byte, then prints the list of characters
    obtained by decoding each byte on its own, and finally returns those
    characters joined into a single string.

    This only succeeds when every byte is a complete UTF-8 sequence by itself
    (i.e. ASCII). A byte that is part of a multi-byte sequence cannot be
    decoded alone, so ``bytes.decode`` raises ``UnicodeDecodeError``; the byte
    values have already been printed by then.
    """
    print(list(bytestring))
    pieces = []
    for value in bytestring:
        # Wrap the single int back into a 1-byte ``bytes`` object and decode it.
        pieces.append(bytes([value]).decode("utf-8"))
    print(pieces)
    return "".join(pieces)

# Intentional failure demo: each character of "你好" is encoded as several
# bytes, so decoding the bytes one at a time cannot succeed. The expected
# exception is caught and reported so the notebook can continue past this cell.
try:
    decode_utf8_bytes_to_str_wrong("你好".encode("utf-8"))
except UnicodeDecodeError as err:
    print(f"{type(err).__name__}: {err}")

[228, 189, 160, 229, 165, 189]


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 0: unexpected end of data

In [25]:
# Third-party `regex` package, aliased to `re`; the pattern below relies on
# the \p{L} / \p{N} Unicode property classes.
import regex as re

# Pre-tokenization pattern. Alternatives, in order: English contraction
# suffixes ('s, 'd, 'm, 't, 'll, 've, 're); a run of letters with an optional
# leading space; a run of digits with an optional leading space; a run of
# other non-space characters with an optional leading space; whitespace that
# is not followed by a non-space character; any remaining whitespace.
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

# Creates a match iterator over a sample sentence; the iterator is neither
# consumed nor displayed (only the last expression of a cell is shown).
re.finditer(PAT, "some text that i'll pre-tokenize 3.12 114514 <你好小笼包>")

special_tokens = ["<|begin|>", "<|end|>", "<|pad|>", "<|unk|>"]
corpus = "hello world <|begin|> this is a test <|end|> 1234 <|pad|> <|unk|>"

# Escape each token (they contain the regex metacharacter "|") and join them
# into one alternation, then split the corpus wherever any token occurs.
pattern = "|".join(map(re.escape, special_tokens))
docs = re.split(pattern, corpus)
docs

['hello world ', ' this is a test ', ' 1234 ', ' ', '']

In [26]:
special_tokens = ["<|begin|>", "<|end|>", "<|pad|>", "<|unk|>"]
corpus = "hello world hello hello hello<|begin|> this is a test <|end|> 1234 <|pad|> <|unk|>"

# Remove the special tokens: split the corpus at every occurrence of any
# special token, which yields the text pieces ("documents") between them.
pattern = "|".join(map(re.escape, special_tokens))
docs = re.split(pattern, corpus)

# Pre-tokenize every document and build a frequency table whose keys are the
# UTF-8 bytes of each pre-token.
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
word_freq = {}

for doc in docs:
    if doc:  # empty pieces (e.g. after a trailing special token) contribute nothing
        for word in re.finditer(PAT, doc):
            word_bytes = word.group(0).encode('utf-8')
            word_freq[word_bytes] = word_freq.get(word_bytes, 0) + 1
word_freq

{b'hello': 1,
 b' world': 1,
 b' hello': 3,
 b' this': 1,
 b' is': 1,
 b' a': 1,
 b' test': 1,
 b' ': 3,
 b' 1234': 1}

In [4]:
# Print two pieces of text joined by the NUL character (code point 0). The
# character is written to the output as-is; how (or whether) it is rendered
# depends on the front end.
print("this is a test", "string", sep=chr(0))

this is a test string


In [2]:
import numpy as np

def make_rope_rotation(theta, dim):
    """Build a block-diagonal RoPE-style rotation matrix.

    The result is a ``(dim, dim)`` float matrix that is zero everywhere except
    for 2x2 rotation blocks ``[[cos, -sin], [sin, cos]]`` placed on the
    coordinate pairs ``(0, 1), (2, 3), ...``.

    Args:
        theta: Rotation angle in radians. Either a scalar, in which case the
            same angle is used for every pair, or an array-like holding one
            angle per pair (``dim // 2`` values).
        dim: Size of the matrix; must be even.

    Returns:
        ``numpy.ndarray`` of shape ``(dim, dim)``.

    Raises:
        AssertionError: If ``dim`` is odd.
        ValueError: If ``theta`` cannot be broadcast to ``dim // 2`` angles.
    """
    assert dim % 2 == 0
    # One angle per coordinate pair; a scalar is broadcast to all pairs.
    angles = np.broadcast_to(np.asarray(theta, dtype=float), (dim // 2,))
    # Evaluate the trig functions once instead of once per loop iteration.
    cos_t = np.cos(angles)
    sin_t = np.sin(angles)

    R = np.zeros((dim, dim))
    even = np.arange(0, dim, 2)  # first index of every pair
    odd = even + 1               # second index of every pair
    R[even, even] = cos_t
    R[even, odd] = -sin_t
    R[odd, even] = sin_t
    R[odd, odd] = cos_t
    return R

def test_commutativity(W, x, R, eps=1e-6):
    """测试是否 R @ (W @ x) == W @ (R @ x)"""
    r1 = R @ (W @ x)
    r2 = W @ (R @ x)
    return np.allclose(r1, r2, atol=eps)

def run_tests(seed=0):
    """Print a table showing which kinds of matrices commute with a rotation.

    Builds an 8x8 rotation matrix with a 45-degree angle via
    ``make_rope_rotation``, draws a random vector ``x``, and for several kinds
    of matrix ``W`` reports whether ``R @ (W @ x)`` equals ``W @ (R @ x)``
    according to ``test_commutativity``.

    Args:
        seed: Seed for the local random generator, so repeated runs use the
            same random vector and matrices. Pass ``None`` for fresh entropy.
    """
    dim = 8
    theta = np.pi / 4  # 45-degree rotation
    # Local generator: reproducible and does not touch NumPy's global RNG state.
    rng = np.random.default_rng(seed)
    R = make_rope_rotation(theta, dim)
    x = rng.standard_normal(dim)

    A = rng.standard_normal((dim, dim))
    test_cases = {
        # uniform scaling (2 * identity)
        "等比缩放矩阵": np.eye(dim) * 2,
        # diagonal matrix with distinct entries 1..dim
        "对角矩阵（非等比）": np.diag(np.arange(1, dim+1)),
        # dense random matrix
        "随机矩阵": rng.standard_normal((dim, dim)),
        # upper-triangular part of a random matrix
        "上三角矩阵": np.triu(rng.standard_normal((dim, dim))),
        # lower-triangular part of a random matrix
        "下三角矩阵": np.tril(rng.standard_normal((dim, dim))),
        # symmetric matrix built as A @ A.T
        "对称矩阵": A @ A.T,
    }

    print(f"{'类型':<12} | 是否相等")
    print("-" * 30)
    for name, W in test_cases.items():
        same = test_commutativity(W, x, R)
        print(f"{name:<12} | {'✅ 相等' if same else '❌ 不相等'}")

# Run the demo: prints one row per matrix type with the commutativity result.
run_tests()

类型           | 是否相等
------------------------------
等比缩放矩阵       | ✅ 相等
对角矩阵（非等比）    | ❌ 不相等
随机矩阵         | ❌ 不相等
上三角矩阵        | ❌ 不相等
下三角矩阵        | ❌ 不相等
对称矩阵         | ❌ 不相等
