In [1]:
# Start loading: build the tokenizer from a local checkpoint directory.
from transformers import AutoTokenizer, AutoConfig

# Local checkpoint directory; a later cell passes the same path to AutoConfig.
model_path = "/root/autodl-tmp/model/chatglm-6b"

# NOTE(review): trust_remote_code=True is presumably required because the
# tokenizer implementation ships with the checkpoint — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)



In [2]:
# Test text tokenization: encode a short sentence into token ids.
# encode() is called with its defaults here; in the recorded output the id
# list ends with two extra ids (130001, 130004) that do not appear when
# add_special_tokens=False is passed in later cells.
text = "我爱学习"
tokens = tokenizer.encode(text)
print("tokens:", tokens)

tokens: [5, 76202, 63992, 130001, 130004]


In [5]:
# Show the string pieces (instead of ids) the tokenizer produces for `text`.
print(tokenizer.tokenize(text))

['▁', '我爱', '学习']


In [8]:
import sentencepiece as spm

# Input: the SentencePiece model file inside the checkpoint directory.
# Output: a plain-text dump of its vocabulary, written next to it.
ice_text_model_path = "/root/autodl-tmp/model/chatglm-6b/ice_text.model"
vocab_path = "/root/autodl-tmp/model/chatglm-6b/vocab.txt"

sp = spm.SentencePieceProcessor()
sp.load(ice_text_model_path)

# Build one "<id>\t<piece>" line per vocabulary entry and echo each piece.
# The loop variable is deliberately not named `id`: a loop variable at cell
# (module) level stays bound after the loop, so `id` would shadow the builtin
# id() for every later cell in the kernel.
save_vocab = []
for piece_id in range(sp.vocab_size()):
    piece = sp.id_to_piece(piece_id)  # look the piece up once, use it twice
    save_vocab.append(str(piece_id) + "\t" + piece)
    print(piece)

# Plain write mode is sufficient: the handle is only written to, never read.
with open(vocab_path, "w", encoding="utf-8") as f:
    f.write("\n".join(save_vocab))

<unk>
<s>
</s>
<pad>
<n>
▁
,
.
0
1
2
-
:
3
)
5
4
"
9
▁(
(
6
'
8
_
7
/
;
▁"
▁=
’
?
*
>
▁-
!
▁{
▁$
▁+
}
%
="
]
\
▁*
).
▁<
);
▁“
▁}
{
$
[
▁[
=
),
**
",
“
...
<
#
”
▁\
."
+
()
</
▁'
▁–
▁&
://
▁//
.”
("
▁@
//
^
。"
">
->
();
,”
":
▁|
|
~
::
@
('
:"
▁#
▁—
▁$\
&
·
_{
▁*/
▁</
–
▁the
▁of
▁and
▁to
▁a
▁in
s
▁is
▁for
▁that
▁with
▁on
▁it
▁be
▁as
▁I
▁was
▁by
▁are
t
▁you
▁this
▁from
▁The
▁at
▁an
▁or
▁have
▁not
▁can
The
a
▁has
▁were
▁will
A
▁which
▁but
d
I
▁we
▁if
▁your
m
▁he
▁one
▁all
▁his
▁more
x
e
n
▁their
▁they
▁also
i
▁been
▁when
▁about
S
▁so
▁do
▁my
▁time
▁new
▁out
▁said
▁A
▁would
c
▁other
▁had
▁who
▁there
▁In
▁up
▁like
▁than
b
▁use
▁into
ing
p
▁its
▁no
▁two
y
▁first
▁some
▁may
▁only
▁get
C
▁after
▁It
▁our
▁what
Category
r
g
▁This
ed
▁any
re
f
▁just
▁such
▁using
▁these
▁over
o
▁data
▁used
▁between
▁her
In
in
h
▁them
▁people
▁then
▁me
B
k
com
▁how
▁could
▁i
▁return
▁should
▁de
▁We
▁where
▁well
D
▁patients
▁need
▁most
▁make
▁work
▁years
▁because
er
▁way
▁through
v
l
▁want
u
▁class
▁each
▁C
▁she
id


In [9]:
# Number of entries collected from the SentencePiece model (one per piece id).
len(save_vocab)

130344

In [10]:
# Vocabulary as exposed by the tokenizer; later cells index it by piece
# string and invert it to map ids back to pieces.
vocab = tokenizer.get_vocab()

# Size of the tokenizer vocabulary, for comparison with len(save_vocab).
len(vocab)

130344

In [14]:
# Look up the ids of two pieces. Only the last bare expression of a cell is
# echoed, so the first lookup runs but its value is not displayed.
vocab["▁我是"]

vocab["▁"]

5

In [16]:
# Two sentences made of the same words in a different order.
text1 = "苹果是我昨天买的"
text2 = "我是昨天买的苹果"

# Encode both sentences without the tokenizer's special tokens.
token1, token2 = [
    tokenizer.encode(sentence, add_special_tokens=False)
    for sentence in (text1, text2)
]

print(f"token1: {token1}, token2: {token2}")

# Inverted vocabulary (id -> piece); later cells reuse it to inspect ids.
vocab_exchange = {token_id: piece for piece, token_id in vocab.items()}

# Translate each id list back into its pieces.
participles1, participles2 = [
    [vocab_exchange[token_id] for token_id in ids]
    for ids in (token1, token2)
]

print(f"participles1: {participles1}, participles2: {participles2}")


token1: [5, 65319, 66872, 67363, 68543], token2: [71232, 67363, 68543, 65319]
participles1: ['▁', '苹果', '是我', '昨天', '买的'], participles2: ['▁我是', '昨天', '买的', '苹果']


In [17]:
# The same sentence, once with three consecutive spaces and once with a
# single space, to see how whitespace is tokenized.
text3 = "我是   昨天买的苹果"
text4 = "我是 昨天买的苹果"

# Encode both variants without the tokenizer's special tokens.
token3, token4 = [
    tokenizer.encode(sentence, add_special_tokens=False)
    for sentence in (text3, text4)
]

# Map ids back to pieces with the inverted vocabulary built in an earlier cell.
participles3, participles4 = [
    [vocab_exchange[token_id] for token_id in ids]
    for ids in (token3, token4)
]

print(f"participles3: {participles3}, participles4: {participles4}")


participles3: ['▁我是', '<|blank_3|>', '昨天', '买的', '苹果'], participles4: ['▁我是', '▁昨天', '买的', '苹果']


In [21]:
# Load the model configuration; it is used below only for its eos_token_id.
# NOTE(review): device_map is normally a model-loading option — presumably it
# has no effect on AutoConfig; confirm and drop it if so.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, device_map="auto")

# One toy sample: "context" is the prompt, "target" is the expected reply.
example = {"context": "你是谁", "target": "人家是城堡中的小公主"}

max_seq_length = 200

prompt, target = example["context"], example["target"]

# The prompt is encoded with the tokenizer's default special tokens.
# Truncation is applied to each part separately, so the concatenated
# sequence can be longer than max_seq_length.
prompt_ids = tokenizer.encode(prompt, max_length=max_seq_length, truncation=True)

# The target is encoded without special tokens; the end-of-sequence id is
# appended by hand below.
target_ids = tokenizer.encode(
    target, max_length=max_seq_length, truncation=True, add_special_tokens=False
)

input_ids = [*prompt_ids, *target_ids, config.eos_token_id]

# seq_len is the prompt length, i.e. the index where the target part starts.
print({"input_ids": input_ids, "seq_len": len(prompt_ids)})


{'input_ids': [5, 108293, 130001, 130004, 5, 65870, 63829, 75581, 64102, 103559, 130005], 'seq_len': 4}


In [23]:
# Map every id of the assembled input_ids back to its vocabulary piece so the
# full sequence (prompt, special tokens, target, end token) can be read.
participles_all = [vocab_exchange[item] for item in input_ids]

print(participles_all)

['▁', '你是谁', '[gMASK]', '<sop>', '▁', '人家', '是', '城堡', '中的', '小公主', '<eop>']


In [27]:
# 再看一下解码过程，就是按照词典，将token_id转换成字符串，同时连接符会被去掉
tokens = [5, 108293, 130001, 130004, 5, 65870, 63829, 75581, 64102, 103559, 130005]

participles = [vocab_exchange[token] for token in tokens]
print(participles)

decode_tokens = tokenizer.decode(tokens)
print(decode_tokens)

['▁', '你是谁', '[gMASK]', '<sop>', '▁', '人家', '是', '城堡', '中的', '小公主', '<eop>']
你是谁 人家是城堡中的小公主


In [None]:
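# The vocabulary (ice_text.model) was built with the BPE algorithm from the sentencepiece package.
# The sentencepiece package implements four algorithms: BPE, Unigram, Word and Char.
# Both LLaMA and ChatGLM use this package for tokenization and for generating their vocabularies.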