Skip to content

Commit

Permalink
Merge pull request #420 from TylunasLi/tokenizer
Browse files Browse the repository at this point in the history
修复python脚本转换模型特殊token的错误
  • Loading branch information
ztxz16 committed Feb 26, 2024
2 parents 4af2b7d + 7878126 commit f9c99aa
Showing 1 changed file with 5 additions and 6 deletions.
11 changes: 5 additions & 6 deletions tools/fastllm_pytools/torch2flm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
from tokenizers.decoders import ByteLevel

def writeString(fo, s):
    """Write string *s* to binary file object *fo* as a length-prefixed record.

    Layout: a 4-byte little-endian-native int giving the UTF-8 byte length,
    followed by the UTF-8 bytes themselves.

    The length must be measured on the ENCODED bytes, not on the Python
    string: for non-ASCII text (e.g. Chinese special tokens) len(s) in
    characters is smaller than the byte length, which corrupts the record.
    """
    # Encode once; avoid shadowing the builtin `bytes`.
    data = s.encode()
    fo.write(struct.pack('i', len(data)))
    fo.write(data)

def writeKeyValue(fo, key, value):
writeString(fo, key)
Expand Down Expand Up @@ -212,8 +213,7 @@ def tofile(exportPath,
if ("tokenizer_has_special_tokens" in modelInfo):
fo.write(struct.pack('i', len(tokenizer.all_special_tokens)))
for special_token in tokenizer.all_special_tokens:
fo.write(struct.pack('i', len(special_token)))
fo.write(special_token.encode())
writeString(fo, special_token)
else:
fo.write(struct.pack('i', 0))

Expand Down Expand Up @@ -248,8 +248,7 @@ def tofile(exportPath,
weight_name = key
if hasattr(model, "peft_config"):
weight_name = weight_name.replace('base_model.model.', '')
fo.write(struct.pack('i', len(weight_name)))
fo.write(weight_name.encode())
writeString(fo, weight_name)
fo.write(struct.pack('i', len(cur.shape)))
for i in cur.shape:
fo.write(struct.pack('i', i))
Expand Down

0 comments on commit f9c99aa

Please sign in to comment.