diff --git a/README.md b/README.md
index c67f4fb..f575cad 100644
--- a/README.md
+++ b/README.md
@@ -370,33 +370,38 @@ python3 tools/moss_export.py moss-int4.flm int4 # export the int4 model
For converting some models, [see the examples here](docs/llama_cookbook.md).
#### Qwen model export
+* **Qwen**
```sh
# Install the Qwen environment first
# If you use a model you fine-tuned yourself, modify the tokenizer/model creation code in qwen2flm.py
# Export the model at the precision you need
+cd build
python3 tools/qwen2flm.py qwen-7b-fp16.flm float16 # export the float16 model
python3 tools/qwen2flm.py qwen-7b-int8.flm int8 # export the int8 model
python3 tools/qwen2flm.py qwen-7b-int4.flm int4 # export the int4 model
```
-#### MiniCPM model export
-```sh
-cd build
-python tools/minicpm2flm.py # export the float16 model
-./main -p minicpm-2b-float16.flm # run the model
-```
-
* **Qwen1.5**
```sh
# Install the Qwen2 environment first (transformers >= 4.37.0)
# Export the model at the precision you need
+cd build
python3 tools/llamalike2flm.py qwen1.5-4b-fp16.flm float16 "qwen/Qwen1.5-4B-Chat" # export the Qwen1.5-4B-Chat float16 model
python3 tools/llamalike2flm.py qwen1.5-7b-int8.flm int8 "qwen/Qwen1.5-7B-Chat" # export the Qwen1.5-7B-Chat int8 model
python3 tools/llamalike2flm.py qwen1.5-14b-int4.flm int4 "qwen/Qwen1.5-14B-Chat" # export the Qwen1.5-14B-Chat int4 model
# The last argument can be replaced with a local model path
```
+#### MiniCPM model export
+```sh
+# Install the MiniCPM environment first (transformers >= 4.36.0)
+# By default the script exports the MiniCPM-2B-dpo-fp16 model
+cd build
+python3 tools/minicpm2flm.py minicpm-2b-float16.flm # export the dpo float16 model
+./main -p minicpm-2b-float16.flm # run the model
+```
+
## Development plan
Informally known as the wish-list section; if there is a feature you need, feel free to raise it in the discussion area.
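The export scripts above all produce a self-contained `.flm` file that the C++ runtime loads directly. For context, a minimal sketch of consuming an exported file from C++; this assumes the `fastllm::CreateLLMModelFromFile` factory (in the project's `model.h`) and the `basellm::Response`/`MakeInput` interfaces declared in `include/models/basellm.h` below, so treat the header path and exact signatures as assumptions rather than a verified API:

```cpp
// Sketch: load an exported .flm model and stream one reply.
// Assumed API: fastllm::CreateLLMModelFromFile and basellm::Response/MakeInput.
#include "model.h"   // assumed header for CreateLLMModelFromFile
#include <cstdio>
#include <string>

int main() {
    auto model = fastllm::CreateLLMModelFromFile("qwen-7b-int4.flm");
    // MakeInput builds the chat prompt; the callback streams partial output
    // (index >= 0 for intermediate chunks, -1 for the final full string).
    std::string reply = model->Response(
        model->MakeInput("", 0, "Hello!"),
        [](int index, const char *content) {
            if (index >= 0) std::printf("%s", content);
        });
    std::printf("\n");
    return 0;
}
```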
diff --git a/example/Win32Demo/fastllm.vcxproj b/example/Win32Demo/fastllm.vcxproj
index e5e33c3..0e57dfd 100644
--- a/example/Win32Demo/fastllm.vcxproj
+++ b/example/Win32Demo/fastllm.vcxproj
@@ -163,7 +163,6 @@
      <AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
-      <AdditionalDependencies>cudart.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <SubSystem>Windows</SubSystem>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
@@ -181,7 +180,7 @@
-
+
diff --git a/include/models/basellm.h b/include/models/basellm.h
index 156a1d6..6130254 100644
--- a/include/models/basellm.h
+++ b/include/models/basellm.h
@@ -1,4 +1,7 @@
-#pragma once
+
+#ifndef FASTLLM_BASELLM_H
+#define FASTLLM_BASELLM_H
+
#include "fastllm.h"
#include
@@ -50,9 +53,9 @@ namespace fastllm {
this->weight.ReleaseWeight();
};
-        virtual void LoadFromFile(const std::string &fileName); // load from file
+        virtual void LoadFromFile(const std::string &fileName); // load from file
-        virtual void InitParams(); // initialize parameter info
+        virtual void InitParams(); // initialize parameter info
        // inference
virtual int Forward(
@@ -85,12 +88,12 @@ namespace fastllm {
const LastTokensManager &lastTokens = LastTokensManager(),
                            std::vector <std::vector <float>*> *logits = nullptr);
-        // build LLM inference inputs from the given tokens
+        // build LLM inference inputs from the given tokens
        virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
                                   const std::map <std::string, int> &params,
                                   Data &inputIds, Data &attentionMask, Data &positionIds);
-        // build LLM inference inputs from a batch of tokens
+        // build LLM inference inputs from a batch of tokens
        virtual void FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
                                        const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
@@ -102,16 +105,16 @@ namespace fastllm {
        virtual void ResponseBatch(const std::vector <std::string> &inputs,
                                   std::vector <std::string> &outputs,
RuntimeResultBatch retCb = nullptr,
-                                   const GenerationConfig &generationConfig = GenerationConfig()); // batched replies for the given inputs
+                                   const GenerationConfig &generationConfig = GenerationConfig()); // batched replies for the given inputs
        virtual int LaunchResponseTokens(const std::vector <int> &inputTokens,
                                         const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task and return the allocated handleId
-        virtual int FetchResponseTokens(int handleId); // fetch the output for the given handle; -1 means generation finished
+        virtual int FetchResponseTokens(int handleId); // fetch the output for the given handle; -1 means generation finished
        virtual int FetchResponseLogits(int handleId, std::vector <float> &logits); // fetch the output logits for the given handle
-        virtual void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model
+        virtual void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model
        virtual void SaveModel(const std::string &fileName); // export the model directly
@@ -158,3 +161,5 @@ namespace fastllm {
int tokensLimit = -1;
};
}
+
+#endif //FASTLLM_BASELLM_H
\ No newline at end of file
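One note on the basellm.h hunk: `#pragma once` gives way to a named include guard. Both prevent double inclusion, but the macro form is the fully portable idiom and keeps working if the same header is ever reached through two different filesystem paths. The pattern, for reference:

```cpp
// Classic include guard as now used by basellm.h: the first inclusion
// defines the macro, so any repeated inclusion expands to nothing.
#ifndef FASTLLM_BASELLM_H
#define FASTLLM_BASELLM_H

namespace fastllm { /* declarations live here */ }

#endif // FASTLLM_BASELLM_H
```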
diff --git a/include/models/minicpm.h b/include/models/minicpm.h
index b1849b6..6585c6a 100644
--- a/include/models/minicpm.h
+++ b/include/models/minicpm.h
@@ -6,12 +6,13 @@
#define FASTLLM_MINICPM_H
#include "basellm.h"
+#include "llama.h"
#include "cmath"
#include
namespace fastllm {
- class MiniCpmModel: public basellm {
+ class MiniCpmModel: public LlamaModel {
public:
        MiniCpmModel(); // constructor
@@ -48,26 +49,6 @@ namespace fastllm {
const LastTokensManager &lastTokens = LastTokensManager(),
                            std::vector <std::vector <float>*> *logits = nullptr);
- virtual std::string Response(const std::string& input,
- RuntimeResult retCb,
-                                     const GenerationConfig &generationConfig = GenerationConfig()); // respond to the given input
-
-        virtual void ResponseBatch(const std::vector <std::string> &inputs,
-                                   std::vector <std::string> &outputs,
- RuntimeResultBatch retCb,
- const GenerationConfig &generationConfig = GenerationConfig());
-
-        virtual int LaunchResponseTokens(const std::vector <int> &inputTokens,
-                                         const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task and return the allocated handleId
-
-        virtual int FetchResponseTokens(int handelId); // fetch the output for the given handle; -1 means generation finished
-
-        virtual void WarmUp(); // warm up
-
-        virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from history and the current input
-
-        virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update history with the current round's reply
-
private:
float embed_scale = 1.f;
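The minicpm.h hunk re-parents `MiniCpmModel` from `basellm` to `LlamaModel`, which is what lets the diff delete the duplicated `Response`/`ResponseBatch`/`LaunchResponseTokens`/`FetchResponseTokens`/`WarmUp`/`MakeInput`/`MakeHistory` declarations (and their definitions in minicpm.cpp below): MiniCPM keeps only its model-specific `Forward` variants and scaling fields and inherits the rest of the generation machinery. A hypothetical sketch of the shape of this refactor, with stand-in names and math:

```cpp
#include <string>

// Stand-in classes, not the real fastllm declarations: the base implements
// the generation driver once against a virtual Forward(); a derived model
// overrides only the computation that actually differs.
struct LlamaLikeModel {
    virtual ~LlamaLikeModel() = default;
    virtual int Forward(int token) = 0;   // model-specific per-layer math
    std::string Response(int token) {     // shared driver, written once
        return "next token: " + std::to_string(Forward(token));
    }
};

struct MiniCpmLikeModel : LlamaLikeModel {
    // Only the forward pass is MiniCPM-specific (embedding scale, scaled
    // residual adds); Response() and friends are inherited unchanged.
    int Forward(int token) override { return token + 1; }  // stand-in math
};
```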
diff --git a/src/models/minicpm.cpp b/src/models/minicpm.cpp
index 1ee4aa3..ff5a9ae 100644
--- a/src/models/minicpm.cpp
+++ b/src/models/minicpm.cpp
@@ -18,32 +18,6 @@
namespace fastllm {
-    std::vector <float> GetInterLeavePowerOf3(int n) {
- float start = powf(2, -powf(2, -(log2f(n) - 3)));
- float ratio = start;
-        std::vector <float> ret;
- for (int i = 0; i < n; i++) {
- ret.push_back(start * powf(ratio, i));
- }
- return ret;
- }
-    std::vector <float> GetInterleave2(int n) {
- int base = 1;
- while (base < n) {
- base <<= 1;
- }
- if (base == n) {
- return GetInterLeavePowerOf3(n);
- } else {
-            std::vector <float> ret = GetInterLeavePowerOf3(base / 2);
-            std::vector <float> part2 = GetInterLeavePowerOf3(base);
- for (int i = 0; i < n - base / 2; i++) {
- ret.push_back(part2[i * 2]);
- }
- return ret;
- }
- }
-
MiniCpmModel::MiniCpmModel() {
this->model_type = "minicpm";
@@ -100,11 +74,6 @@ namespace fastllm {
                                const fastllm::Data &positionIds, std::vector <std::pair <Data, Data> > &pastKeyValues,
                                const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
                                std::vector <float> *retLogits) {
- Data alibiData;
- if (this->weight.dicts["use_alibi"] == "1") {
-            std::vector <float> alibi = GetInterleave2(num_attention_heads);
- alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
- }
int maxLen = inputIds.dims[1];
Data hiddenStates;
@@ -145,11 +114,18 @@ namespace fastllm {
k.Reshape(qkvSize);
v.Reshape(qkvSize);
- if (alibiData.dims.size() == 0) {
- fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
- fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
+ Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
+ if (GetKVCacheInCPU()) {
+ pastKey.lockInCPU = true;
+ pastValue.lockInCPU = true;
+ } else {
+ pastKey.ToDevice(DataDevice::CUDA);
+ pastValue.ToDevice(DataDevice::CUDA);
}
+ fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
+ fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
+
qkvSize = {bsz * seqlen, num_attention_heads, -1};
q.Reshape(qkvSize);
k.Reshape(qkvSize);
@@ -159,15 +135,6 @@ namespace fastllm {
PermuteSelf(k, {1, 0, 2});
PermuteSelf(v, {1, 0, 2});
- Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
- if (GetKVCacheInCPU()) {
- pastKey.lockInCPU = true;
- pastValue.lockInCPU = true;
- } else {
- pastKey.ToDevice(DataDevice::CUDA);
- pastValue.ToDevice(DataDevice::CUDA);
- }
-
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
@@ -201,9 +168,7 @@ namespace fastllm {
// 1.2.0 q * k^T
MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim));
attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]});
- if (alibiData.dims.size() != 0) {
- AlibiMask(attenWeights, alibiData, -10000);
- } else if (attentionMask.dims.size() != 0) {
+ if (attentionMask.dims.size() != 0) {
AttentionMask(attenWeights, attentionMask, -10000);
}
@@ -215,8 +180,8 @@ namespace fastllm {
attenOutput.Reshape({bsz, seqlen, -1});
Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
- Mul(attenLastOutput, this->attention_scale, attenLastOutput);
- AddTo(hiddenStates, attenLastOutput);
+ // Mul(attenLastOutput, this->attention_scale, attenLastOutput);
+ AddTo(hiddenStates, attenLastOutput, this->attention_scale);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
@@ -224,8 +189,8 @@ namespace fastllm {
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
- Mul(w2, this->attention_scale, w2);
- AddTo(hiddenStates, w2);
+ // Mul(w2, this->attention_scale, w2);
+ AddTo(hiddenStates, w2, this->attention_scale);
}
Data logits, topk;
Data tempHiddenStates;
@@ -265,11 +230,6 @@ namespace fastllm {
                                          const fastllm::Data &positionIds, std::vector <std::pair <Data, Data> > &pastKeyValues,
                                          const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
                                          std::vector <std::vector <float>*> *retLogits) {
- Data alibiData;
- if (this->weight.dicts["use_alibi"] == "1") {
-            std::vector <float> alibi = GetInterleave2(num_attention_heads);
- alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
- }
int maxLen = inputIds.dims[1];
Data hiddenStates;
@@ -311,11 +271,18 @@ namespace fastllm {
k.Reshape(qkvSize);
v.Reshape(qkvSize);
- if (alibiData.dims.size() == 0) {
- fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
- fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
+ Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
+ if (GetKVCacheInCPU()) {
+ pastKey.lockInCPU = true;
+ pastValue.lockInCPU = true;
+ } else {
+ pastKey.ToDevice(DataDevice::CUDA);
+ pastValue.ToDevice(DataDevice::CUDA);
}
+ fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim);
+ fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim);
+
PermuteSelf(q, {0, 2, 1, 3});
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
@@ -325,15 +292,6 @@ namespace fastllm {
k.Reshape(qkvSize);
v.Reshape(qkvSize);
- Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second;
- if (GetKVCacheInCPU()) {
- pastKey.lockInCPU = true;
- pastValue.lockInCPU = true;
- } else {
- pastKey.ToDevice(DataDevice::CUDA);
- pastValue.ToDevice(DataDevice::CUDA);
- }
-
int unitLen = 64;
#ifdef USE_CUDA
unitLen = 128;
@@ -368,11 +326,7 @@ namespace fastllm {
// 1.2.0 q * k^T
MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim));
attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]});
- if (alibiData.dims.size() != 0) {
- attenWeights.Reshape({-1, num_attention_heads, attenWeights.dims[2], attenWeights.dims[3]});
- AlibiMask(attenWeights, alibiData, -10000);
- attenWeights.Reshape({1, -1, attenWeights.dims[2], attenWeights.dims[3]});
- } else if (attentionMask.dims.size() != 0) {
+ if (attentionMask.dims.size() != 0) {
AttentionMask(attenWeights, attentionMask, -10000);
}
Softmax(attenWeights, attenWeights, -1);
@@ -384,8 +338,8 @@ namespace fastllm {
PermuteSelf(attenOutput, {1, 0, 2});
Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
- Mul(attenLastOutput, this->attention_scale, attenLastOutput);
- AddTo(hiddenStates, attenLastOutput);
+ // Mul(attenLastOutput, this->attention_scale, attenLastOutput);
+ AddTo(hiddenStates, attenLastOutput, this->attention_scale);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
@@ -393,8 +347,8 @@ namespace fastllm {
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
- Mul(w2, this->attention_scale, w2);
- AddTo(hiddenStates, w2);
+ // Mul(w2, this->attention_scale, w2);
+ AddTo(hiddenStates, w2, this->attention_scale);
}
Data logits, topk;
@@ -440,11 +394,6 @@ namespace fastllm {
                                                 const std::vector <GenerationConfig> &generationConfigs,
const LastTokensManager &lastTokens,
                                                 std::vector <std::vector <float>*> *retLogits) {
- Data alibiData;
- if (this->weight.dicts["use_alibi"] == "1") {
-            std::vector <float> alibi = GetInterleave2(num_attention_heads);
- alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi));
- }
Data hiddenStates;
Data attenInput;
Data q, k, v, qkv;
@@ -500,11 +449,18 @@ namespace fastllm {
k.Reshape(qkvSize);
v.Reshape(qkvSize);
- if (alibiData.dims.size() == 0) {
- fastllm::LlamaRotatePosition2D(q, *positionIds[b], sinData, cosData, rotary_dim);
- fastllm::LlamaRotatePosition2D(k, *positionIds[b], sinData, cosData, rotary_dim);
+ Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt + i].second;
+ if (GetKVCacheInCPU()) {
+ pastKey.lockInCPU = true;
+ pastValue.lockInCPU = true;
+ } else {
+ pastKey.ToDevice(DataDevice::CUDA);
+ pastValue.ToDevice(DataDevice::CUDA);
}
+ fastllm::LlamaRotatePosition2D(q, *positionIds[b], sinData, cosData, rotary_dim);
+ fastllm::LlamaRotatePosition2D(k, *positionIds[b], sinData, cosData, rotary_dim);
+
PermuteSelf(q, {0, 2, 1, 3});
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
@@ -513,15 +469,6 @@ namespace fastllm {
q.Reshape(qkvSize);
k.Reshape(qkvSize);
v.Reshape(qkvSize);
-
- Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt + i].second;
- if (GetKVCacheInCPU()) {
- pastKey.lockInCPU = true;
- pastValue.lockInCPU = true;
- } else {
- pastKey.ToDevice(DataDevice::CUDA);
- pastValue.ToDevice(DataDevice::CUDA);
- }
int unitLen = 64;
#ifdef USE_CUDA
@@ -559,9 +506,7 @@ namespace fastllm {
// 1.2.0 q * k^T
MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim));
attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]});
- if (alibiData.dims.size() != 0) {
- AlibiMask(attenWeights, alibiData, -10000);
- } else if (attentionMask[b] != nullptr) {
+ if (attentionMask[b] != nullptr) {
AttentionMask(attenWeights, *attentionMask[b], -10000);
}
@@ -580,8 +525,8 @@ namespace fastllm {
}
Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
- Mul(attenLastOutput, this->attention_scale, attenLastOutput);
- AddTo(hiddenStates, attenLastOutput);
+ // Mul(attenLastOutput, this->attention_scale, attenLastOutput);
+ AddTo(hiddenStates, attenLastOutput, this->attention_scale);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
@@ -589,8 +534,8 @@ namespace fastllm {
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
- Mul(w2, this->attention_scale, w2);
- AddTo(hiddenStates, w2);
+ // Mul(w2, this->attention_scale, w2);
+ AddTo(hiddenStates, w2, this->attention_scale);
}
Data logits, curLogit;
@@ -619,477 +564,4 @@ namespace fastllm {
return lastRet;
}
- std::string MiniCpmModel::Response(const std::string& input, RuntimeResult retCb,
- const GenerationConfig &generationConfig) {
-#ifdef USE_CUDA
- FastllmCudaClearBigBuffer();
-#endif
-//auto st = std::chrono::system_clock::now();
-#ifdef PY_API
- size_t pos = input.rfind("time_stamp:");
- std::string prompt = (generationConfig.enable_hash_id && pos != -1)? input.substr(0, pos):input;
-        size_t hash_id = std::hash <std::string> {}(input);
- Data inputIds = this->weight.tokenizer.Encode(prompt);
-#else
- Data inputIds = this->weight.tokenizer.Encode(input);
-#endif
-        std::vector <float> ids;
- for (int i = 0; i < inputIds.Count(0); i++) {
- ids.push_back(((float*)inputIds.cpuData)[i]);
- }
- int seqLen = ids.size();
- inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, ids));
-
-        std::vector <float> vmask = std::vector <float> (seqLen * seqLen, 0);
-        std::vector <float> vpids = std::vector <float> (seqLen, 0);
- for (int i = 0; i < seqLen; i++) {
- vpids[i] = i;
- for (int j = i + 1; j < seqLen; j++) {
- vmask[i * seqLen + j] = 1;
- }
- }
-
- Data attentionMask = Data(DataType::FLOAT32, {seqLen, seqLen}, vmask);
- Data positionIds = Data(DataType::FLOAT32, {1, seqLen}, vpids);
-
-        std::vector <std::pair <Data, Data> > pastKeyValues;
- for (int i = 0; i < block_cnt; i++) {
- pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
- Data(DataType::FLOAT32)));
- }
-
- std::string retString = "";
- int len = seqLen;
-        std::vector <float> results;
- int index = 0;
-
- LastTokensManager tokens (1, generationConfig.last_n);
- while (true) {
- auto st = std::chrono::system_clock::now();
-
- int ret = Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens);
- tokens.units[0].Push(ret);
- if (ret == eos_token_id) {
- break;
- }
-
- results.push_back(ret);
- std::string curString = weight.tokenizer.Decode(Data(DataType::FLOAT32, {(int)results.size()}, results)).c_str();
- retString += curString;
- if (retCb)
-#ifdef PY_API
- {
- if (generationConfig.enable_hash_id) {
- std::stringstream ss;
- ss << retString << "hash_id:" << hash_id;
- retCb(index, pybind11::bytes(ss.str()));
- } else {
- retCb(index, pybind11::bytes(retString));
- }
- }
-#else
- retCb(index, curString.c_str());
-#endif
- index++;
-
- if (index == generationConfig.output_token_limit) {
- break;
- }
- results.clear();
-
- attentionMask.ToDevice(DataDevice::CPU);
- positionIds.ToDevice(DataDevice::CPU);
- inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float)ret}));
- attentionMask = Data();
- positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float)len}));
- //if (do_sample) {
- // tokenPenaltyManager.InsertToken(ret);
- //}
- len++;
- if (index == generationConfig.output_token_limit) {
- break;
- }
-
- //printf("spend %f s.\n", GetSpan(st, std::chrono::system_clock::now()));
- }
- if (retCb)
-#ifdef PY_API
- {
- if (generationConfig.enable_hash_id) {
- std::stringstream ss;
- ss << retString << "hash_id:" << hash_id;
- retCb(-1, pybind11::bytes(ss.str()));
- } else {
- retCb(-1, pybind11::bytes(retString));
- }
- }
-#else
- retCb(-1, retString.c_str());
-#endif
-
- return retString;
- }
-
-    void MiniCpmModel::ResponseBatch(const std::vector <std::string> &inputs, std::vector <std::string> &outputs,
- RuntimeResultBatch retCb,
- const GenerationConfig &generationConfig) {
-#ifdef USE_CUDA
- FastllmCudaClearBigBuffer();
-#endif
-#ifdef PY_API
-    std::vector <std::string> prompts;
-    std::vector <size_t> hash_ids;
-    for (auto _input : inputs) {
-        size_t hash_id = std::hash <std::string> {}(_input);
- hash_ids.push_back(hash_id);
-
- size_t pos = _input.rfind("time_stamp:");
- std::string prompt = (generationConfig.enable_hash_id && pos != -1) ? _input.substr(0, pos) : _input;
- prompts.push_back(prompt);
- }
-#else
-    std::vector <std::string> prompts = inputs;
-#endif
- int batch = prompts.size();
- outputs.clear();
- outputs.resize(batch, "");
-
-        std::vector <Data> inputTokens;
-        std::vector <int> seqLens;
- inputTokens.resize(batch);
- seqLens.resize(batch);
- int maxLen = 0;
- for (int i = 0; i < batch; i++) {
- inputTokens[i].CopyFrom(this->weight.tokenizer.Encode(prompts[i]));
- maxLen = std::max(maxLen, (int)inputTokens[i].Count(0));
- seqLens[i] = (int)inputTokens[i].Count(0);
- }
-
-        std::vector <float> ids = std::vector <float> (batch * maxLen, 0);
-        std::vector <float> vpids = std::vector <float> (batch * maxLen, 0);
-        std::vector <float> vmask = std::vector <float> (batch * maxLen * maxLen, 0);
- for (int i = 0; i < batch; i++) {
- Data &tokens = inputTokens[i];
- int len = tokens.Count(0), base = maxLen - len;
- for (int j = 0; j < len; j++) {
- ids[i * maxLen + base + j] = ((float*)tokens.cpuData)[j];
- }
- for (int j = 0; j < len; j++) {
- vpids[i * maxLen + base + j] = j;
- }
-
- std::fill(vmask.data() + i * maxLen * maxLen,
- vmask.data() + i * maxLen * maxLen + (maxLen - len) * maxLen, 1.0);
- for (int j = maxLen - len; j < maxLen; j++) {
- std::fill(vmask.data() + i * maxLen * maxLen + j * maxLen,
- vmask.data() + i * maxLen * maxLen + j * maxLen + maxLen - len, 1.0);
- }
- for (int j = 0; j < len; j++) {
- for (int k = j + 1; k < len; k++) {
- vmask[i * maxLen * maxLen + (base + j) * maxLen + base + k] = 1;
- }
- }
- }
-
- Data inputIds = Data(DataType::FLOAT32, {batch, maxLen}, ids);
- Data attentionMask = Data(DataType::FLOAT32, {batch, maxLen, maxLen}, vmask);
- Data positionIds = Data(DataType::FLOAT32, {batch, maxLen}, vpids);
-
-        std::vector <std::pair <Data, Data> > pastKeyValues;
- for (int i = 0; i < block_cnt; i++) {
- pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
- Data(DataType::FLOAT32)));
- }
-
- std::string retString = "";
-        std::vector <int> lens = seqLens;
-        std::vector <bool> isEnding = std::vector <bool> (batch, false);
-        std::vector <float> results;
- int index = 0;
-
- LastTokensManager tokensManager (batch, generationConfig.last_n);
- while (true) {
- auto st = std::chrono::system_clock::now();
-            std::vector <int> ret = ForwardBatch(batch, inputIds, attentionMask, positionIds, pastKeyValues,
- generationConfig, tokensManager);
- for (int i = 0; i < batch; i++) {
- tokensManager.units[i].Push(ret[i]);
- }
-            std::vector <float> fret;
-            std::vector <float> results;
-            int endingCount = 0;
-            std::vector <std::string> curStrings;
- for (int i = 0; i < batch; i++) {
- fret.push_back(ret[i]);
- if (ret[i] == eos_token_id) {
- isEnding[i] = true;
- }
- if (isEnding[i]) {
- curStrings.push_back("");
- endingCount++;
- continue;
- }
- results.push_back(ret[i]);
- std::string curString = weight.tokenizer.Decode(
- Data(DataType::FLOAT32, {(int) results.size()}, results)).c_str();
- outputs[i] += curString;
- curStrings.push_back(curString);
- results.clear();
- }
-
- if (endingCount == batch) {
- break;
- }
- if (retCb)
-#ifdef PY_API
- {
- if (generationConfig.enable_hash_id) {
-                    std::vector <pybind11::bytes> rtnStrings;
-                    for (size_t i = 0; i < batch; i++) {
-                        std::stringstream ss;
-                        ss << curStrings[i] << "hash_id:" << hash_ids[i];
-                        rtnStrings.push_back(pybind11::bytes(ss.str()));
-                    }
-                    retCb(index, rtnStrings);
-                } else {
-                    std::vector <pybind11::bytes> rtnStrings;
-                    for (size_t i = 0; i < batch; i++) {
-                        rtnStrings.push_back(pybind11::bytes(curStrings[i]));
-                    }
-                    retCb(index, rtnStrings);
-                }
-            }
-#else
-            retCb(index, curStrings);
-#endif
-            index++;
-
-            maxLen++;
-            std::vector <float> pids = std::vector <float> (batch);
-            std::vector <float> vmasks = std::vector <float> (batch * maxLen, 0.0f);
- for (int i = 0; i < batch; i++) {
- pids[i] = lens[i];
- lens[i]++;
- for (int j = 0; j < maxLen - lens[i]; j++) {
- vmasks[i * maxLen + j] = 1.0f;
- }
- }
- positionIds.ToDevice(DataDevice::CPU);
- attentionMask.ToDevice(DataDevice::CPU);
- attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, 1, maxLen}, vmasks));
- inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, fret));
- positionIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, pids));
- if (index == generationConfig.output_token_limit) {
- break;
- }
-
- //printf("spend %f s.\n", GetSpan(st, std::chrono::system_clock::now()));
- }
- if (retCb)
-#ifdef PY_API
- {
- if (generationConfig.enable_hash_id) {
-                std::vector <pybind11::bytes> rtnStrings;
-                for (size_t i = 0; i < batch; i++) {
-                    std::stringstream ss;
-                    ss << outputs[i] << "hash_id:" << hash_ids[i];
-                    rtnStrings.push_back(pybind11::bytes(ss.str()));
-                }
-                retCb(-1, rtnStrings);
-            } else {
-                std::vector <pybind11::bytes> rtnStrings;
-                for (size_t i = 0; i < batch; i++) {
-                    rtnStrings.push_back(pybind11::bytes(outputs[i]));
-                }
-                retCb(-1, rtnStrings);
-            }
-        }
-#else
-        retCb(-1, outputs);
-#endif
-    }
-
-    void MiniCpmModel::WarmUp() {
-        printf("Warmup...\n");
-        Data inputIds = Data(DataType::FLOAT32, {1, 1}, {1});
-        Data attentionMask = Data(DataType::FLOAT32, {1, 1}, {0});
-        Data positionIds = Data(DataType::FLOAT32, {1, 1}, {0});
-
-        std::vector <std::pair <Data, Data> > pastKeyValues;
- for (int i = 0; i < block_cnt; i++) {
- pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
- Data(DataType::FLOAT32)));
- }
- Forward(inputIds, attentionMask, positionIds, pastKeyValues);
- printf("finish.\n");
- }
-
-    int MiniCpmModel::LaunchResponseTokens(const std::vector <int> &inputTokens,
- const GenerationConfig &generationConfig) {
- mainLoopLocker.lock();
-
- if (mainLoop == nullptr) {
- if (mainLoop == nullptr) {
- mainLoop = new std::thread([](MiniCpmModel *model) {
- while (true) {
-                    std::vector <Data*> attentionMasks;
-                    std::vector <Data*> positionIds;
-                    std::vector <std::pair <Data*, Data*> > pastKeyValues;
-                    std::vector <float> ids;
-                    std::vector <int> seqLens;
-                    std::vector <GenerationConfig> generationConfigs;
-                    LastTokensManager tokensManager;
-                    std::vector <std::vector <float>*> logits;
- model->dictLocker.lock();
- for (auto &it: model->responseContextDict.dicts) {
- if (it.second->isEnding) {
- continue;
- }
- generationConfigs.push_back(it.second->generationConfig);
- if (it.second->generationConfig.output_logits) {
-                            it.second->resultLogits.push(new std::vector <float> ());
- logits.push_back(it.second->resultLogits.back());
- } else {
- logits.push_back(nullptr);
- }
- tokensManager.units.push_back(it.second->tokens);
- if (it.second->preTokens == 0) {
- int seqLen = it.second->currentTokens.size();
- for (int i = 0; i < it.second->currentTokens.size(); i++) {
- ids.push_back(it.second->currentTokens[i]);
- }
-
- seqLens.push_back(seqLen);
-
-                            std::vector <float> vmask = std::vector <float> (seqLen * seqLen, 0);
-                            std::vector <float> vpids = std::vector <float> (seqLen, 0);
- for (int i = 0; i < seqLen; i++) {
- vpids[i] = i;
- for (int j = i + 1; j < seqLen; j++) {
- vmask[i * seqLen + j] = 1;
- }
- }
- it.second->intParams["len"] = seqLen;
-
- attentionMasks.push_back(new Data(DataType::FLOAT32, {seqLen, seqLen}, vmask));
- positionIds.push_back(new Data(DataType::FLOAT32, {2, seqLen}, vpids));
- } else {
- int ret = it.second->currentTokens[0];
- seqLens.push_back(1);
- ids.push_back(ret);
- attentionMasks.push_back(nullptr);
- positionIds.push_back(new Data(DataType::FLOAT32, {1, 1}, {(float)it.second->intParams["len"]}));
- it.second->intParams["len"]++;
- }
-
- it.second->preTokens += seqLens.back();
- for (int i = 0; i < model->block_cnt; i++) {
- pastKeyValues.push_back(std::make_pair(&it.second->pastKeyValues[i].first,
- &it.second->pastKeyValues[i].second));
- }
- }
-
- if (seqLens.size() > 0) {
- model->dictLocker.unlock();
-#ifdef USE_CUDA
- FastllmCudaClearBigBuffer();
-#endif
- Data inputIds = Data(DataType::FLOAT32, {1, (int) ids.size()}, ids);
-                        std::vector <int> ret;
- ret = model->ForwardBatch(seqLens.size(), inputIds, attentionMasks,
- positionIds, seqLens, pastKeyValues, generationConfigs,
- tokensManager, &logits);
- model->dictLocker.lock();
- int idx = 0;
- for (auto &it: model->responseContextDict.dicts) {
- if (it.second->isEnding) {
- continue;
- }
- int curRet = ret[idx++];
- if (curRet == model->eos_token_id) {
- it.second->isEnding = true;
- } else {
- auto itStopTk = it.second->generationConfig.stop_token_ids.find(curRet);
- if (itStopTk != it.second->generationConfig.stop_token_ids.end()) {
- it.second->isEnding = true;
- }
- }
- if (it.second->isEnding == false) {
-                            it.second->currentTokens = std::vector <int> {curRet};
- it.second->resultTokenQueue.push(curRet);
- it.second->tokens.Push(curRet);
- it.second->curTokens++;
- if (it.second->curTokens == it.second->generationConfig.output_token_limit) {
- it.second->isEnding = true;
- }
- }
- }
- }
-
- for (int i = 0; i < attentionMasks.size(); i++) {
- delete attentionMasks[i];
- }
- for (int i = 0; i < positionIds.size(); i++) {
- delete positionIds[i];
- }
-
- model->dictLocker.unlock();
- MySleep(0);
- }
- }, this);
- }
- }
- mainLoopLocker.unlock();
-
- dictLocker.lock();
- int handleId = responseContextDict.CreateHandle();
- ResponseContext *context = responseContextDict.GetHandle(handleId);
- context->Init(this->block_cnt);
-
- context->currentTokens = inputTokens;
- context->currentTokens.insert(context->currentTokens.begin(), this->bos_token_id);
- context->generationConfig = generationConfig;
- context->tokens = LastTokensUnit(generationConfig.last_n);
- dictLocker.unlock();
- return handleId;
- }
-
- int MiniCpmModel::FetchResponseTokens(int handleId) {
- dictLocker.lock();
- ResponseContext *context = responseContextDict.GetHandle(handleId);
- if (context == nullptr) {
- dictLocker.unlock();
- return -1;
- } else {
- while (true) {
- if (context->resultTokenQueue.size() > 0) {
- int ret = context->resultTokenQueue.front();
- context->resultTokenQueue.pop();
- dictLocker.unlock();
- return ret;
- } else {
- if (context->isEnding) {
- responseContextDict.RemoveHandle(handleId);
- dictLocker.unlock();
- return -1;
- }
- }
- dictLocker.unlock();
- MySleep(0);
- dictLocker.lock();
- }
- }
- }
}
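The recurring change across the three forward functions above replaces the two-step `Mul(x, this->attention_scale, x); AddTo(hiddenStates, x);` with a single `AddTo(hiddenStates, x, this->attention_scale)`. Assuming the three-argument `AddTo` applies its scalar to the source operand, as the call sites imply, this folds MiniCPM's depth-scaled residual connection into one axpy-style pass: one kernel launch instead of two, and no extra write-back of the scaled tensor. A minimal sketch of the equivalence:

```cpp
#include <cstdio>
#include <vector>

// Assumed semantics, inferred from the call sites in minicpm.cpp:
//   AddTo(dst, src)        => dst[i] += src[i]
//   AddTo(dst, src, alpha) => dst[i] += alpha * src[i]   (fused, one pass)
// The replaced pattern Mul(src, alpha, src); AddTo(dst, src); computes the
// same values but rewrites src and touches memory twice.
static void AddTo(std::vector<float> &dst, const std::vector<float> &src,
                  float alpha = 1.0f) {
    for (size_t i = 0; i < dst.size(); i++) dst[i] += alpha * src[i];
}

int main() {
    std::vector<float> hidden = {1.0f, 2.0f};
    std::vector<float> attnOut = {10.0f, 20.0f};
    float attention_scale = 0.1f;  // stand-in for MiniCPM's depth scale
    AddTo(hidden, attnOut, attention_scale);
    std::printf("%.1f %.1f\n", hidden[0], hidden[1]);  // prints: 2.0 4.0
    return 0;
}
```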
diff --git a/tools/scripts/minicpm2flm.py b/tools/scripts/minicpm2flm.py
index 095ea51..e434355 100644
--- a/tools/scripts/minicpm2flm.py
+++ b/tools/scripts/minicpm2flm.py
@@ -5,7 +5,7 @@
if __name__ == "__main__":
modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else "openbmb/MiniCPM-2B-dpo-fp16"
- tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, use_fast=False, trust_remote_code=True)
# `torch_dtype=torch.float16` is set by default, if it will not cause an OOM Error, you can load model in float32.
model = AutoModelForCausalLM.from_pretrained(modelNameOrPath, trust_remote_code=True, torch_dtype=torch.float16)
model = model.eval()
@@ -14,6 +14,6 @@
dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16"
exportPath = sys.argv[1] if len(sys.argv) >= 2 else "minicpm-2b-" + dtype + ".flm"
-    torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "",
+    torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "<s>",
                      user_role = "<用户>", bot_role = "<AI>",
                      history_sep = "", dtype = dtype)
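On the tokenizer change: `use_fast=False` pins the export to the slow, Python-implemented tokenizer that ships with the model repo (the class pulled in by `trust_remote_code=True`), presumably to keep the token mapping written into the `.flm` consistent with upstream rather than a converted fast tokenizer. Judging from the `sys.argv` handling above, a typical invocation would be `python3 tools/scripts/minicpm2flm.py minicpm-2b-int4.flm int4 openbmb/MiniCPM-2B-dpo-fp16` (example arguments; the third one is optional and defaults to the MiniCPM-2B-dpo-fp16 checkpoint).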