diff --git a/README.md b/README.md
index c67f4fb..f575cad 100644
--- a/README.md
+++ b/README.md
@@ -370,33 +370,38 @@ python3 tools/moss_export.py moss-int4.flm int4 #导出int4模型
 
 一些模型的转换可以[参考这里的例子](docs/llama_cookbook.md)
 
 #### QWEN模型导出
+* **Qwen**
 ```sh
 # 需要先安装QWen环境
 # 如果使用自己finetune的模型需要修改qwen2flm.py文件中创建tokenizer, model的代码
 # 根据所需的精度,导出相应的模型
+cd build
 python3 tools/qwen2flm.py qwen-7b-fp16.flm float16 #导出float16模型
 python3 tools/qwen2flm.py qwen-7b-int8.flm int8 #导出int8模型
 python3 tools/qwen2flm.py qwen-7b-int4.flm int4 #导出int4模型
 ```
 
-#### MINICPM模型导出
-```sh
-cd build
-python tools/minicpm2flm.py #导出float16模型
-./main -p minicpm-2b-float16.flm # 执行模型
-```
-
 * **Qwen1.5**
 ```sh
 # 需要先安装QWen2环境(transformers >= 4.37.0)
 # 根据所需的精度,导出相应的模型
+cd build
 python3 tools/llamalike2flm.py qwen1.5-7b-fp16.flm float16 "qwen/Qwen1.5-4B-Chat" #导出Qwen1.5-4B-Chat float16模型
 python3 tools/llamalike2flm.py qwen1.5-7b-int8.flm int8 "qwen/Qwen1.5-7B-Chat" #导出Qwen1.5-7B-Chat int8模型
 python3 tools/llamalike2flm.py qwen1.5-7b-int4.flm int4 "qwen/Qwen1.5-14B-Chat" #导出Qwen1.5-14B-Chat int4模型
 # 最后一个参数可替换为模型路径
 ```
 
+#### MINICPM模型导出
+```sh
+# 需要先安装MiniCPM环境(transformers >= 4.36.0)
+# 默认脚本导出MiniCPM-2B-dpo-fp16模型
+cd build
+python tools/minicpm2flm.py minicpm-2b-float16.flm #导出dpo-float16模型
+./main -p minicpm-2b-float16.flm # 执行模型
+```
+
 ## 开发计划
 
 也就是俗称的画饼部分,大家如果有需要的功能可以在讨论区提出
diff --git a/example/Win32Demo/fastllm.vcxproj b/example/Win32Demo/fastllm.vcxproj
index e5e33c3..0e57dfd 100644
--- a/example/Win32Demo/fastllm.vcxproj
+++ b/example/Win32Demo/fastllm.vcxproj
@@ -163,7 +163,6 @@
       /arch:AVX /source-charset:utf-8 %(AdditionalOptions)
-      cudart.lib;cublas.lib;%(AdditionalDependencies)
       Windows
       true
       true
@@ -181,7 +180,7 @@
-
+
diff --git a/include/models/basellm.h b/include/models/basellm.h
index 156a1d6..6130254 100644
--- a/include/models/basellm.h
+++ b/include/models/basellm.h
@@ -1,4 +1,7 @@
-#pragma once
+
+#ifndef FASTLLM_BASELLM_H
+#define FASTLLM_BASELLM_H
+
 #include "fastllm.h"
 #include 
@@ -50,9 +53,9 @@ namespace fastllm {
            this->weight.ReleaseWeight();
        };
 
-        virtual void LoadFromFile(const std::string &fileName); // 从文件读取
+        virtual void LoadFromFile(const std::string &fileName); // 从文件读取
 
-        virtual void InitParams(); // 初始化参数信息
+        virtual void InitParams(); // 初始化参数信息
 
        // 推理
        virtual int Forward(
@@ -85,12 +88,12 @@ namespace fastllm {
                            const LastTokensManager &lastTokens = LastTokensManager(),
                            std::vector *> *logits = nullptr);
 
-        // 根据输入的tokens生成LLM推理的输入
+        // 根据输入的tokens生成LLM推理的输入
        virtual void FillLLMInputs(std::vector > &inputTokens,
                                   const std::map &params,
                                   Data &inputIds, Data &attentionMask, Data &positionIds);
 
-        // 根据输入的tokens生成LLM推理的输入
+        // 根据输入的tokens生成LLM推理的输入
        virtual void FillLLMInputsBatch(std::vector > &inputTokens,
                                        const std::vector > &params,
                                        Data &inputIds, Data &attentionMask, Data &positionIds);
@@ -102,16 +105,16 @@ namespace fastllm {
        virtual void ResponseBatch(const std::vector &inputs,
                                   std::vector &outputs,
                                   RuntimeResultBatch retCb = nullptr,
-                                   const GenerationConfig &generationConfig = GenerationConfig()); // 批量根据给出的内容回复
+                                   const GenerationConfig &generationConfig = GenerationConfig()); // 批量根据给出的内容回复
 
        virtual int LaunchResponseTokens(const std::vector &inputTokens,
                                         const GenerationConfig &generationConfig = GenerationConfig()); // 启动一个response任务,返回分配的handleId
 
-        virtual int FetchResponseTokens(int handleId); // 获取指定handle的输出, -1代表输出结束了
+        virtual int FetchResponseTokens(int handleId); // 获取指定handle的输出, -1代表输出结束了
 
        virtual int FetchResponseLogits(int handleId, std::vector &logits); // 获取指定handle的输出Logits
 
-        virtual void
SaveLowBitModel(const std::string &fileName, int bit); // 存储成量化模型 + virtual void SaveLowBitModel(const std::string &fileName, int bit); // 存储成量化模型 virtual void SaveModel(const std::string &fileName); // 直接导出 @@ -158,3 +161,5 @@ namespace fastllm { int tokensLimit = -1; }; } + +#endif //FASTLLM_BASELLM_H \ No newline at end of file diff --git a/include/models/minicpm.h b/include/models/minicpm.h index b1849b6..6585c6a 100644 --- a/include/models/minicpm.h +++ b/include/models/minicpm.h @@ -6,12 +6,13 @@ #define FASTLLM_MINICPM_H #include "basellm.h" +#include "llama.h" #include "cmath" #include namespace fastllm { - class MiniCpmModel: public basellm { + class MiniCpmModel: public LlamaModel { public: MiniCpmModel(); // 构造函数 @@ -48,26 +49,6 @@ namespace fastllm { const LastTokensManager &lastTokens = LastTokensManager(), std::vector *> *logits = nullptr); - virtual std::string Response(const std::string& input, - RuntimeResult retCb, - const GenerationConfig &generationConfig = GenerationConfig()); // 根据给出的内容回复 - - virtual void ResponseBatch(const std::vector &inputs, - std::vector &outputs, - RuntimeResultBatch retCb, - const GenerationConfig &generationConfig = GenerationConfig()); - - virtual int LaunchResponseTokens(const std::vector &inputTokens, - const GenerationConfig &generationConfig = GenerationConfig()); // 启动一个response任务,返回分配的handleId - - virtual int FetchResponseTokens(int handelId); // 获取指定handle的输出, -1代表输出结束了 - - virtual void WarmUp(); // 预热 - - virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // 根据历史信息和当前输入生成prompt - - virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // 根据当前回复更新history - private: float embed_scale = 1.f; diff --git a/src/models/minicpm.cpp b/src/models/minicpm.cpp index 1ee4aa3..ff5a9ae 100644 --- a/src/models/minicpm.cpp +++ b/src/models/minicpm.cpp @@ -18,32 +18,6 @@ namespace fastllm { - std::vector GetInterLeavePowerOf3(int n) { - float start = powf(2, -powf(2, -(log2f(n) - 3))); - float ratio = start; - std::vector ret; - for (int i = 0; i < n; i++) { - ret.push_back(start * powf(ratio, i)); - } - return ret; - } - std::vector GetInterleave2(int n) { - int base = 1; - while (base < n) { - base <<= 1; - } - if (base == n) { - return GetInterLeavePowerOf3(n); - } else { - std::vector ret = GetInterLeavePowerOf3(base / 2); - std::vector part2 = GetInterLeavePowerOf3(base); - for (int i = 0; i < n - base / 2; i++) { - ret.push_back(part2[i * 2]); - } - return ret; - } - } - MiniCpmModel::MiniCpmModel() { this->model_type = "minicpm"; @@ -100,11 +74,6 @@ namespace fastllm { const fastllm::Data &positionIds, std::vector> &pastKeyValues, const GenerationConfig &generationConfig, const LastTokensManager &lastTokens, std::vector *retLogits) { - Data alibiData; - if (this->weight.dicts["use_alibi"] == "1") { - std::vector alibi = GetInterleave2(num_attention_heads); - alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi)); - } int maxLen = inputIds.dims[1]; Data hiddenStates; @@ -145,11 +114,18 @@ namespace fastllm { k.Reshape(qkvSize); v.Reshape(qkvSize); - if (alibiData.dims.size() == 0) { - fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim); - fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim); + Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second; + if (GetKVCacheInCPU()) { + pastKey.lockInCPU = true; + pastValue.lockInCPU = true; + } else { + 
pastKey.ToDevice(DataDevice::CUDA); + pastValue.ToDevice(DataDevice::CUDA); } + fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim); + fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim); + qkvSize = {bsz * seqlen, num_attention_heads, -1}; q.Reshape(qkvSize); k.Reshape(qkvSize); @@ -159,15 +135,6 @@ namespace fastllm { PermuteSelf(k, {1, 0, 2}); PermuteSelf(v, {1, 0, 2}); - Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second; - if (GetKVCacheInCPU()) { - pastKey.lockInCPU = true; - pastValue.lockInCPU = true; - } else { - pastKey.ToDevice(DataDevice::CUDA); - pastValue.ToDevice(DataDevice::CUDA); - } - int unitLen = 64; #ifdef USE_CUDA unitLen = 128; @@ -201,9 +168,7 @@ namespace fastllm { // 1.2.0 q * k^T MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim)); attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]}); - if (alibiData.dims.size() != 0) { - AlibiMask(attenWeights, alibiData, -10000); - } else if (attentionMask.dims.size() != 0) { + if (attentionMask.dims.size() != 0) { AttentionMask(attenWeights, attentionMask, -10000); } @@ -215,8 +180,8 @@ namespace fastllm { attenOutput.Reshape({bsz, seqlen, -1}); Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); - Mul(attenLastOutput, this->attention_scale, attenLastOutput); - AddTo(hiddenStates, attenLastOutput); + // Mul(attenLastOutput, this->attention_scale, attenLastOutput); + AddTo(hiddenStates, attenLastOutput, this->attention_scale); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput); Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1); @@ -224,8 +189,8 @@ namespace fastllm { Silu(w1, w1); MulTo(w1, w3); Linear(w1, weight["model.layers." 
+ std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2); - Mul(w2, this->attention_scale, w2); - AddTo(hiddenStates, w2); + // Mul(w2, this->attention_scale, w2); + AddTo(hiddenStates, w2, this->attention_scale); } Data logits, topk; Data tempHiddenStates; @@ -265,11 +230,6 @@ namespace fastllm { const fastllm::Data &positionIds, std::vector> &pastKeyValues, const GenerationConfig &generationConfig, const LastTokensManager &lastTokens, std::vector *> *retLogits) { - Data alibiData; - if (this->weight.dicts["use_alibi"] == "1") { - std::vector alibi = GetInterleave2(num_attention_heads); - alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi)); - } int maxLen = inputIds.dims[1]; Data hiddenStates; @@ -311,11 +271,18 @@ namespace fastllm { k.Reshape(qkvSize); v.Reshape(qkvSize); - if (alibiData.dims.size() == 0) { - fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim); - fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim); + Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second; + if (GetKVCacheInCPU()) { + pastKey.lockInCPU = true; + pastValue.lockInCPU = true; + } else { + pastKey.ToDevice(DataDevice::CUDA); + pastValue.ToDevice(DataDevice::CUDA); } + fastllm::LlamaRotatePosition2D(q, positionIds, sinData, cosData, rotary_dim); + fastllm::LlamaRotatePosition2D(k, positionIds, sinData, cosData, rotary_dim); + PermuteSelf(q, {0, 2, 1, 3}); PermuteSelf(k, {0, 2, 1, 3}); PermuteSelf(v, {0, 2, 1, 3}); @@ -325,15 +292,6 @@ namespace fastllm { k.Reshape(qkvSize); v.Reshape(qkvSize); - Data &pastKey = pastKeyValues[i].first, &pastValue = pastKeyValues[i].second; - if (GetKVCacheInCPU()) { - pastKey.lockInCPU = true; - pastValue.lockInCPU = true; - } else { - pastKey.ToDevice(DataDevice::CUDA); - pastValue.ToDevice(DataDevice::CUDA); - } - int unitLen = 64; #ifdef USE_CUDA unitLen = 128; @@ -368,11 +326,7 @@ namespace fastllm { // 1.2.0 q * k^T MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim)); attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]}); - if (alibiData.dims.size() != 0) { - attenWeights.Reshape({-1, num_attention_heads, attenWeights.dims[2], attenWeights.dims[3]}); - AlibiMask(attenWeights, alibiData, -10000); - attenWeights.Reshape({1, -1, attenWeights.dims[2], attenWeights.dims[3]}); - } else if (attentionMask.dims.size() != 0) { + if (attentionMask.dims.size() != 0) { AttentionMask(attenWeights, attentionMask, -10000); } Softmax(attenWeights, attenWeights, -1); @@ -384,8 +338,8 @@ namespace fastllm { PermuteSelf(attenOutput, {1, 0, 2}); Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); - Mul(attenLastOutput, this->attention_scale, attenLastOutput); - AddTo(hiddenStates, attenLastOutput); + // Mul(attenLastOutput, this->attention_scale, attenLastOutput); + AddTo(hiddenStates, attenLastOutput, this->attention_scale); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput); Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1); @@ -393,8 +347,8 @@ namespace fastllm { Silu(w1, w1); MulTo(w1, w3); Linear(w1, weight["model.layers." 
+ std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2); - Mul(w2, this->attention_scale, w2); - AddTo(hiddenStates, w2); + // Mul(w2, this->attention_scale, w2); + AddTo(hiddenStates, w2, this->attention_scale); } Data logits, topk; @@ -440,11 +394,6 @@ namespace fastllm { const std::vector &generationConfigs, const LastTokensManager &lastTokens, std::vector *> *retLogits) { - Data alibiData; - if (this->weight.dicts["use_alibi"] == "1") { - std::vector alibi = GetInterleave2(num_attention_heads); - alibiData.CopyFrom(Data(DataType::FLOAT32, {(int) alibi.size()}, alibi)); - } Data hiddenStates; Data attenInput; Data q, k, v, qkv; @@ -500,11 +449,18 @@ namespace fastllm { k.Reshape(qkvSize); v.Reshape(qkvSize); - if (alibiData.dims.size() == 0) { - fastllm::LlamaRotatePosition2D(q, *positionIds[b], sinData, cosData, rotary_dim); - fastllm::LlamaRotatePosition2D(k, *positionIds[b], sinData, cosData, rotary_dim); + Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt + i].second; + if (GetKVCacheInCPU()) { + pastKey.lockInCPU = true; + pastValue.lockInCPU = true; + } else { + pastKey.ToDevice(DataDevice::CUDA); + pastValue.ToDevice(DataDevice::CUDA); } + fastllm::LlamaRotatePosition2D(q, *positionIds[b], sinData, cosData, rotary_dim); + fastllm::LlamaRotatePosition2D(k, *positionIds[b], sinData, cosData, rotary_dim); + PermuteSelf(q, {0, 2, 1, 3}); PermuteSelf(k, {0, 2, 1, 3}); PermuteSelf(v, {0, 2, 1, 3}); @@ -513,15 +469,6 @@ namespace fastllm { q.Reshape(qkvSize); k.Reshape(qkvSize); v.Reshape(qkvSize); - - Data &pastKey = *pastKeyValues[b * block_cnt + i].first, &pastValue = *pastKeyValues[b * block_cnt + i].second; - if (GetKVCacheInCPU()) { - pastKey.lockInCPU = true; - pastValue.lockInCPU = true; - } else { - pastKey.ToDevice(DataDevice::CUDA); - pastValue.ToDevice(DataDevice::CUDA); - } int unitLen = 64; #ifdef USE_CUDA @@ -559,9 +506,7 @@ namespace fastllm { // 1.2.0 q * k^T MatMulTransB(q, pastKey, attenWeights, 1.0 / sqrt(head_dim)); attenWeights.Reshape({1, attenWeights.dims[0], attenWeights.dims[1], attenWeights.dims[2]}); - if (alibiData.dims.size() != 0) { - AlibiMask(attenWeights, alibiData, -10000); - } else if (attentionMask[b] != nullptr) { + if (attentionMask[b] != nullptr) { AttentionMask(attenWeights, *attentionMask[b], -10000); } @@ -580,8 +525,8 @@ namespace fastllm { } Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); - Mul(attenLastOutput, this->attention_scale, attenLastOutput); - AddTo(hiddenStates, attenLastOutput); + // Mul(attenLastOutput, this->attention_scale, attenLastOutput); + AddTo(hiddenStates, attenLastOutput, this->attention_scale); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput); Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1); @@ -589,8 +534,8 @@ namespace fastllm { Silu(w1, w1); MulTo(w1, w3); Linear(w1, weight["model.layers." 
+ std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2); - Mul(w2, this->attention_scale, w2); - AddTo(hiddenStates, w2); + // Mul(w2, this->attention_scale, w2); + AddTo(hiddenStates, w2, this->attention_scale); } Data logits, curLogit; @@ -619,477 +564,4 @@ namespace fastllm { return lastRet; } - std::string MiniCpmModel::Response(const std::string& input, RuntimeResult retCb, - const GenerationConfig &generationConfig) { -#ifdef USE_CUDA - FastllmCudaClearBigBuffer(); -#endif -//auto st = std::chrono::system_clock::now(); -#ifdef PY_API - size_t pos = input.rfind("time_stamp:"); - std::string prompt = (generationConfig.enable_hash_id && pos != -1)? input.substr(0, pos):input; - size_t hash_id = std::hash{}(input); - Data inputIds = this->weight.tokenizer.Encode(prompt); -#else - Data inputIds = this->weight.tokenizer.Encode(input); -#endif - std::vector ids; - for (int i = 0; i < inputIds.Count(0); i++) { - ids.push_back(((float*)inputIds.cpuData)[i]); - } - int seqLen = ids.size(); - inputIds.CopyFrom(Data(DataType::FLOAT32, {1, seqLen}, ids)); - - std::vector vmask = std::vector (seqLen * seqLen, 0); - std::vector vpids = std::vector (seqLen, 0); - for (int i = 0; i < seqLen; i++) { - vpids[i] = i; - for (int j = i + 1; j < seqLen; j++) { - vmask[i * seqLen + j] = 1; - } - } - - Data attentionMask = Data(DataType::FLOAT32, {seqLen, seqLen}, vmask); - Data positionIds = Data(DataType::FLOAT32, {1, seqLen}, vpids); - - std::vector > pastKeyValues; - for (int i = 0; i < block_cnt; i++) { - pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32), - Data(DataType::FLOAT32))); - } - - std::string retString = ""; - int len = seqLen; - std::vector results; - int index = 0; - - LastTokensManager tokens (1, generationConfig.last_n); - while (true) { - auto st = std::chrono::system_clock::now(); - - int ret = Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens); - tokens.units[0].Push(ret); - if (ret == eos_token_id) { - break; - } - - results.push_back(ret); - std::string curString = weight.tokenizer.Decode(Data(DataType::FLOAT32, {(int)results.size()}, results)).c_str(); - retString += curString; - if (retCb) -#ifdef PY_API - { - if (generationConfig.enable_hash_id) { - std::stringstream ss; - ss << retString << "hash_id:" << hash_id; - retCb(index, pybind11::bytes(ss.str())); - } else { - retCb(index, pybind11::bytes(retString)); - } - } -#else - retCb(index, curString.c_str()); -#endif - index++; - - if (index == generationConfig.output_token_limit) { - break; - } - results.clear(); - - attentionMask.ToDevice(DataDevice::CPU); - positionIds.ToDevice(DataDevice::CPU); - inputIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float)ret})); - attentionMask = Data(); - positionIds.CopyFrom(Data(DataType::FLOAT32, {1, 1}, {(float)len})); - //if (do_sample) { - // tokenPenaltyManager.InsertToken(ret); - //} - len++; - if (index == generationConfig.output_token_limit) { - break; - } - - //printf("spend %f s.\n", GetSpan(st, std::chrono::system_clock::now())); - } - if (retCb) -#ifdef PY_API - { - if (generationConfig.enable_hash_id) { - std::stringstream ss; - ss << retString << "hash_id:" << hash_id; - retCb(-1, pybind11::bytes(ss.str())); - } else { - retCb(-1, pybind11::bytes(retString)); - } - } -#else - retCb(-1, retString.c_str()); -#endif - - return retString; - } - - void MiniCpmModel::ResponseBatch(const std::vector &inputs, std::vector &outputs, - RuntimeResultBatch retCb, - const GenerationConfig &generationConfig) { -#ifdef USE_CUDA - 
FastllmCudaClearBigBuffer(); -#endif -#ifdef PY_API - std::vector prompts; - std::vector < size_t > hash_ids; - for (auto _input: inputs){ - size_t hash_id = std::hash{}(_input); - hash_ids.push_back(hash_id); - - size_t pos = _input.rfind("time_stamp:"); - std::string prompt = (generationConfig.enable_hash_id && pos != -1) ? _input.substr(0, pos) : _input; - prompts.push_back(prompt); - } -#else - std::vector prompts = inputs; -#endif - int batch = prompts.size(); - outputs.clear(); - outputs.resize(batch, ""); - - std::vector inputTokens; - std::vector seqLens; - inputTokens.resize(batch); - seqLens.resize(batch); - int maxLen = 0; - for (int i = 0; i < batch; i++) { - inputTokens[i].CopyFrom(this->weight.tokenizer.Encode(prompts[i])); - maxLen = std::max(maxLen, (int)inputTokens[i].Count(0)); - seqLens[i] = (int)inputTokens[i].Count(0); - } - - std::vector ids = std::vector (batch * maxLen, 0); - std::vector vpids = std::vector (batch * maxLen, 0); - std::vector vmask = std::vector (batch * maxLen * maxLen, 0); - for (int i = 0; i < batch; i++) { - Data &tokens = inputTokens[i]; - int len = tokens.Count(0), base = maxLen - len; - for (int j = 0; j < len; j++) { - ids[i * maxLen + base + j] = ((float*)tokens.cpuData)[j]; - } - for (int j = 0; j < len; j++) { - vpids[i * maxLen + base + j] = j; - } - - std::fill(vmask.data() + i * maxLen * maxLen, - vmask.data() + i * maxLen * maxLen + (maxLen - len) * maxLen, 1.0); - for (int j = maxLen - len; j < maxLen; j++) { - std::fill(vmask.data() + i * maxLen * maxLen + j * maxLen, - vmask.data() + i * maxLen * maxLen + j * maxLen + maxLen - len, 1.0); - } - for (int j = 0; j < len; j++) { - for (int k = j + 1; k < len; k++) { - vmask[i * maxLen * maxLen + (base + j) * maxLen + base + k] = 1; - } - } - } - - Data inputIds = Data(DataType::FLOAT32, {batch, maxLen}, ids); - Data attentionMask = Data(DataType::FLOAT32, {batch, maxLen, maxLen}, vmask); - Data positionIds = Data(DataType::FLOAT32, {batch, maxLen}, vpids); - - std::vector > pastKeyValues; - for (int i = 0; i < block_cnt; i++) { - pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32), - Data(DataType::FLOAT32))); - } - - std::string retString = ""; - std::vector lens = seqLens; - std::vector isEnding = std::vector (batch, false); - std::vector results; - int index = 0; - - LastTokensManager tokensManager (batch, generationConfig.last_n); - while (true) { - auto st = std::chrono::system_clock::now(); - std::vector ret = ForwardBatch(batch, inputIds, attentionMask, positionIds, pastKeyValues, - generationConfig, tokensManager); - for (int i = 0; i < batch; i++) { - tokensManager.units[i].Push(ret[i]); - } - std::vector fret; - std::vector results; - int endingCount = 0; - std::vector curStrings; - for (int i = 0; i < batch; i++) { - fret.push_back(ret[i]); - if (ret[i] == eos_token_id) { - isEnding[i] = true; - } - if (isEnding[i]) { - curStrings.push_back(""); - endingCount++; - continue; - } - results.push_back(ret[i]); - std::string curString = weight.tokenizer.Decode( - Data(DataType::FLOAT32, {(int) results.size()}, results)).c_str(); - outputs[i] += curString; - curStrings.push_back(curString); - results.clear(); - } - - if (endingCount == batch) { - break; - } - if (retCb) -#ifdef PY_API - { - if (generationConfig.enable_hash_id) { - std::vector rtnStrings; - for (size_t i=0; i rtnStrings; - for (size_t i=0; i pids = std::vector (batch); - std::vector vmasks = std::vector (batch * maxLen, 0.0f); - for (int i = 0; i < batch; i++) { - pids[i] = lens[i]; - lens[i]++; - for 
(int j = 0; j < maxLen - lens[i]; j++) { - vmasks[i * maxLen + j] = 1.0f; - } - } - positionIds.ToDevice(DataDevice::CPU); - attentionMask.ToDevice(DataDevice::CPU); - attentionMask.CopyFrom(Data(DataType::FLOAT32, {batch, 1, maxLen}, vmasks)); - inputIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, fret)); - positionIds.CopyFrom(Data(DataType::FLOAT32, {batch, 1}, pids)); - if (index == generationConfig.output_token_limit) { - break; - } - - //printf("spend %f s.\n", GetSpan(st, std::chrono::system_clock::now())); - } - if (retCb) -#ifdef PY_API - { - if (generationConfig.enable_hash_id) { - std::vector rtnStrings; - for (size_t i=0; i rtnStrings; - for (size_t i=0; i > pastKeyValues; - for (int i = 0; i < block_cnt; i++) { - pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32), - Data(DataType::FLOAT32))); - } - Forward(inputIds, attentionMask, positionIds, pastKeyValues); - printf("finish.\n"); - } - - int MiniCpmModel::LaunchResponseTokens(const std::vector &inputTokens, - const GenerationConfig &generationConfig) { - mainLoopLocker.lock(); - - if (mainLoop == nullptr) { - if (mainLoop == nullptr) { - mainLoop = new std::thread([](MiniCpmModel *model) { - while (true) { - std::vector attentionMasks; - std::vector positionIds; - std::vector > pastKeyValues; - std::vector ids; - std::vector seqLens; - std::vector generationConfigs; - LastTokensManager tokensManager; - std::vector * > logits; - model->dictLocker.lock(); - for (auto &it: model->responseContextDict.dicts) { - if (it.second->isEnding) { - continue; - } - generationConfigs.push_back(it.second->generationConfig); - if (it.second->generationConfig.output_logits) { - it.second->resultLogits.push(new std::vector ()); - logits.push_back(it.second->resultLogits.back()); - } else { - logits.push_back(nullptr); - } - tokensManager.units.push_back(it.second->tokens); - if (it.second->preTokens == 0) { - int seqLen = it.second->currentTokens.size(); - for (int i = 0; i < it.second->currentTokens.size(); i++) { - ids.push_back(it.second->currentTokens[i]); - } - - seqLens.push_back(seqLen); - - std::vector vmask = std::vector (seqLen * seqLen, 0); - std::vector vpids = std::vector (seqLen, 0); - for (int i = 0; i < seqLen; i++) { - vpids[i] = i; - for (int j = i + 1; j < seqLen; j++) { - vmask[i * seqLen + j] = 1; - } - } - it.second->intParams["len"] = seqLen; - - attentionMasks.push_back(new Data(DataType::FLOAT32, {seqLen, seqLen}, vmask)); - positionIds.push_back(new Data(DataType::FLOAT32, {2, seqLen}, vpids)); - } else { - int ret = it.second->currentTokens[0]; - seqLens.push_back(1); - ids.push_back(ret); - attentionMasks.push_back(nullptr); - positionIds.push_back(new Data(DataType::FLOAT32, {1, 1}, {(float)it.second->intParams["len"]})); - it.second->intParams["len"]++; - } - - it.second->preTokens += seqLens.back(); - for (int i = 0; i < model->block_cnt; i++) { - pastKeyValues.push_back(std::make_pair(&it.second->pastKeyValues[i].first, - &it.second->pastKeyValues[i].second)); - } - } - - if (seqLens.size() > 0) { - model->dictLocker.unlock(); -#ifdef USE_CUDA - FastllmCudaClearBigBuffer(); -#endif - Data inputIds = Data(DataType::FLOAT32, {1, (int) ids.size()}, ids); - std::vector ret; - ret = model->ForwardBatch(seqLens.size(), inputIds, attentionMasks, - positionIds, seqLens, pastKeyValues, generationConfigs, - tokensManager, &logits); - model->dictLocker.lock(); - int idx = 0; - for (auto &it: model->responseContextDict.dicts) { - if (it.second->isEnding) { - continue; - } - int curRet = ret[idx++]; - if (curRet 
== model->eos_token_id) { - it.second->isEnding = true; - } else { - auto itStopTk = it.second->generationConfig.stop_token_ids.find(curRet); - if (itStopTk != it.second->generationConfig.stop_token_ids.end()) { - it.second->isEnding = true; - } - } - if (it.second->isEnding == false) { - it.second->currentTokens = std::vector{curRet}; - it.second->resultTokenQueue.push(curRet); - it.second->tokens.Push(curRet); - it.second->curTokens++; - if (it.second->curTokens == it.second->generationConfig.output_token_limit) { - it.second->isEnding = true; - } - } - } - } - - for (int i = 0; i < attentionMasks.size(); i++) { - delete attentionMasks[i]; - } - for (int i = 0; i < positionIds.size(); i++) { - delete positionIds[i]; - } - - model->dictLocker.unlock(); - MySleep(0); - } - }, this); - } - } - mainLoopLocker.unlock(); - - dictLocker.lock(); - int handleId = responseContextDict.CreateHandle(); - ResponseContext *context = responseContextDict.GetHandle(handleId); - context->Init(this->block_cnt); - - context->currentTokens = inputTokens; - context->currentTokens.insert(context->currentTokens.begin(), this->bos_token_id); - context->generationConfig = generationConfig; - context->tokens = LastTokensUnit(generationConfig.last_n); - dictLocker.unlock(); - return handleId; - } - - int MiniCpmModel::FetchResponseTokens(int handleId) { - dictLocker.lock(); - ResponseContext *context = responseContextDict.GetHandle(handleId); - if (context == nullptr) { - dictLocker.unlock(); - return -1; - } else { - while (true) { - if (context->resultTokenQueue.size() > 0) { - int ret = context->resultTokenQueue.front(); - context->resultTokenQueue.pop(); - dictLocker.unlock(); - return ret; - } else { - if (context->isEnding) { - responseContextDict.RemoveHandle(handleId); - dictLocker.unlock(); - return -1; - } - } - dictLocker.unlock(); - MySleep(0); - dictLocker.lock(); - } - } - } } diff --git a/tools/scripts/minicpm2flm.py b/tools/scripts/minicpm2flm.py index 095ea51..e434355 100644 --- a/tools/scripts/minicpm2flm.py +++ b/tools/scripts/minicpm2flm.py @@ -5,7 +5,7 @@ if __name__ == "__main__": modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else "openbmb/MiniCPM-2B-dpo-fp16" - tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(modelNameOrPath, use_fast=False, trust_remote_code=True) # `torch_dtype=torch.float16` is set by default, if it will not cause an OOM Error, you can load model in float32. model = AutoModelForCausalLM.from_pretrained(modelNameOrPath, trust_remote_code=True, torch_dtype=torch.float16) model = model.eval() @@ -14,6 +14,6 @@ dtype = sys.argv[2] if len(sys.argv) >= 3 else "float16" exportPath = sys.argv[1] if len(sys.argv) >= 2 else "minicpm-2b-" + dtype + ".flm" - torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "", + torch2flm.tofile(exportPath, model, tokenizer, pre_prompt = "", user_role = "<用户>", bot_role = "", history_sep = "", dtype = dtype)
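A note on the recurring change in the three `Forward` variants above: the separate `Mul(x, this->attention_scale, x)` followed by `AddTo(hiddenStates, x)` is replaced by a single `AddTo(hiddenStates, x, this->attention_scale)`. The sketch below is a minimal standalone illustration of why the two forms are equivalent, assuming (as the commented-out `Mul` calls suggest) that `AddTo(dst, src, alpha)` accumulates `dst += alpha * src`; the vector helpers and the sample scale value are hypothetical stand-ins, not fastllm's actual `Data` API.

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical stand-ins mirroring the shape of fastllm's operators.
static void Mul(std::vector<float> &x, float v) {                       // x *= v
    for (float &e : x) e *= v;
}
static void AddTo(std::vector<float> &dst, const std::vector<float> &src,
                  float alpha = 1.0f) {                                 // dst += alpha * src
    for (std::size_t i = 0; i < dst.size(); i++) dst[i] += alpha * src[i];
}

int main() {
    const float attention_scale = 1.4f / std::sqrt(40.0f);  // illustrative value only
    const std::vector<float> hidden = {1.0f, 2.0f, 3.0f};
    const std::vector<float> branch = {0.5f, -1.0f, 2.0f};

    // Old form: scale the branch output in place, then accumulate into the residual.
    std::vector<float> a = hidden, tmp = branch;
    Mul(tmp, attention_scale);
    AddTo(a, tmp);

    // New form: one fused call, as in AddTo(hiddenStates, attenLastOutput, this->attention_scale).
    std::vector<float> b = hidden;
    AddTo(b, branch, attention_scale);

    for (std::size_t i = 0; i < a.size(); i++)
        assert(std::fabs(a[i] - b[i]) < 1e-6f);
    return 0;
}
```

Folding the scale into the accumulation removes one elementwise pass over (and one in-place rewrite of) the branch activation per residual connection, without changing the result.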