diff --git a/CMakeLists.txt b/CMakeLists.txt index 4eb7a03b..7f11ffed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,8 +33,8 @@ endif() message(STATUS "CMAKE_CXX_FLAGS" ${CMAKE_CXX_FLAGS}) set(FASTLLM_CXX_SOURCES src/fastllm.cpp src/device.cpp src/model.cpp src/executor.cpp - src/devices/cpu/cpudevice.cpp src/devices/cpu/cpudevicebatch.cpp src/models/minicpm.cpp - src/models/chatglm.cpp src/models/moss.cpp src/models/llama.cpp src/models/qwen.cpp src/models/basellm.cpp src/models/glm.cpp) + src/devices/cpu/cpudevice.cpp src/devices/cpu/cpudevicebatch.cpp + src/models/chatglm.cpp src/models/moss.cpp src/models/llama.cpp src/models/qwen.cpp src/models/basellm.cpp src/models/glm.cpp src/models/minicpm.cpp) include_directories(include) include_directories(include/utils) diff --git a/example/Android/LLMAssistant/app/src/main/cpp/CMakeLists.txt b/example/Android/LLMAssistant/app/src/main/cpp/CMakeLists.txt index fa65ccb0..52168407 100644 --- a/example/Android/LLMAssistant/app/src/main/cpp/CMakeLists.txt +++ b/example/Android/LLMAssistant/app/src/main/cpp/CMakeLists.txt @@ -32,11 +32,13 @@ set(PROJECT_SOURCE ../../../../../../../src/executor.cpp ../../../../../../../src/devices/cpu/cpudevice.cpp ../../../../../../../src/devices/cpu/cpudevicebatch.cpp + ../../../../../../../src/models/basellm.cpp ../../../../../../../src/models/chatglm.cpp ../../../../../../../src/models/moss.cpp ../../../../../../../src/models/llama.cpp - ../../../../../../../src/models/basellm.cpp ../../../../../../../src/models/qwen.cpp + ../../../../../../../src/models/glm.cpp + ../../../../../../../src/models/minicpm.cpp ) include_directories( diff --git a/example/Win32Demo/fastllm-gpu.vcxproj b/example/Win32Demo/fastllm-gpu.vcxproj index 7bba9281..34296657 100644 --- a/example/Win32Demo/fastllm-gpu.vcxproj +++ b/example/Win32Demo/fastllm-gpu.vcxproj @@ -1,223 +1,230 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - {BDA13DDF-572F-4FAD-B7A9-80EA5CAC3F2B} - Win32Proj - fastllm - 8.1 - - - - StaticLibrary - true - v140 - Unicode - - - StaticLibrary - false - v140 - true - Unicode - - - StaticLibrary - true - v140 - Unicode - - - StaticLibrary - false - v140 - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(CUDA_PATH)\lib\Win32;$(LibraryPath) - $(CUDA_PATH)\include;$(IncludePath) - $(SolutionDir)$(Platform)\$(Configuration)\ - $(Platform)\$(Configuration)\ - - - true - $(CUDA_PATH)\include;$(IncludePath) - $(CUDA_PATH)\lib\x64;$(LibraryPath) - - - false - $(CUDA_PATH)\lib\Win32;$(LibraryPath) - $(CUDA_PATH)\include;$(IncludePath) - $(SolutionDir)$(Platform)\$(Configuration)\ - $(Platform)\$(Configuration)\ - - - false - $(CUDA_PATH)\include;$(IncludePath) - $(CUDA_PATH)\lib\x64;$(LibraryPath) - - - - - - Level3 - Disabled - NOMINMAX;USE_CUDA;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) - MultiThreadedDebug - $(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) - /arch:AVX /source-charset:utf-8 %(AdditionalOptions) - - - cudart.lib;cublas.lib;%(AdditionalDependencies) - Windows - - - compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration) - - - - - - - Level3 - Disabled - NOMINMAX;USE_CUDA;WIN64;__AVX__;__AVX2__;_DEBUG;_LIB;%(PreprocessorDefinitions) - $(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) - MultiThreadedDebug - /arch:AVX /source-charset:utf-8 %(AdditionalOptions) - - - cudart.lib;cublas.lib;%(AdditionalDependencies) - Windows - - - compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration) - - - - - Level3 - - - MaxSpeed - true - true - NOMINMAX;USE_CUDA;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) - MultiThreaded - $(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) - /arch:AVX /source-charset:utf-8 %(AdditionalOptions) - - - cudart.lib;cublas.lib;%(AdditionalDependencies) - Windows - true - true - - - compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration) - - - - - Level3 - - - MaxSpeed - true - true - NOMINMAX;USE_CUDA;__AVX__;__AVX2__;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions) - $(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) - MultiThreaded - /arch:AVX /source-charset:utf-8 %(AdditionalOptions) - - - cudart.lib;cublas.lib;%(AdditionalDependencies) - Windows - true - true - - - compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration) - true - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Document - - - Document - - - + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {BDA13DDF-572F-4FAD-B7A9-80EA5CAC3F2B} + Win32Proj + fastllm + 8.1 + + + + StaticLibrary + true + v140 + Unicode + + + StaticLibrary + false + v140 + true + Unicode + + + StaticLibrary + true + v140 + Unicode + + + StaticLibrary + false + v140 + true + Unicode + + + + + + + + + + + + + + + + + + + true + $(CUDA_PATH)\lib\Win32;$(LibraryPath) + $(CUDA_PATH)\include;$(IncludePath) + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + true + $(CUDA_PATH)\include;$(IncludePath) + $(CUDA_PATH)\lib\x64;$(LibraryPath) + + + false + $(CUDA_PATH)\lib\Win32;$(LibraryPath) + $(CUDA_PATH)\include;$(IncludePath) + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + false + $(CUDA_PATH)\include;$(IncludePath) + $(CUDA_PATH)\lib\x64;$(LibraryPath) + + + + + + Level3 + Disabled + NOMINMAX;USE_CUDA;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + MultiThreadedDebug + $(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + /arch:AVX /source-charset:utf-8 %(AdditionalOptions) + + + cudart.lib;cublas.lib;%(AdditionalDependencies) + Windows + + + compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration) + true + + + + + + + Level3 + Disabled + NOMINMAX;USE_CUDA;WIN64;__AVX__;__AVX2__;_DEBUG;_LIB;%(PreprocessorDefinitions) + $(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + MultiThreadedDebug + /arch:AVX /source-charset:utf-8 %(AdditionalOptions) + + + cudart.lib;cublas.lib;%(AdditionalDependencies) + Windows + + + compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration) + 64 + true + + + + + Level3 + + + MaxSpeed + true + true + NOMINMAX;USE_CUDA;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + MultiThreaded + $(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + /arch:AVX /source-charset:utf-8 %(AdditionalOptions) + + + cudart.lib;cublas.lib;%(AdditionalDependencies) + Windows + true + true + + + compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration) + true + + + + + Level3 + + + MaxSpeed + true + true + NOMINMAX;USE_CUDA;__AVX__;__AVX2__;WIN64;NDEBUG;_LIB;%(PreprocessorDefinitions) + $(ProjectDir)..\..\include;$(ProjectDir)..\..\include\devices;$(ProjectDir)..\..\include\devices\cpu;$(ProjectDir)..\..\include\devices\cuda;$(ProjectDir)..\..\include\models;$(ProjectDir)..\..\include\utils;$(CUDA_PATH)\include;%(AdditionalIncludeDirectories) + MultiThreaded + /arch:AVX /source-charset:utf-8 %(AdditionalOptions) + + + cudart.lib;cublas.lib;%(AdditionalDependencies) + Windows + true + true + + + compute_61,sm_61;compute_75,sm_75;compute_86,sm_86;%(CodeGeneration) + true + 64 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Document + + + Document + + + \ No newline at end of file diff --git a/example/Win32Demo/fastllm-gpu.vcxproj.filters b/example/Win32Demo/fastllm-gpu.vcxproj.filters index 9e4980b2..7e72d873 100644 --- a/example/Win32Demo/fastllm-gpu.vcxproj.filters +++ b/example/Win32Demo/fastllm-gpu.vcxproj.filters @@ -69,6 +69,9 @@ 头文件\models + + 头文件\models + 头文件\models @@ -122,6 +125,9 @@ 源文件\models + + 源文件\models + 源文件\models diff --git a/example/Win32Demo/fastllm.vcxproj b/example/Win32Demo/fastllm.vcxproj index 41168e5a..e5e33c3a 100644 --- a/example/Win32Demo/fastllm.vcxproj +++ b/example/Win32Demo/fastllm.vcxproj @@ -181,6 +181,7 @@ + @@ -197,6 +198,7 @@ + diff --git a/example/Win32Demo/fastllm.vcxproj.filters b/example/Win32Demo/fastllm.vcxproj.filters index b924fa20..71216a96 100644 --- a/example/Win32Demo/fastllm.vcxproj.filters +++ b/example/Win32Demo/fastllm.vcxproj.filters @@ -63,6 +63,9 @@ 头文件\models + + 头文件\models + 头文件\models @@ -110,6 +113,9 @@ 源文件\models + + 源文件\models + 源文件\models diff --git a/include/models/minicpm.h b/include/models/minicpm.h index 241e6c72..b1849b6a 100644 --- a/include/models/minicpm.h +++ b/include/models/minicpm.h @@ -15,6 +15,8 @@ namespace fastllm { public: MiniCpmModel(); // 构造函数 + virtual void InitParams(); // 初始化参数信息 + // 推理 virtual int Forward( const Data &inputIds, @@ -65,6 +67,13 @@ namespace fastllm { virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // 根据历史信息和当前输入生成prompt virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // 根据当前回复更新history + + private: + float embed_scale = 1.f; + + float attention_scale = 1.f / std::sqrt(block_cnt); + + float rms_scale = 1.f / 4096.f; }; } diff --git a/src/model.cpp b/src/model.cpp index d454291a..82566dfe 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -108,7 +108,7 @@ namespace fastllm { model = new LlamaModel(); model->model_type = "qwen"; } else if (modelType=="minicpm") { - model = (basellm*)(new MiniCpmModel()); + model = new MiniCpmModel(); } else if (modelType == "qwen") { model = (basellm *) (new QWenModel()); model->weight.tokenizer.type = Tokenizer::TokenizerType::QWEN; diff --git a/src/models/minicpm.cpp b/src/models/minicpm.cpp index 7085b7a6..1ee4aa37 100644 --- a/src/models/minicpm.cpp +++ b/src/models/minicpm.cpp @@ -47,12 +47,6 @@ namespace fastllm { MiniCpmModel::MiniCpmModel() { this->model_type = "minicpm"; - // 默认使用alpaca的提示词和instruction - /* - this->pre_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"; - this->user_role = "### Instruction:\n"; - this->bot_role = "\n\n### Response:"; - */ this->history_sep = ""; this->pre_prompt = ""; this->user_role = ""; @@ -87,6 +81,21 @@ namespace fastllm { weight.embeddingNames.insert("model.embed_tokens.weight"); } + void MiniCpmModel::InitParams() { + basellm::InitParams(); + if (this->weight.dicts.find("scale_emb") != this->weight.dicts.end()) { + this->embed_scale = std::stof(this->weight.dicts["scale_emb"]); + } + if (this->weight.dicts.find("scale_depth") != this->weight.dicts.end()) { + float scale_depth = std::stof(this->weight.dicts["scale_depth"]); + this->attention_scale = scale_depth / std::sqrt(block_cnt); + } + if (this->weight.dicts.find("dim_model_base") != this->weight.dicts.end()) { + int32_t dim_model_base = std::stoi(this->weight.dicts["dim_model_base"]); + this->rms_scale = 1.f / (this->embed_dim / dim_model_base); + } + } + int MiniCpmModel::Forward(const fastllm::Data &inputIds, const fastllm::Data &attentionMask, const fastllm::Data &positionIds, std::vector> &pastKeyValues, const GenerationConfig &generationConfig, const LastTokensManager &lastTokens, @@ -105,15 +114,8 @@ namespace fastllm { Data attenLastOutput; Data w1, w2, w3; - float scale_emb = std::stof(this->weight.dicts["scale_emb"]); - float scale_depth = std::stof(this->weight.dicts["scale_depth"]); - int32_t num_hidden_layers = std::stoi(this->weight.dicts["num_hidden_layers"]); - int32_t dim_model = std::stoi(this->weight.dicts["hidden_size"]); - int32_t dim_model_base = std::stoi(this->weight.dicts["dim_model_base"]); - float rms_scale = 1.f / (dim_model / dim_model_base); - Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates); - Mul(hiddenStates, scale_emb, hiddenStates); + Mul(hiddenStates, embed_scale, hiddenStates); for (int i = 0; i < block_cnt; i++) { ApplyDeviceMap(this->deviceMap, i + 1, block_cnt); RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"], @@ -213,10 +215,8 @@ namespace fastllm { attenOutput.Reshape({bsz, seqlen, -1}); Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); - - Mul(attenLastOutput, scale_depth / std::sqrt(num_hidden_layers), attenLastOutput); + Mul(attenLastOutput, this->attention_scale, attenLastOutput); AddTo(hiddenStates, attenLastOutput); - // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput); Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1); @@ -224,7 +224,7 @@ namespace fastllm { Silu(w1, w1); MulTo(w1, w3); Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2); - Mul(w2, scale_depth / std::sqrt(num_hidden_layers), w2); + Mul(w2, this->attention_scale, w2); AddTo(hiddenStates, w2); } Data logits, topk; @@ -241,8 +241,8 @@ namespace fastllm { { auto &hiddenStates = *lastHiddenStates; RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-5, hiddenStates); - Mul(hiddenStates, rms_scale, hiddenStates); - Linear(hiddenStates, weight["model.embed_tokens.weight"], Data(), logits); + Mul(hiddenStates, this->rms_scale, hiddenStates); + Linear(hiddenStates, weight["lm_head.weight"], Data(), logits); if (generationConfig.output_logits && retLogits != nullptr) { int size = logits.dims.back(); logits.ToDevice(DataDevice::CPU); @@ -278,16 +278,9 @@ namespace fastllm { Data attenWeights, attenOutput; Data attenLastOutput; Data w1, w2, w3; - - float scale_emb = std::stof(this->weight.dicts["scale_emb"]); - float scale_depth = std::stof(this->weight.dicts["scale_depth"]); - int32_t num_hidden_layers = std::stoi(this->weight.dicts["num_hidden_layers"]); - int32_t dim_model = std::stoi(this->weight.dicts["hidden_size"]); - int32_t dim_model_base = std::stoi(this->weight.dicts["dim_model_base"]); - float rms_scale = 1.f / (dim_model / dim_model_base); Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates); - Mul(hiddenStates, scale_emb, hiddenStates); + Mul(hiddenStates, embed_scale, hiddenStates); int seqlen = hiddenStates.dims[1]; for (int i = 0; i < block_cnt; i++) { ApplyDeviceMap(this->deviceMap, i + 1, block_cnt); @@ -391,7 +384,7 @@ namespace fastllm { PermuteSelf(attenOutput, {1, 0, 2}); Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); - Mul(attenLastOutput, scale_depth / std::sqrt(num_hidden_layers), attenLastOutput); + Mul(attenLastOutput, this->attention_scale, attenLastOutput); AddTo(hiddenStates, attenLastOutput); // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput); @@ -400,7 +393,7 @@ namespace fastllm { Silu(w1, w1); MulTo(w1, w3); Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2); - Mul(w2, scale_depth / std::sqrt(num_hidden_layers), w2); + Mul(w2, this->attention_scale, w2); AddTo(hiddenStates, w2); } @@ -418,7 +411,7 @@ namespace fastllm { { auto &hiddenStates = *lastHiddenStates; RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-5, hiddenStates); - Mul(hiddenStates, rms_scale, hiddenStates); + Mul(hiddenStates, this->rms_scale, hiddenStates); Linear(hiddenStates, weight["lm_head.weight"], Data(), logits); if (generationConfig.IsSimpleGreedy()) { TopK(logits, topk, 1); @@ -459,15 +452,8 @@ namespace fastllm { Data attenLastOutput; Data w1, w2, w3; - float scale_emb = std::stof(this->weight.dicts["scale_emb"]); - float scale_depth = std::stof(this->weight.dicts["scale_depth"]); - int32_t num_hidden_layers = std::stoi(this->weight.dicts["num_hidden_layers"]); - int32_t dim_model = std::stoi(this->weight.dicts["hidden_size"]); - int32_t dim_model_base = std::stoi(this->weight.dicts["dim_model_base"]); - float rms_scale = 1.f / (dim_model / dim_model_base); - Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates); - Mul(hiddenStates, scale_emb, hiddenStates); + Mul(hiddenStates, embed_scale, hiddenStates); int seqlen = hiddenStates.dims[1]; for (int i = 0; i < block_cnt; i++) { ApplyDeviceMap(this->deviceMap, i + 1, block_cnt); @@ -594,9 +580,8 @@ namespace fastllm { } Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput); - Mul(attenLastOutput, scale_depth / std::sqrt(num_hidden_layers), attenLastOutput); + Mul(attenLastOutput, this->attention_scale, attenLastOutput); AddTo(hiddenStates, attenLastOutput); - // 2. mlp RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput); Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1); @@ -604,13 +589,13 @@ namespace fastllm { Silu(w1, w1); MulTo(w1, w3); Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2); - Mul(w2, scale_depth / std::sqrt(num_hidden_layers), w2); + Mul(w2, this->attention_scale, w2); AddTo(hiddenStates, w2); } Data logits, curLogit; RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-5, hiddenStates); - Mul(hiddenStates, rms_scale, hiddenStates); + Mul(hiddenStates, this->rms_scale, hiddenStates); Linear(hiddenStates, weight["lm_head.weight"], Data(), logits); std::vector lastRet; int total = 0;