
Commit a0f2212

Merge pull request #428 from TylunasLi/minicpm
Fix MiniCPM model compilation in the Win32Demo project and GPU execution issues
ztxz16 committed Mar 4, 2024
2 parents bf9c340 + cc9552f commit a0f2212
Showing 9 changed files with 286 additions and 269 deletions.
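
In short, this commit: (1) registers src/models/minicpm.cpp and include/models/minicpm.h in the CMake and Visual Studio project files that were missing them; (2) moves the parsing of MiniCPM's scaling hyperparameters (scale_emb, scale_depth, dim_model_base) out of every Forward call into a new InitParams() override with fallback defaults; and (3) fixes the final projection to use lm_head.weight instead of model.embed_tokens.weight.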
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -33,8 +33,8 @@ endif()

message(STATUS "CMAKE_CXX_FLAGS" ${CMAKE_CXX_FLAGS})
set(FASTLLM_CXX_SOURCES src/fastllm.cpp src/device.cpp src/model.cpp src/executor.cpp
- src/devices/cpu/cpudevice.cpp src/devices/cpu/cpudevicebatch.cpp src/models/minicpm.cpp
- src/models/chatglm.cpp src/models/moss.cpp src/models/llama.cpp src/models/qwen.cpp src/models/basellm.cpp src/models/glm.cpp)
+ src/devices/cpu/cpudevice.cpp src/devices/cpu/cpudevicebatch.cpp
+ src/models/chatglm.cpp src/models/moss.cpp src/models/llama.cpp src/models/qwen.cpp src/models/basellm.cpp src/models/glm.cpp src/models/minicpm.cpp)

include_directories(include)
include_directories(include/utils)
4 changes: 3 additions & 1 deletion example/Android/LLMAssistant/app/src/main/cpp/CMakeLists.txt
@@ -32,11 +32,13 @@ set(PROJECT_SOURCE
../../../../../../../src/executor.cpp
../../../../../../../src/devices/cpu/cpudevice.cpp
../../../../../../../src/devices/cpu/cpudevicebatch.cpp
+ ../../../../../../../src/models/basellm.cpp
../../../../../../../src/models/chatglm.cpp
../../../../../../../src/models/moss.cpp
../../../../../../../src/models/llama.cpp
- ../../../../../../../src/models/basellm.cpp
../../../../../../../src/models/qwen.cpp
+ ../../../../../../../src/models/glm.cpp
+ ../../../../../../../src/models/minicpm.cpp
)

include_directories(
451 changes: 229 additions & 222 deletions example/Win32Demo/fastllm-gpu.vcxproj

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions example/Win32Demo/fastllm-gpu.vcxproj.filters
@@ -69,6 +69,9 @@
<ClInclude Include="..\..\include\models\llama.h">
<Filter>头文件\models</Filter>
</ClInclude>
+ <ClInclude Include="..\..\include\models\minicpm.h">
+     <Filter>头文件\models</Filter>
+ </ClInclude>
<ClInclude Include="..\..\include\models\moss.h">
<Filter>头文件\models</Filter>
</ClInclude>
@@ -122,6 +125,9 @@
<ClCompile Include="..\..\src\models\llama.cpp">
<Filter>源文件\models</Filter>
</ClCompile>
+ <ClCompile Include="..\..\src\models\minicpm.cpp">
+     <Filter>源文件\models</Filter>
+ </ClCompile>
<ClCompile Include="..\..\src\models\moss.cpp">
<Filter>源文件\models</Filter>
</ClCompile>
2 changes: 2 additions & 0 deletions example/Win32Demo/fastllm.vcxproj
@@ -181,6 +181,7 @@
<ClInclude Include="..\..\include\models\factoryllm.h" />
<ClInclude Include="..\..\include\models\glm.h" />
<ClInclude Include="..\..\include\models\llama.h" />
+ <ClInclude Include="..\..\include\models\minicpm.h" />
<ClInclude Include="..\..\include\models\moss.h" />
<ClInclude Include="..\..\include\models\qwen.h" />
<ClInclude Include="..\..\include\utils\armMath.h" />
@@ -197,6 +198,7 @@
<ClCompile Include="..\..\src\models\chatglm.cpp" />
<ClCompile Include="..\..\src\models\glm.cpp" />
<ClCompile Include="..\..\src\models\llama.cpp" />
+ <ClCompile Include="..\..\src\models\minicpm.cpp" />
<ClCompile Include="..\..\src\models\moss.cpp" />
<ClCompile Include="..\..\src\models\qwen.cpp" />
<ClCompile Include="..\..\src\pybinding.cpp" />
6 changes: 6 additions & 0 deletions example/Win32Demo/fastllm.vcxproj.filters
@@ -63,6 +63,9 @@
<ClInclude Include="..\..\include\models\llama.h">
<Filter>头文件\models</Filter>
</ClInclude>
+ <ClInclude Include="..\..\include\models\minicpm.h">
+     <Filter>头文件\models</Filter>
+ </ClInclude>
<ClInclude Include="..\..\include\models\moss.h">
<Filter>头文件\models</Filter>
</ClInclude>
@@ -110,6 +113,9 @@
<ClCompile Include="..\..\src\models\llama.cpp">
<Filter>源文件\models</Filter>
</ClCompile>
+ <ClCompile Include="..\..\src\models\minicpm.cpp">
+     <Filter>源文件\models</Filter>
+ </ClCompile>
<ClCompile Include="..\..\src\models\moss.cpp">
<Filter>源文件\models</Filter>
</ClCompile>
9 changes: 9 additions & 0 deletions include/models/minicpm.h
@@ -15,6 +15,8 @@ namespace fastllm {
public:
MiniCpmModel(); // constructor

+ virtual void InitParams(); // initialize parameter info

// inference
virtual int Forward(
const Data &inputIds,
@@ -65,6 +67,13 @@
virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input

virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current reply

+ private:
+     float embed_scale = 1.f;
+
+     float attention_scale = 1.f / std::sqrt(block_cnt);
+
+     float rms_scale = 1.f / 4096.f;
};
}
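
The in-class initializers are fallback values: attention_scale is computed from basellm's default block_cnt at construction time, and rms_scale assumes a 4096-wide hidden state. InitParams() (shown in src/models/minicpm.cpp below) overwrites each of them whenever the converted model config supplies scale_emb, scale_depth, or dim_model_base. The std::sqrt call assumes <cmath> is already reachable through the headers this file includes.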

2 changes: 1 addition & 1 deletion src/model.cpp
@@ -108,7 +108,7 @@ namespace fastllm {
model = new LlamaModel();
model->model_type = "qwen";
} else if (modelType=="minicpm") {
- model = (basellm*)(new MiniCpmModel());
+ model = new MiniCpmModel();
} else if (modelType == "qwen") {
model = (basellm *) (new QWenModel());
model->weight.tokenizer.type = Tokenizer::TokenizerType::QWEN;
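
The (basellm*) cast removed here was redundant: MiniCpmModel derives from basellm, so the derived-to-base pointer conversion is implicit in C++. (The explicit cast in the qwen branch is equally unnecessary, but is left untouched by this commit.)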
71 changes: 28 additions & 43 deletions src/models/minicpm.cpp
@@ -47,12 +47,6 @@ namespace fastllm {
MiniCpmModel::MiniCpmModel() {
this->model_type = "minicpm";

- // By default, use alpaca's prompt template and instruction format
- /*
- this->pre_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n";
- this->user_role = "### Instruction:\n";
- this->bot_role = "\n\n### Response:";
- */
this->history_sep = "";
this->pre_prompt = "";
this->user_role = "";
@@ -87,6 +81,21 @@
weight.embeddingNames.insert("model.embed_tokens.weight");
}

+ void MiniCpmModel::InitParams() {
+     basellm::InitParams();
+     if (this->weight.dicts.find("scale_emb") != this->weight.dicts.end()) {
+         this->embed_scale = std::stof(this->weight.dicts["scale_emb"]);
+     }
+     if (this->weight.dicts.find("scale_depth") != this->weight.dicts.end()) {
+         float scale_depth = std::stof(this->weight.dicts["scale_depth"]);
+         this->attention_scale = scale_depth / std::sqrt(block_cnt);
+     }
+     if (this->weight.dicts.find("dim_model_base") != this->weight.dicts.end()) {
+         int32_t dim_model_base = std::stoi(this->weight.dicts["dim_model_base"]);
+         this->rms_scale = 1.f / (this->embed_dim / dim_model_base);
+     }
+ }
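
For reference, here is a minimal standalone sketch of the three scaling rules InitParams encodes, evaluated with config values typical of MiniCPM-2B checkpoints; the concrete numbers (scale_emb = 12, scale_depth = 1.4, 40 layers, hidden size 2304, dim_model_base = 256) are illustrative assumptions, not values taken from this diff:

#include <cmath>
#include <cstdio>

int main() {
    float scale_emb         = 12.0f;  // multiplies the token embeddings
    float scale_depth       = 1.4f;   // damps each residual branch
    int   num_hidden_layers = 40;     // block_cnt in fastllm
    int   hidden_size       = 2304;   // embed_dim in fastllm
    int   dim_model_base    = 256;

    float embed_scale     = scale_emb;
    float attention_scale = scale_depth / std::sqrt((float) num_hidden_layers);
    float rms_scale       = 1.0f / (hidden_size / dim_model_base); // integer ratio, as in InitParams

    // Prints: embed=12 residual=0.221359 logits=0.111111
    std::printf("embed=%g residual=%g logits=%g\n", embed_scale, attention_scale, rms_scale);
    return 0;
}

Computing these once at load time avoids re-reading and re-parsing the weight dictionary on every forward pass, which the three blocks below previously did.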

int MiniCpmModel::Forward(const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds, std::vector<std::pair<Data, Data>> &pastKeyValues,
const GenerationConfig &generationConfig, const LastTokensManager &lastTokens,
@@ -105,15 +114,8 @@
Data attenLastOutput;
Data w1, w2, w3;

- float scale_emb = std::stof(this->weight.dicts["scale_emb"]);
- float scale_depth = std::stof(this->weight.dicts["scale_depth"]);
- int32_t num_hidden_layers = std::stoi(this->weight.dicts["num_hidden_layers"]);
- int32_t dim_model = std::stoi(this->weight.dicts["hidden_size"]);
- int32_t dim_model_base = std::stoi(this->weight.dicts["dim_model_base"]);
- float rms_scale = 1.f / (dim_model / dim_model_base);

Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates);
- Mul(hiddenStates, scale_emb, hiddenStates);
+ Mul(hiddenStates, embed_scale, hiddenStates);
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".input_layernorm.weight"],
@@ -213,18 +215,16 @@
attenOutput.Reshape({bsz, seqlen, -1});

Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);

- Mul(attenLastOutput, scale_depth / std::sqrt(num_hidden_layers), attenLastOutput);
+ Mul(attenLastOutput, this->attention_scale, attenLastOutput);
AddTo(hiddenStates, attenLastOutput);

// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.up_proj.weight"], Data(), w3);
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
- Mul(w2, scale_depth / std::sqrt(num_hidden_layers), w2);
+ Mul(w2, this->attention_scale, w2);
AddTo(hiddenStates, w2);
}
Data logits, topk;
@@ -241,8 +241,8 @@
{
auto &hiddenStates = *lastHiddenStates;
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-5, hiddenStates);
- Mul(hiddenStates, rms_scale, hiddenStates);
- Linear(hiddenStates, weight["model.embed_tokens.weight"], Data(), logits);
+ Mul(hiddenStates, this->rms_scale, hiddenStates);
+ Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
if (generationConfig.output_logits && retLogits != nullptr) {
int size = logits.dims.back();
logits.ToDevice(DataDevice::CPU);
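
The hunk above is the substantive inference fix: the final projection now multiplies by weight["lm_head.weight"] instead of reusing model.embed_tokens.weight, matching the two batch paths below. Presumably the fastllm converter materializes a separate lm_head.weight for MiniCPM checkpoints; that side of the change is not visible in this diff.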
@@ -278,16 +278,9 @@
Data attenWeights, attenOutput;
Data attenLastOutput;
Data w1, w2, w3;

- float scale_emb = std::stof(this->weight.dicts["scale_emb"]);
- float scale_depth = std::stof(this->weight.dicts["scale_depth"]);
- int32_t num_hidden_layers = std::stoi(this->weight.dicts["num_hidden_layers"]);
- int32_t dim_model = std::stoi(this->weight.dicts["hidden_size"]);
- int32_t dim_model_base = std::stoi(this->weight.dicts["dim_model_base"]);
- float rms_scale = 1.f / (dim_model / dim_model_base);

Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates);
- Mul(hiddenStates, scale_emb, hiddenStates);
+ Mul(hiddenStates, embed_scale, hiddenStates);
int seqlen = hiddenStates.dims[1];
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
@@ -391,7 +384,7 @@
PermuteSelf(attenOutput, {1, 0, 2});

Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
- Mul(attenLastOutput, scale_depth / std::sqrt(num_hidden_layers), attenLastOutput);
+ Mul(attenLastOutput, this->attention_scale, attenLastOutput);
AddTo(hiddenStates, attenLastOutput);
// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput);
@@ -400,7 +393,7 @@
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
- Mul(w2, scale_depth / std::sqrt(num_hidden_layers), w2);
+ Mul(w2, this->attention_scale, w2);
AddTo(hiddenStates, w2);
}

@@ -418,7 +411,7 @@
{
auto &hiddenStates = *lastHiddenStates;
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-5, hiddenStates);
- Mul(hiddenStates, rms_scale, hiddenStates);
+ Mul(hiddenStates, this->rms_scale, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
if (generationConfig.IsSimpleGreedy()) {
TopK(logits, topk, 1);
@@ -459,15 +452,8 @@
Data attenLastOutput;
Data w1, w2, w3;

- float scale_emb = std::stof(this->weight.dicts["scale_emb"]);
- float scale_depth = std::stof(this->weight.dicts["scale_depth"]);
- int32_t num_hidden_layers = std::stoi(this->weight.dicts["num_hidden_layers"]);
- int32_t dim_model = std::stoi(this->weight.dicts["hidden_size"]);
- int32_t dim_model_base = std::stoi(this->weight.dicts["dim_model_base"]);
- float rms_scale = 1.f / (dim_model / dim_model_base);

Embedding(inputIds, this->weight["model.embed_tokens.weight"], hiddenStates);
- Mul(hiddenStates, scale_emb, hiddenStates);
+ Mul(hiddenStates, embed_scale, hiddenStates);
int seqlen = hiddenStates.dims[1];
for (int i = 0; i < block_cnt; i++) {
ApplyDeviceMap(this->deviceMap, i + 1, block_cnt);
@@ -594,23 +580,22 @@
}

Linear(attenOutput, weight[oWeightName], Data(), attenLastOutput);
- Mul(attenLastOutput, scale_depth / std::sqrt(num_hidden_layers), attenLastOutput);
+ Mul(attenLastOutput, this->attention_scale, attenLastOutput);
AddTo(hiddenStates, attenLastOutput);

// 2. mlp
RMSNorm(hiddenStates, this->weight["model.layers." + std::to_string(i) + ".post_attention_layernorm.weight"], 1e-5, attenInput);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.gate_proj.weight"], Data(), w1);
Linear(attenInput, weight["model.layers." + std::to_string(i) + ".mlp.up_proj.weight"], Data(), w3);
Silu(w1, w1);
MulTo(w1, w3);
Linear(w1, weight["model.layers." + std::to_string(i) + ".mlp.down_proj.weight"], Data(), w2);
- Mul(w2, scale_depth / std::sqrt(num_hidden_layers), w2);
+ Mul(w2, this->attention_scale, w2);
AddTo(hiddenStates, w2);
}

Data logits, curLogit;
RMSNorm(hiddenStates, weight["model.norm.weight"], 1e-5, hiddenStates);
- Mul(hiddenStates, rms_scale, hiddenStates);
+ Mul(hiddenStates, this->rms_scale, hiddenStates);
Linear(hiddenStates, weight["lm_head.weight"], Data(), logits);
std::vector <int> lastRet;
int total = 0;
