
Commit

Merge pull request #438 from TylunasLi/minicpm
Fix Win32Demo CPU build errors
ztxz16 committed Mar 20, 2024
2 parents 808efe8 + 7373729 commit 1a6c7a2
Showing 6 changed files with 75 additions and 613 deletions.
19 changes: 12 additions & 7 deletions README.md
@@ -370,33 +370,38 @@ python3 tools/moss_export.py moss-int4.flm int4 # export the int4 model
For converting some models, you can [refer to the examples here](docs/llama_cookbook.md).

#### Qwen model export
* **Qwen**
```sh
# Install the Qwen environment first
# If you use your own finetuned model, modify the code in qwen2flm.py that creates the tokenizer and model
# Export the model at the precision you need
cd build
python3 tools/qwen2flm.py qwen-7b-fp16.flm float16 # export the float16 model
python3 tools/qwen2flm.py qwen-7b-int8.flm int8 # export the int8 model
python3 tools/qwen2flm.py qwen-7b-int4.flm int4 # export the int4 model
```

#### MiniCPM model export
```sh
cd build
python tools/minicpm2flm.py # export the float16 model
./main -p minicpm-2b-float16.flm # run the model
```

* **Qwen1.5**

```sh
# Install the Qwen2 environment first (transformers >= 4.37.0)
# Export the model at the precision you need
cd build
python3 tools/llamalike2flm.py qwen1.5-7b-fp16.flm float16 "qwen/Qwen1.5-4B-Chat" # export the Qwen1.5-4B-Chat float16 model
python3 tools/llamalike2flm.py qwen1.5-7b-int8.flm int8 "qwen/Qwen1.5-7B-Chat" # export the Qwen1.5-7B-Chat int8 model
python3 tools/llamalike2flm.py qwen1.5-7b-int4.flm int4 "qwen/Qwen1.5-14B-Chat" # export the Qwen1.5-14B-Chat int4 model
# The last argument can be replaced with a local model path
```

#### MiniCPM model export
```sh
# Install the MiniCPM environment first (transformers >= 4.36.0)
# By default the script exports the MiniCPM-2B-dpo-fp16 model
cd build
python tools/minicpm2flm.py minicpm-2b-float16.flm # export the dpo-float16 model
./main -p minicpm-2b-float16.flm # run the model
```

## Development plan

In other words, the proverbial pie-in-the-sky section; if there is a feature you need, feel free to raise it in the discussions.
3 changes: 1 addition & 2 deletions example/Win32Demo/fastllm.vcxproj
@@ -163,7 +163,6 @@
<AdditionalOptions>/arch:AVX /source-charset:utf-8 %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<AdditionalDependencies>cudart.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
<SubSystem>Windows</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
@@ -181,7 +180,7 @@
<ClInclude Include="..\..\include\models\factoryllm.h" />
<ClInclude Include="..\..\include\models\glm.h" />
<ClInclude Include="..\..\include\models\llama.h" />
<ClCompile Include="..\..\include\models\minicpm.h" />
<ClInclude Include="..\..\include\models\minicpm.h" />
<ClInclude Include="..\..\include\models\moss.h" />
<ClInclude Include="..\..\include\models\qwen.h" />
<ClInclude Include="..\..\include\utils\armMath.h" />
21 changes: 13 additions & 8 deletions include/models/basellm.h
@@ -1,4 +1,7 @@
#pragma once

#ifndef FASTLLM_BASELLM_H
#define FASTLLM_BASELLM_H

#include "fastllm.h"

#include <thread>
@@ -50,9 +53,9 @@ namespace fastllm {
this->weight.ReleaseWeight();
};

virtual void LoadFromFile(const std::string &fileName); // load from file
virtual void LoadFromFile(const std::string &fileName); // load from file

virtual void InitParams(); // initialize parameter info
virtual void InitParams(); // initialize parameter info

// inference
virtual int Forward(
@@ -85,12 +88,12 @@ namespace fastllm {
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);

// build the LLM inference inputs from the input tokens
// build the LLM inference inputs from the input tokens
virtual void FillLLMInputs(std::vector <std::vector <float> > &inputTokens,
const std::map <std::string, int> &params,
Data &inputIds, Data &attentionMask, Data &positionIds);

// build the LLM inference inputs from the input tokens
// build the LLM inference inputs from the input tokens
virtual void FillLLMInputsBatch(std::vector <std::vector <float> > &inputTokens,
const std::vector <std::map <std::string, int> > &params,
Data &inputIds, Data &attentionMask, Data &positionIds);
@@ -102,16 +105,16 @@
virtual void ResponseBatch(const std::vector<std::string> &inputs,
std::vector<std::string> &outputs,
RuntimeResultBatch retCb = nullptr,
const GenerationConfig &generationConfig = GenerationConfig()); // respond to the given inputs in batch
const GenerationConfig &generationConfig = GenerationConfig()); // respond to the given inputs in batch

virtual int LaunchResponseTokens(const std::vector <int> &inputTokens,
const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task and return the allocated handleId

virtual int FetchResponseTokens(int handleId); // fetch the output for the given handle; -1 means the output has finished
virtual int FetchResponseTokens(int handleId); // fetch the output for the given handle; -1 means the output has finished

virtual int FetchResponseLogits(int handleId, std::vector <float> &logits); // fetch the output logits for the given handle

virtual void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model
virtual void SaveLowBitModel(const std::string &fileName, int bit); // save as a quantized model

virtual void SaveModel(const std::string &fileName); // export directly

@@ -158,3 +161,5 @@ namespace fastllm {
int tokensLimit = -1;
};
}

#endif //FASTLLM_BASELLM_H
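For context, this hunk wraps basellm.h in a traditional include guard in addition to the existing `#pragma once`, so the header stays safe when it is reached through more than one path (minicpm.h now includes it directly and, presumably, again via llama.h) or by a toolchain that does not honor `#pragma once`. A minimal sketch of the resulting pattern, with a placeholder body rather than the real fastllm declarations:

```cpp
// Sketch of the guard pattern added to include/models/basellm.h.
// The namespace contents below are placeholders, not the actual interface.
#pragma once

#ifndef FASTLLM_BASELLM_H
#define FASTLLM_BASELLM_H

namespace fastllm {
    class basellm {
    public:
        virtual ~basellm() = default;
        // ... loading, inference, and response-handling interface ...
    };
}

#endif // FASTLLM_BASELLM_H
```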
23 changes: 2 additions & 21 deletions include/models/minicpm.h
@@ -6,12 +6,13 @@
#define FASTLLM_MINICPM_H

#include "basellm.h"
#include "llama.h"
#include "cmath"

#include <iostream>

namespace fastllm {
class MiniCpmModel: public basellm {
class MiniCpmModel: public LlamaModel {
public:
MiniCpmModel(); // constructor

@@ -48,26 +49,6 @@ namespace fastllm {
const LastTokensManager &lastTokens = LastTokensManager(),
std::vector <std::vector <float>*> *logits = nullptr);

virtual std::string Response(const std::string& input,
RuntimeResult retCb,
const GenerationConfig &generationConfig = GenerationConfig()); // respond to the given input

virtual void ResponseBatch(const std::vector <std::string> &inputs,
std::vector <std::string> &outputs,
RuntimeResultBatch retCb,
const GenerationConfig &generationConfig = GenerationConfig());

virtual int LaunchResponseTokens(const std::vector <int> &inputTokens,
const GenerationConfig &generationConfig = GenerationConfig()); // launch a response task and return the allocated handleId

virtual int FetchResponseTokens(int handelId); // fetch the output for the given handle; -1 means the output has finished

virtual void WarmUp(); // warm up

virtual std::string MakeInput(const std::string &history, int round, const std::string &input); // build the prompt from the history and the current input

virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output); // update the history with the current response

private:
float embed_scale = 1.f;

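To make the intent of this header change clearer: MiniCpmModel now derives from LlamaModel instead of basellm, so the response, batching, warm-up, and prompt-building methods it previously redeclared are simply inherited. A simplified, self-contained sketch of the resulting shape; the `sketch` namespace and the trimmed-down members are illustrative only, the real classes live in include/models/llama.h and include/models/minicpm.h:

```cpp
// Illustrative sketch only: the Llama-style base provides the shared chat
// interface, and the MiniCPM subclass keeps just what actually differs.
#include <string>

namespace sketch {
    class LlamaModel {
    public:
        virtual ~LlamaModel() = default;
        // Shared prompt construction, inherited unchanged by MiniCPM.
        virtual std::string MakeInput(const std::string &history, int round,
                                      const std::string &input) {
            return history + "<round " + std::to_string(round) + "> " + input;
        }
        virtual void WarmUp() {}  // shared warm-up, also inherited
    };

    class MiniCpmModel : public LlamaModel {
    public:
        MiniCpmModel() = default;  // constructor stays; override only where behavior differs
    private:
        float embed_scale = 1.f;   // MiniCPM-specific embedding scale (kept from the header)
    };
}
```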
