
Commit

Merge pull request #384 from TylunasLi/pyfastllm
Fix hangs on some systems; support using the Python module on Windows
ztxz16 committed Dec 26, 2023
2 parents 46a9918 + 095a191 commit 82a3035
Showing 6 changed files with 142 additions and 19 deletions.
10 changes: 6 additions & 4 deletions README.md
@@ -23,7 +23,7 @@ fastllm is a pure C++, high-performance large model inference library with no third-party dependencies
- 🚀 Front-end/back-end separation, making it easy to support new compute devices
- 🚀 Currently supports ChatGLM models, various LLAMA models (ALPACA, VICUNA, etc.), BAICHUAN models, and MOSS models

- ## Two lines of code for acceleration (in testing; currently only Ubuntu is supported)
+ ## Two lines of code for acceleration (in testing; currently only the ChatGLM series is supported)

Install the fastllm_pytools package with the following command:

@@ -148,7 +148,7 @@ cmake .. -DUSE_CUDA=ON # if not building with GPU support, use cmake .. -DUSE_
make -j
```

- After building, the simple Python toolkit can be installed with the following command (currently Linux only)
+ After building, the simple Python toolkit can be installed with the following command

``` sh
cd tools # now in the fastllm/build/tools directory
@@ -163,7 +163,7 @@ python setup.py install
``` sh
# now in the fastllm/build directory

- # Command-line chat program with a typewriter effect
+ # Command-line chat program with a typewriter effect (Linux only)
./main -p model.flm

# Simple web UI with streaming output + dynamic batching; supports concurrent access from multiple clients
@@ -177,6 +177,8 @@ streamlit run tools/web_demo.py model.flm

```

+ If you run into build problems, especially when building on Windows, see the [FAQ](docs/faq.md)

### 简易python调用

After building, if the simple Python toolkit is installed, you can call some basic APIs from Python (if it is not installed, you can also directly import the generated tools/fastllm_pytools)
@@ -194,7 +196,7 @@ for response in model.stream_response("你好"):
print(response, flush = True, end = "")
```

- You can also set options such as the number of CPU threads; see [fastllm_pytools](docs/fastllm_pytools) for the detailed API reference
+ You can also set options such as the number of CPU threads; see [fastllm_pytools](docs/fastllm_pytools.md) for the detailed API reference
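
A minimal sketch of those settings (assumptions: the package is imported as `from fastllm_pytools import llm`, and only calls that appear in this repo's tools/scripts/cli_demo.py are used; the model path is illustrative):

``` python
# Minimal sketch, not part of the repo.
from fastllm_pytools import llm

llm.set_cpu_threads(8)          # number of CPU threads used for inference
llm.set_cpu_low_mem(True)       # optional low-memory mode

model = llm.model("model.flm")  # illustrative model path
for response in model.stream_response("你好"):
    print(response, flush = True, end = "")
model.release_memory()          # free GPU memory before exit (see docs/faq.md)
```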

This package does not include the low-level API; if you need deeper functionality, see the [Python binding API](#Python绑定API)

102 changes: 102 additions & 0 deletions docs/faq.md
@@ -0,0 +1,102 @@
# Frequently Asked Questions

## CMAKE

### CMAKE_CUDA_ARCHITECTURES must be non-empty if set.

**Symptom:**

> CMake Error at cmake/Modules/CMakeDetermineCUDACompiler.cmake:277 (message):
> CMAKE_CUDA_ARCHITECTURES must be non-empty if set.
> Call Stack (most recent call first):
> CMakeLists.txt:39 (enable_language)

**Solution:**

Some versions of CMake have this problem; `CMAKE_CUDA_ARCHITECTURES` has to be specified manually. Run:

```shell
cmake .. -DUSE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native
```

### Unsupported gpu architecture 'compute_native'

**Symptom:**

> nvcc fatal : Unsupported gpu architecture 'compute_native'

**Solution:**

Edit CMakeLists.txt manually and specify the GPU's [Compute Capability](https://developer.nvidia.com/cuda-gpus) according to your GPU model. For example:

``` diff
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,7 +52,7 @@
#message(${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
set(FASTLLM_CUDA_SOURCES src/devices/cuda/cudadevice.cpp src/devices/cuda/cudadevicebatch.cpp src/devices/cuda/fastllm-cuda.cu)
set(FASTLLM_LINKED_LIBS ${FASTLLM_LINKED_LIBS} cublas)
- set(CMAKE_CUDA_ARCHITECTURES "native")
+ set(CMAKE_CUDA_ARCHITECTURES 61 75 86 89)
endif()

if (PY_API)
```


## Windows

### fastllm.h error

**Symptom:**

> include\fastllm.h(50): error : identifier "top_k" is undefined
> include\fastllm.h(172): error : expected a "}"
> include\fastllm.h(234): error : identifier "DataDevice" is undefined
> ....

**Solution:** Building with CMake usually avoids this problem; see [example\README.md](/example/README.md). After checking out the code, **edit include/fastllm.h**: in Visual Studio, choose "File" -> "Advanced Save Options" and pick the encoding "Unicode (UTF-8 **with signature**) - Codepage 65001", or convert the file to "UTF-8 with BOM" in another text editor. (Since gcc on Linux does not recognize the BOM header while MSVC relies on the BOM to detect the file encoding, this change has to be done manually.)
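
If you would rather script the conversion, here is a minimal sketch (not part of the repo; any tool that writes UTF-8 with BOM works equally well):

```python
# Re-encode include/fastllm.h as UTF-8 with BOM so that MSVC detects the encoding.
path = "include/fastllm.h"
with open(path, "rb") as f:
    data = f.read()
if not data.startswith(b"\xef\xbb\xbf"):   # add the BOM only if it is missing
    with open(path, "wb") as f:
        f.write(b"\xef\xbb\xbf" + data)
```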

### main.exe does not recognize Chinese input

**Cause:** cmd on Windows does not support UTF-8 encoding.

**Solution:** Build the [Win32Demo](/example/README.md#win32demo-windows平台) or use the [WebUI](/example/README.md#web-ui)

### FileNotFoundError on import

**Symptom:**

> File "...Python\lib\ctypes\_\_init\_\_.py", line 374, in \_\_init\_\_
> self._handle = _dlopen(self._name, mode)
> FileNotFoundError: Could not find module 'tools\fastllm_pytools\fastllm_tools.dll' (or one of its dependencies). Try using the full path with constructor syntax.

**Solution:** Some Python versions have this problem with non-CPU (GPU) builds.

For GPU builds, copy the cudart and cublas DLLs that match your CUDA version into the same directory as fastllm_tools, for example:

* CUDA 9.2
  * %CUDA_PATH%\bin\cublas64_92.dll
  * %CUDA_PATH%\bin\cudart64_92.dll
* CUDA 11.x
  * %CUDA_PATH%\bin\cudart64_110.dll
  * %CUDA_PATH%\bin\cublas64_11.dll
  * %CUDA_PATH%\bin\cublasLt64_11.dll
* CUDA 12.x
  * %CUDA_PATH%\bin\cudart64_12.dll
  * %CUDA_PATH%\bin\cublas64_12.dll
  * %CUDA_PATH%\bin\cublasLt64_12.dll
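
For instance, for CUDA 11.x, a minimal copy script run from the repo root (destination directory taken from the error message above; paths assume a default CUDA install) could look like this:

```python
# Copy the CUDA 11.x runtime DLLs next to fastllm_tools.dll.
import os, shutil
cuda_bin = os.path.join(os.environ["CUDA_PATH"], "bin")
for dll in ["cudart64_110.dll", "cublas64_11.dll", "cublasLt64_11.dll"]:
    shutil.copy(os.path.join(cuda_bin, dll), r"tools\fastllm_pytools")
```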

## fastllm_pytools

### Error when releasing memory: CUDA error when release memory

**Symptom:**
The following error is reported on exit:
> Error: CUDA error when release memory!
> CUDA error = 4, cudaErrorCudartUnloading at fastllm/src/devices/cuda/fastllm-cuda.cu:1493
> 'driver shutting down'

**Cause:** When the Python interpreter terminates, it often tears down its own process first without destructing the third-party libraries it loaded, so by the time Python exits the CUDA runtime has already shut down and the GPU memory release fails. Since the GPU memory has usually been released by that point, this does not normally cause problems.

**Solution:** Explicitly call the `llm.release_memory()` method before the Python program exits.
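
A minimal sketch (the calls mirror tools/scripts/cli_demo.py in this repo; the model path is illustrative):

```python
from fastllm_pytools import llm

model = llm.model("model.flm")  # illustrative model path
# ... run inference ...
model.release_memory()          # free GPU memory while the CUDA runtime is still alive
```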
25 changes: 13 additions & 12 deletions pyfastllm/README.md
@@ -47,18 +47,19 @@ pyfastllm is a Python API implementation based on fastllm; with pyfastllm you can more
First fetch the pybind11 C++ dependency:

- ```sh
+ ```shell
git submodule init
git submodule update # fetch the pybind11 dependency
```

Manual C++ build:

- ```sh
+ ```shell
mkdir build-py
cd build-py
cmake .. -DUSE_CUDA=ON -DPY_API=ON
make -j
- cp pyfastllm*.so pyfastllm/examples/
+ cp fastllm*.so pyfastllm/examples/ # or place it in a directory on $PYTHONPATH
cd ../pyfastllm/examples/
python3 cli_simple.py -p chatglm-6b-int8.flm # output matches the C++ build
```
@@ -69,12 +70,12 @@ python3 cli_simple.py -p chatglm-6b-int8.flm # output matches the C++ build
First install pybind11:

- ```bash
+ ```shell
pip install pybind11
```

- GPU
- ```sh
+ ```shell
cd pyfastllm/
python3 setup.py build
python3 setup.py install
@@ -83,25 +84,25 @@ python3 cli_simple.py -p chatglm-6b-int8.flm
```

- CPU
- ```sh
+ ```shell
cd pyfastllm/
export USE_CUDA=OFF
python3 setup.py build
python3 setup.py install
cd examples/
python3 cli_simple.py -p chatglm-6b-int8.flm -t 8

```

## Usage

### Calling from Python
The examples folder contains several common code samples:

- examples/cli_simple.py: example of calling the API (recommended)
- examples/cli_low_api.py: low-level API call example
- examples/convert_model.py: model conversion example
- examples/web_api.py, demo/web_api_client.py: FastAPI web API usage
- examples/test_ops: usage examples and tests for some ops
+ - `examples/cli_simple.py`: example of calling the API (recommended)
+ - `examples/cli_low_api.py`: low-level API call example
+ - `examples/convert_model.py`: model conversion example
+ - `examples/web_api.py`, `examples/web_api_client.py`: FastAPI web API usage
+ - `examples/test_ops.py`: usage examples and tests for some ops

### Command-line tools

2 changes: 1 addition & 1 deletion pyfastllm/examples/cli_simple.py
@@ -79,7 +79,7 @@ def run_with_response(args):
print(f"{model.model_type}:", end=' ')
past_len = 0
for output in outputs:
- print(output[past_len:].strip(), end='', flush=True)
+ print(output[past_len:], end='', flush=True)
past_len = len(output)
print()
model.make_history(history, dialog_round, input_text, output)
15 changes: 14 additions & 1 deletion src/fastllm.cpp
@@ -34,14 +34,19 @@
#include "fastllm-cuda.cuh"
#endif

+ #ifdef PY_API
+ #include <pybind11/embed.h>
+ namespace py = pybind11;
+ #endif

namespace fastllm {
std::map <std::string, int> defaultDeviceMap;
Executor defaultExecutor;
Executor *curExecutor = &defaultExecutor;

static std::mutex globalLocker;
static int threads = 4;
- static ThreadPool *fastllmThreadPool = new ThreadPool(threads);
+ static ThreadPool *fastllmThreadPool = nullptr;
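+ // Now created lazily in GetPool(); per the commit message, building the pool during static initialization hung on some systems.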
static bool lowMemMode = false;
static bool kvCacheInCPU = false;

@@ -74,6 +79,9 @@ namespace fastllm {
}

void SetThreads(int t) {
+ #ifdef PY_API
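+ // Drop the GIL before locking and rebuilding the pool, so teardown cannot deadlock with Python threads (rationale inferred from the commit's hang fix).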
+ py::gil_scoped_release release;
+ #endif
globalLocker.lock();
threads = t;
if (fastllmThreadPool != nullptr) {
@@ -82,6 +90,9 @@
}
fastllmThreadPool = new ThreadPool(t);
globalLocker.unlock();
+ #ifdef PY_API
+ py::gil_scoped_acquire acquire;
+ #endif
}

void SetLowMemMode(bool m) {
Expand All @@ -101,6 +112,8 @@ namespace fastllm {
}

ThreadPool *GetPool() {
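+ // Create the thread pool on first use rather than at static-initialization time.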
+ if (fastllmThreadPool == nullptr)
+     SetThreads(threads);
return fastllmThreadPool;
}
#ifdef USE_MMAP
7 changes: 6 additions & 1 deletion tools/scripts/cli_demo.py
@@ -4,11 +4,15 @@
def args_parser():
parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
parser.add_argument('-p', '--path', type = str, required = True, default = '', help = 'path to the model file')
+ parser.add_argument('-t', '--threads', type=int, default=4, help='number of threads to use')
+ parser.add_argument('-l', '--low', action='store_true', help='use low-memory mode')
args = parser.parse_args()
return args

if __name__ == "__main__":
args = args_parser()
+ llm.set_cpu_threads(args.threads)
+ llm.set_cpu_low_mem(args.low)
model = llm.model(args.path)

history = []
@@ -26,4 +30,5 @@ def args_parser():
for response in model.stream_response(query, history = history):
curResponse += response
print(response, flush = True, end = "")
- history.append((query, curResponse))
+ history.append((query, curResponse))
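+ # Free memory explicitly before the interpreter exits to avoid 'CUDA error when release memory' (see docs/faq.md).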
+ model.release_memory()
