#  大模型部署 

- Xinference
- Ollama
- vllm https://github.com/owenliang/qwen-vllm

# Ollama 


```bash
# Option 1: install Ollama with the official install script.
curl -fsSL https://ollama.com/install.sh | sh

# Option 2: fetch the source tree and put an `ollama` binary on PATH.
git clone git@github.com:ollama/ollama.git
# NOTE(review): right after the clone, `ollama` is the checkout directory;
# this step presumably expects a built binary named `ollama` - confirm.
install ollama /usr/local/bin/ollama

pip install -r requirements.txt

# Start the server with both variables set for this invocation only.
OLLAMA_NUM_PARALLEL=16 OLLAMA_MODELS=/root/workspace/models ollama serve

# Download a model; replace xxx with the model name.
ollama pull xxx
```

编写一个 MODELFILE 
https://github.com/ollama/ollama/blob/main/docs/modelfile.md




```bash
# Clone llama.cpp and build inside the checkout. `&&` only enters the
# directory when the clone succeeded.
git clone https://github.com/ggerganov/llama.cpp.git && cd llama.cpp
make -j
```

```text
FROM PATH_TO_YOUR__GGUF_MODEL

# set the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1

# set the system message
SYSTEM """
You are Mario from Super Mario Bros. Answer as Mario, the assistant, only.
"""
```

`ollama create NAME -f ./Modelfile`

- `NAME`：在 Ollama 中显示的模型名称
- `./Modelfile`：Modelfile 的路径，绝对路径或相对路径均可


`ollama run NAME`

# llama.cpp

https://github.com/echonoshy/cgft-llm/blob/master/llama-cpp/README.md

```bash
git clone git@github.com:ggerganov/llama.cpp.git
cd llama.cpp
pip install -r requirements.txt
```

```bash
python convert-hf-to-gguf.py /root/autodl-tmp/models/Llama3-8B-Chinese-Chat --outfile /root/autodl-tmp/models/Llama3-8B-Chinese-Chat-GGUF/Llama3-8B-Chinese-Chat-q8_0-v1.gguf --outtype q8_0

cd ~/code/llama.cpp/build_cuda/bin
./quantize --allow-requantize /root/autodl-tmp/models/Llama3-8B-Chinese-Chat-GGUF/Llama3-8B-Chinese-Chat-q8_0-v2_1.gguf /root/autodl-tmp/models/Llama3-8B-Chinese-Chat-GGUF/Llama3-8B-Chinese-Chat-q4_1-v1.gguf Q4_1
```

# vLLM 

https://github.com/vllm-project/vllm

https://docs.vllm.ai/en/stable/

```bash
# Install or upgrade vLLM (options must follow the `install` subcommand).
pip install -U vllm

# Either run the wrapper script ...
bash vllm_server.sh

# ... or start the OpenAI-compatible API server directly.
python -m vllm.entrypoints.openai.api_server \
    --model /root/autodl-tmp/LLM-Research/Meta-Llama-3-8B-Instruct \
    --served-model-name llama3:8b-instruct-fp16 \
    --trust-remote-code \
    --max-model-len 4096 \
    --port 11434
```

# Xinference 


https://github.com/xorbitsai/inference

https://inference.readthedocs.io/zh-cn/latest/index.html

https://inference.readthedocs.io/zh-cn/latest/getting_started/environments.html

[FastGPT + Xinference：一站式本地 LLM 私有化部署和应用开发](https://zhuanlan.zhihu.com/p/677208959)


```bash
conda create --name xinference python=3.11
conda activate xinference
```

```bash
# Use the HTTPS mirror: pip refuses plain-HTTP indexes unless --trusted-host is given.
pip install "xinference[all]" -i https://mirrors.aliyun.com/pypi/simple/

# Shell assignments must not have spaces around `=`.
# NOTE(review): endpoint value assumed from the 9997 port noted below - adjust to your server.
export XINFERENCE_ENDPOINT=http://127.0.0.1:9997
export XINFERENCE_HOME=/path/to/xinference
export XINFERENCE_MODEL_SRC=modelscope
export HUGGING_FACE_HUB_TOKEN="your_hf_token"  # replace with your own token
xinference-local -H 0.0.0.0 # 0.0.0.0:9997
```

```bash
XINFERENCE_TRANSFORMERS_ENABLE_BATCHING=1 XINFERENCE_MODEL_SRC=modelscope XINFERENCE_HOME=/root/autodl-tmp/xinference_data xinference-local -H 0.0.0.0 --port 3000 --log-level debug
```



```bash
docker run -e XINFERENCE_MODEL_SRC=modelscope -p 9998:9997 --gpus all xprobe/xinference:v<your_version> xinference-local -H 0.0.0.0 --log-level debug
```

```bash
# Listen on all interfaces, port 6006 (host and port are separate options).
XINFERENCE_MODEL_SRC=modelscope xinference-local -H 0.0.0.0 --port 6006
```

分布式部署

```bash 
xinference-supervisor -H "${supervisor_host}" # Master Server 
xinference-worker -H "${worker_host}" --supervisor-address "${supervisor_host}:9997" # Slave Server 
```

# 内网穿透

## [Cloudflare Tunnel](https://www.cloudflare.com/zh-cn/products/tunnel/)


https://developers.cloudflare.com/cloudflare-one/ 没用的文档

https://one.dash.cloudflare.com/338bee983ce1c7bfefa380fd61b71069/networks/tunnels?search= 自己创建一个 Access Tunnel



```bash
docker run -d --name cloudflared --restart unless-stopped cloudflare/cloudflared:latest tunnel --no-autoupdate run --token <YOUR_TUNNEL_TOKEN>
```

```bash
# Download and install the cloudflared Debian package (no notebook `!` prefix in a shell).
wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb
dpkg -i cloudflared-linux-amd64.deb
# Authenticate, create a named tunnel, route a hostname to it, then run it.
cloudflared tunnel login
cloudflared tunnel create <TUNNEL_NAME>
cloudflared tunnel route dns <TUNNEL_NAME> <YOUR_DOMAIN>
cloudflared tunnel run <TUNNEL_NAME>
```


```text
tunnel: <TUNNEL_UUID>
credentials-file: /root/.cloudflared/<TUNNEL_UUID>.json
ingress:
  - hostname: <YOUR_DOMAIN>
    service: http://localhost:80

```


```bash
cloudflared --config ~/.cloudflared/config.yml tunnel run
```

https://sspai.com/post/79278

In [None]:
# cloudflare 


import subprocess
import threading
import time
import socket
import urllib.request

def iframe_thread(port):
  """Wait for a local server on ``port``, then expose it through cloudflared.

  Polls 127.0.0.1:``port`` every 0.5 seconds until a TCP connection succeeds,
  then starts ``cloudflared tunnel --url http://127.0.0.1:<port>`` and prints
  the URL part (from "http" onwards) of every stderr line that contains
  "trycloudflare.com ". Blocks until cloudflared's stderr stream ends.

  Args:
    port: TCP port of the local service to wait for and to tunnel.
  """
  while True:
    time.sleep(0.5)
    # The context manager closes the probe socket on every path, including
    # the successful connection that ends the loop.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
      if probe.connect_ex(('127.0.0.1', port)) == 0:
        break
  print("\nOmniPrase API finished loading, trying to launch cloudflared (if it gets stuck here cloudflared is having issues)\n")

  tunnel = subprocess.Popen(
      ["cloudflared", "tunnel", "--url", "http://127.0.0.1:{}".format(port)],
      # stdout is never read here; discarding it avoids a full pipe stalling the child.
      stdout=subprocess.DEVNULL,
      stderr=subprocess.PIPE)
  for raw_line in tunnel.stderr:
    text = raw_line.decode()
    if "trycloudflare.com " in text:
      print("This is the URL to access OmniPrase:", text[text.find("http"):], end='')


threading.Thread(target=iframe_thread, daemon=True, args=(8000,)).start()

!python server.py --host 127.0.0.1 --port 8000 --documents --media --web

## 防止关机

```bash
#!/bin/bash
#
# GPU keep-alive watchdog.
#
# Loops forever: reads the utilization of GPU 0 and, when it is below
# THRESHOLD, runs PYTHON_SCRIPT and logs the run to LOG_FILE. Typical use:
#   nohup ./this_script.sh > log.out 2>&1 &

# GPU utilization threshold in percent (integer).
THRESHOLD=5

# Seconds to wait after a run of the Python script.
INTERVAL=600

# Path of the Python script that generates GPU load.
PYTHON_SCRIPT="gpu_press.py"

# Path of the log file.
LOG_FILE="scedule-task.log"

# Number of successful runs of the Python script so far.
counter=0

while true; do
    # Query GPU 0 only, so the result is a single number even on multi-GPU hosts.
    GPU_USAGE=$(nvidia-smi --id=0 --query-gpu=utilization.gpu --format=csv,noheader,nounits)
    GPU_USAGE=${GPU_USAGE//[[:space:]]/}

    # Timestamp taken before the Python script runs, i.e. the start of this check.
    current_datetime=$(date '+%Y-%m-%d %H:%M:%S')

    # nvidia-smi may fail or print a non-numeric value; retry later instead of
    # feeding garbage into the comparison.
    if [[ ! "$GPU_USAGE" =~ ^[0-9]+$ ]]; then
        echo "$current_datetime - Could not read GPU utilization (got '$GPU_USAGE')." | tee -a "$LOG_FILE" >&2
        sleep 60
        continue
    fi

    # 10# forces base 10 so a zero-padded value is never read as octal.
    if (( 10#$GPU_USAGE < THRESHOLD )); then
        # Utilization is below the threshold: generate load.
        if python "$PYTHON_SCRIPT"; then
            # Count and log only runs that actually succeeded.
            ((counter++))
            echo "$current_datetime - Python script has been run $counter times." | tee -a "$LOG_FILE"
        else
            echo "$current_datetime - Python script failed with exit code $?." | tee -a "$LOG_FILE" >&2
        fi

        # Wait for the configured interval before checking again.
        sleep "$INTERVAL"
    else
        # The GPU is busy enough; check again in a minute.
        sleep 60
    fi
done

```

gpu_press.py
```python
import torch


def press_gpu(size=8192, iterations=1000, device='cuda:0', dtype=torch.float32):
    """Generate sustained compute load with repeated square matrix products.

    Args:
        size: Side length of the square matrices. The defaults allocate large
            matrices; lower it if the device runs out of memory.
        iterations: Number of multiply/accumulate/sigmoid rounds.
        device: Torch device the tensors are created on.
        dtype: Element type of the tensors.

    Returns:
        The accumulator tensor of shape (size, size) after the last round.
    """
    # Random (rather than zero) operands, as in the original script.
    acc = torch.randn(size, size, dtype=dtype, device=device)
    rhs = torch.randn(size, size, dtype=dtype, device=device)

    for _ in range(iterations):
        # In-place accumulate of rhs @ rhs^T, followed by an in-place sigmoid.
        acc.add_(rhs.matmul(rhs.transpose(0, 1)))
        torch.sigmoid_(acc)

    # Make sure all queued CUDA work has finished before returning.
    if torch.device(device).type == 'cuda':
        torch.cuda.synchronize(device)
    return acc


if __name__ == "__main__":
    if not torch.cuda.is_available():
        raise SystemExit("gpu_press.py: no CUDA device available")
    press_gpu()

```

`nohup /path/to/your_script.sh > /path/to/log.out 2>&1 &`

## 暴露多个服务

`wget https://autodl-public.ks3-cn-beijing.ksyuncs.com/tool/api-proxy/proxy_in_instance`


config.yaml
```yaml
proxies:
   - host_and_port: http://127.0.0.1:2000  # 要代理到的服务地址
     route_path: /v1/*                     # 匹配什么路由就转发到此地址

   - host_and_port: http://127.0.0.1:3000  # 可设置多组，转发到不同的host
     route_path: /v2/*

```

`chmod +x proxy_in_instance`

`./proxy_in_instance`

```python
from fastapi import FastAPI, Request, Response
import httpx

app = FastAPI()

@app.get("/{path:path}")
async def forward_request(path: str, request: Request):
    """Forward a GET request to the service on localhost:8000 and relay the reply.

    The query string is passed along, and the upstream body, status code and
    content type are returned unchanged, so non-JSON and error responses are
    relayed instead of raising while decoding JSON.
    """
    async with httpx.AsyncClient() as client:
        upstream = await client.get(
            f"http://localhost:8000/{path}",
            # multi_items() keeps repeated query keys.
            params=list(request.query_params.multi_items()),
        )
    return Response(
        content=upstream.content,
        status_code=upstream.status_code,
        media_type=upstream.headers.get("content-type"),
    )

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=YOUR_PROXY_PORT)  # replace YOUR_PROXY_PORT with the port the proxy should listen on

```

`pip install fastapi uvicorn httpx`

