<a href="https://colab.research.google.com/github/uppercaveman/vllm-learn/blob/main/vllm_llama3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 检查Python环境

In [1]:
!pip --version

pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)


In [2]:
!pip3 --version

pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)


# 查看GPU信息

In [3]:
# Select and connect to a T4 GPU runtime before running this cell.
# nvidia-smi is NVIDIA's command-line tool for monitoring and managing NVIDIA GPU devices.
!nvidia-smi

Thu Apr 25 05:41:35 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# More detailed memory information.
# Memory footprint support libraries/code.
# Force-create a symlink so that /usr/bin/nvidia-smi points at /opt/bin/nvidia-smi
# (presumably so GPUtil can find nvidia-smi on PATH — confirm).
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# Quietly install GPUtil, which is imported below as `GPU`.
!pip install -q gputil

import psutil
import humanize
import os
import GPUtil as GPU

# Query the GPUs visible to GPUtil.
GPUs = GPU.getGPUs()
# XXX: Colab provides at most one GPU and does not guarantee one; the index
# below raises IndexError when the list is empty (e.g. on a CPU-only runtime).
gpu = GPUs[0]
def printm():
    """Print host RAM, process size, and GPU memory figures.

    The first line reports the available system memory and the resident set
    size of the current process, both formatted by ``humanize.naturalsize``.
    The second line reports free/used/total memory and memory utilisation of
    the module-level ``gpu`` object.

    NOTE(review): ``gpu`` is captured once at module level; GPUtil objects
    appear to be snapshots, so repeated calls may print stale GPU numbers
    unless ``gpu`` is refreshed via ``GPU.getGPUs()`` — confirm.
    """
    process = psutil.Process(os.getpid())
    ram_free = humanize.naturalsize(psutil.virtual_memory().available)
    proc_size = humanize.naturalsize(process.memory_info().rss)

    # gpu.memoryUtil is a 0-1 fraction: convert it to a percentage exactly once.
    # (The previous code multiplied by 100 twice, inflating the value 100x.)
    util_percent = gpu.memoryUtil * 100

    print(f"Gen RAM Free: {ram_free} | Proc size: {proc_size}")
    print(
        f"GPU RAM Free: {gpu.memoryFree:.0f}MB | Used: {gpu.memoryUsed:.0f}MB"
        f" | Util {util_percent:3.0f}% | Total {gpu.memoryTotal:.0f}MB"
    )
printm()

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
Gen RAM Free: 12.5 GB | Proc size: 99.7 MB
GPU RAM Free: 15101MB | Used: 0MB | Util   0% | Total 15360MB


In [5]:
# Show the CUDA compiler (nvcc) version information.
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


# 安装vllm

In [6]:
# Install from source: download the vLLM code into the directory `vllmsrc`.
# NOTE(review): the clone is unpinned (default branch) and git will refuse to
# clone if `vllmsrc` already exists, so re-running this cell is not idempotent.
!git clone https://github.com/vllm-project/vllm.git vllmsrc

Cloning into 'vllmsrc'...
remote: Enumerating objects: 15789, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (131/131), done.[K
remote: Total 15789 (delta 51), reused 61 (delta 21), pack-reused 15637[K
Receiving objects: 100% (15789/15789), 8.04 MiB | 21.95 MiB/s, done.
Resolving deltas: 100% (11809/11809), done.


In [7]:
# Install vLLM in editable mode from the cloned `vllmsrc` directory.
# NOTE(review): a later `import vllm` in this notebook fails with
# ModuleNotFoundError; editable installs are usually picked up only when the
# interpreter starts, so the runtime may need a restart after this cell — confirm.
!cd vllmsrc && pip3 install -e .

Obtaining file:///content/vllmsrc
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ninja (from vllm==0.4.1+cu122)
  Using cached ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
Collecting fastapi (from vllm==0.4.1+cu122)
  Downloading fastapi-0.110.2-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard] (from vllm==0.4.1+cu122)
  Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting tiktoken==0.6.0 (from vllm==0.4.1+cu122)
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manyl

In [8]:
# Check that vLLM is installed and show its package metadata.
!pip3 show vllm

Name: vllm
Version: 0.4.1+cu122
Summary: A high-throughput and memory-efficient inference and serving engine for LLMs
Home-page: https://github.com/vllm-project/vllm
Author: vLLM Team
Author-email: 
License: Apache 2.0
Location: /usr/local/lib/python3.10/dist-packages
Editable project location: /content/vllmsrc
Requires: cmake, fastapi, filelock, lm-format-enforcer, ninja, numpy, nvidia-ml-py, outlines, prometheus-client, psutil, py-cpuinfo, pydantic, ray, requests, sentencepiece, tiktoken, tokenizers, torch, transformers, typing-extensions, uvicorn, vllm-nccl-cu12, xformers
Required-by: 


In [9]:
# Print the CUDA version reported by the installed PyTorch build
# (this is torch.version.cuda, not the PyTorch package version itself).
import torch
print(torch.version.cuda)

12.1


# 测试

In [11]:
import vllm
from vllm import LLM, SamplingParams

ModuleNotFoundError: No module named 'vllm'

In [None]:
# Sampling configuration used for generation, followed by the batch of input prompts.
sampling_params = SamplingParams(top_p=0.95, temperature=0.8)
prompts = [
    "你好，我的名字是", "中国首都是", "北京水库是", "AI的未来是",
]

ImportError: cannot import name 'LLM' from 'vllm' (unknown location)

In [None]:
# Load the model into a vLLM engine.
# The runtime selected above is a Tesla T4, which does not support bfloat16;
# the Llama-3 checkpoint defaults to bfloat16, so request float16 explicitly.
# NOTE(review): an 8B-parameter model in float16 is roughly 16 GB of weights,
# which likely exceeds the T4's 15 GB — confirm; a smaller or quantized
# checkpoint may be required. The meta-llama repository is also gated and
# presumably needs an authenticated Hugging Face token.
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct", dtype="half")

# 排查 Import "vllm" could not be resolved 问题

In [None]:
# Troubleshooting attempt: install the published vllm package with pip.
# NOTE(review): this is unpinned and runs after the editable install from
# `vllmsrc` earlier in the notebook — confirm which of the two installs is intended.
!pip install vllm



In [None]:
!echo $PYTHONPATH

/env/python


In [None]:
# Set the PYTHONPATH environment variable and echo it back.
# Alternative that also lists dist-packages (kept disabled):
# %env PYTHONPATH=/env/python:/usr/local/lib/python3.10/dist-packages
# NOTE(review): PYTHONPATH is normally read at interpreter start-up, so this
# likely affects only child processes, not imports in the running kernel — confirm.
%env PYTHONPATH=/env/python
!echo $PYTHONPATH

env: PYTHONPATH=/env/python
/env/python


In [None]:
!whereis python

python: /usr/local/bin/python


In [None]:
!python --version

Python 3.10.12


In [None]:
!whereis pip

pip: /usr/local/bin/pip


In [None]:
!pip --version

In [None]:
# Rename the clone directory from `vllm` to `vllmsrc` (presumably so a local
# ./vllm directory does not shadow the installed package on import — confirm).
# NOTE(review): the clone cell above already clones into `vllmsrc`, so on a
# fresh run there is no `vllm` directory to move and this command will fail.
!mv vllm vllmsrc