In [3]:
import argparse
import os
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
from threading import Thread
from sse_starlette.sse import EventSourceResponse
import torch

In [4]:
torch.cuda.is_available()

True

参数设置

In [5]:
class Args:
    def __init__(self, base_model=None, lora_model=None, tokenizer_path=None, gpus="0",
                 load_in_8bit=False, load_in_4bit=False, only_cpu=False, alpha="1.0"):
        self.base_model = base_model
        self.lora_model = lora_model
        self.tokenizer_path = tokenizer_path
        self.gpus = gpus
        self.load_in_8bit = load_in_8bit
        self.load_in_4bit = load_in_4bit
        self.only_cpu = only_cpu
        self.alpha = alpha

# 在这里设置你的参数
args = Args(
    base_model="../model_datas/chinese-alpaca-2-7b-hf",
    # base_model="/media/vkeilo/game/github_project/model_datas/chinese-alpaca-2-7b-hf",
    lora_model=None,  # 或保持为None，根据你的需要
    tokenizer_path=None,
    gpus="0,1",
    load_in_8bit=False,
    load_in_4bit=False,
    only_cpu=True,
    alpha="1.0"
)

In [6]:
if args.only_cpu is True:
    args.gpus = ""
    if args.load_in_8bit or args.load_in_4bit:
        raise ValueError("Quantization is unavailable on CPU.")
if args.load_in_8bit and args.load_in_4bit:
    raise ValueError("Only one quantization method can be chosen for inference. Please check your arguments")
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

import torch
import torch.nn.functional as F
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    GenerationConfig,
    TextIteratorStreamer,
    BitsAndBytesConfig
)
import sys

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# openai_api_server.py 的父目录
sys.path.append('/media/vkeilo/game/github_project/Chinese-LLaMA-Alpaca-2/scripts')
sys.path.append('/media/vkeilo/game/github_project/Chinese-LLaMA-Alpaca-2/scripts/openai_server_demo')

In [7]:
sys.path.append('../Chinese-LLaMA-Alpaca-2/scripts')
sys.path.append('../Chinese-LLaMA-Alpaca-2/scripts/openai_server_demo')

In [8]:
from attn_and_long_ctx_patches import apply_attention_patch, apply_ntk_scaling_patch

apply_attention_patch(use_memory_efficient_attention=True)
apply_ntk_scaling_patch(args.alpha)

from openai_api_protocol import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatMessage,
    ChatCompletionResponseChoice,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    EmbeddingsRequest,
    EmbeddingsResponse,
    ChatCompletionResponseStreamChoice,
    DeltaMessage,
)

load_type = torch.float16
if torch.cuda.is_available():
    device = torch.device(0)
else:
    device = torch.device("cpu")
if args.tokenizer_path is None:
    args.tokenizer_path = args.lora_model
    if args.lora_model is None:
        args.tokenizer_path = args.base_model
tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path, legacy=True)
if args.load_in_4bit or args.load_in_8bit:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=args.load_in_4bit,
        load_in_8bit=args.load_in_8bit,
        bnb_4bit_compute_dtype=load_type,
    )
base_model = LlamaForCausalLM.from_pretrained(
    args.base_model,
    torch_dtype=load_type,
    low_cpu_mem_usage=True,
    device_map='auto' if not args.only_cpu else None,
    load_in_4bit=args.load_in_4bit,
    load_in_8bit=args.load_in_8bit,
    quantization_config=quantization_config if (args.load_in_4bit or args.load_in_8bit) else None
    # vkeilo add it
    # output_hidden_states=True
)

Xformers is not installed correctly. If you want to use memory_efficient_attention use the following command to install Xformers
pip install xformers.
USE_MEM_EFF_ATTENTION:  False
STORE_KV_BEFORE_ROPE: False
Apply NTK scaling with ALPHA=1.0
The value of scaling factor will be read from model config file, or set to 1.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.27s/it]


#### 查看base模型架构

In [9]:
print(base_model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(55296, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [10]:
e_matrix = base_model.model.embed_tokens.weight.data
u_matrix = base_model.lm_head.weight.data

In [11]:
u_matrix.t().shape

torch.Size([4096, 55296])

In [10]:
# 以k矩阵为例，所取出的128*4096对应 k向量维度*hidden_size 与attention论文中矩阵中的k矩阵为转置关系
# o矩阵不同，所取出的为 hidden_size*（32*128）的矩阵
k_matrixes = torch.empty(0)
q_matrixes = torch.empty(0)
v_matrixes = torch.empty(0)
o_matrixes = torch.empty(0)
for layer in base_model.model.layers:
    tmp_k_matrixes = layer.self_attn.k_proj.weight.data
    tmp_q_matrixes = layer.self_attn.q_proj.weight.data
    tmp_v_matrixes = layer.self_attn.v_proj.weight.data
    tmp_o_matrixes = layer.self_attn.o_proj.weight.data
    tmp_k_matrixes = tmp_k_matrixes.reshape(1,32,128,4096)
    tmp_q_matrixes = tmp_q_matrixes.reshape(1,32,128,4096)
    tmp_v_matrixes = tmp_v_matrixes.reshape(1,32,128,4096)
    tmp_o_matrixes = tmp_o_matrixes.reshape(1,4096,32,128).permute(0, 2, 1, 3)
    k_matrixes = torch.cat((k_matrixes, tmp_k_matrixes), dim=0)
    q_matrixes = torch.cat((q_matrixes, tmp_q_matrixes), dim=0)
    v_matrixes = torch.cat((v_matrixes, tmp_v_matrixes), dim=0)
    o_matrixes = torch.cat((o_matrixes, tmp_o_matrixes), dim=0)



注意，在论文《Attention is all you need》中，$score = qk^T$的前提是$q_1 = x_1W_Q$,
而在论文《A Mathematical Framework for Transformer Circuits》中，$w_{QK}=W_Q^TW_K$是因为$k_i=W_Kx_i$
两篇论文的k,q,v矩阵互为转置关系，代码以后者为基准。
W_Q矩阵的行数为128（中间向量维度），列数为4096（hidden_size）

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device_cpu = torch.device('cpu')

In [12]:
device

device(type='cuda')

qk矩阵的特征值计算

In [16]:
import torch
import numpy as np
from concurrent.futures import ThreadPoolExecutor
# 导入clear_output
# from IPython.display import clear_output
# 定义处理函数
def process_attention_head(args):
    layer_idx, head_idx, q_matrixes, k_matrixes, attention_heads_list = args
    # 提取对应层和头的q和k矩阵
    q_matrix = q_matrixes[layer_idx, head_idx, :, :].reshape(128, 4096).to(device)
    k_matrix = k_matrixes[layer_idx, head_idx, :, :].reshape(128, 4096).to(device)

    # 计算q^Tk，注意这里的q和k都是已经reshape过的，所以需要转置k来进行矩阵乘法
    q_k_product = torch.matmul(q_matrix.t(), k_matrix)

    # 计算特征值和特征向量
    eigenvalues, eigenvectors = torch.linalg.eig(q_k_product)
    print(f'layer{layer_idx} head {head_idx} processed')
    attention_heads_list[head_idx] = {
        'eigenvalues': eigenvalues.to('cpu').detach().numpy(),
        'eigenvectors': eigenvectors.permute(1,0).to('cpu').detach().numpy()
    }
    return 

def run_qk(layer_idx,attention_heads_list_one_layer):
    count = 0
    # 创建 ThreadPoolExecutor，并指定线程数
    with ThreadPoolExecutor(max_workers=4) as executor:
        # 提交任务到线程池
        futures = []
        for head_idx in range(32):
            args = (layer_idx, head_idx, q_matrixes, k_matrixes, attention_heads_list_one_layer)
            future = executor.submit(process_attention_head, args)
            futures.append(future)
        
        # 等待所有任务完成
        for future in futures:
            future.result()
            count += 1
            # clear_output(wait=True)
            print(f"Processed {count} /1024 tasks")


# 在此之前需要定义q_matrixes, k_matrixes 和 device
# 构建32*32的列表，对应32层下的32个注意力头
for layer_idx in range(0,5):
    attention_heads_list = np.array([{} for _ in range(32)])
    run_qk(layer_idx, attention_heads_list)
    with open(f'datas/QK_datas/QK_arraylayer_{layer_idx}.npy', 'wb') as f:
        np.save(f, attention_heads_list)
    del attention_heads_list
    # attention_heads_list = np.array([[{} for _ in range(32)] for _ in range(32)])
# # 运行函数
# run_qk()
# # 保存array变量attention_heads_list到文件中
# with open('datas/QK_array.npy', 'wb') as f:
#     np.save(f, attention_heads_list)


layer0 head 3 processed
layer0 head 0 processed
layer0 head 2 processed
layer0 head 1 processed
Processed 1 /1024 tasks
Processed 2 /1024 tasks
Processed 3 /1024 tasks
Processed 4 /1024 tasks
layer0 head 4 processed
Processed 5 /1024 tasks
layer0 head 6 processed
layer0 head 7 processed
layer0 head 5 processed
Processed 6 /1024 tasks
Processed 7 /1024 tasks
Processed 8 /1024 tasks
layer0 head 8 processed
Processed 9 /1024 tasks
layer0 head 9 processed
Processed 10 /1024 tasks
layer0 head 10 processed
Processed 11 /1024 tasks
layer0 head 11 processed
Processed 12 /1024 tasks
layer0 head 12 processed
Processed 13 /1024 tasks
layer0 head 13 processed
Processed 14 /1024 tasks
layer0 head 15 processed
layer0 head 14 processed
Processed 15 /1024 tasks
Processed 16 /1024 tasks
layer0 head 16 processed
Processed 17 /1024 tasks
layer0 head 19 processed
layer0 head 17 processed
Processed 18 /1024 tasks
layer0 head 18 processed
Processed 19 /1024 tasks
Processed 20 /1024 tasks
layer0 head 20 proc

ov矩阵的特征值计算

In [15]:
import torch
import numpy as np
from concurrent.futures import ThreadPoolExecutor
def process_attention_head_ov(args):
    layer_idx, head_idx, o_matrixes, v_matrixes, attention_heads_list_ov = args
    # 提取对应层和头的q和k矩阵
    o_matrix = o_matrixes[layer_idx, head_idx, :, :].reshape(128, 4096).to(device)
    v_matrix = v_matrixes[layer_idx, head_idx, :, :].reshape(4096, 128).to(device)
    
    # 计算O*V，注意这里的q和k都是已经reshape过的，所以需要转置k来进行矩阵乘法
    eovu_product = torch.matmul(e_matrix.float().to(device), torch.matmul(torch.matmul(o_matrix.t(), v_matrix.t()), u_matrix.float().to(device).t()))
    # eovu_product = torch.matmul(o_matrix.t(), v_matrix.t())

    

    # 计算特征值和特征向量
    eigenvalues = torch.linalg.eigvals(eovu_product.float())
    print(f'layer{layer_idx} head {head_idx} processed')
    attention_heads_list_ov[head_idx] = {
        'eigenvalues': eigenvalues.to('cpu').detach().numpy(),
    }
    return

def run_ov(layer_idx,attention_heads_list_one_layer):
    count = 0
    # 创建 ThreadPoolExecutor，并指定线程数
    with ThreadPoolExecutor(max_workers=4) as executor:
        # 提交任务到线程池
        futures = []
        for head_idx in range(32):
            args = (layer_idx, head_idx, o_matrixes, v_matrixes, attention_heads_list_one_layer)
            future = executor.submit(process_attention_head_ov, args)
            futures.append(future)
        
        # 等待所有任务完成
        for future in futures:
            future.result()
            count += 1
            # clear_output(wait=True)
            print(f"Processed {count} /1024 tasks")


# 在此之前需要定义q_matrixes, k_matrixes 和 device
# 构建32*32的列表，对应32层下的32个注意力头
for layer_idx in range(1):
    attention_heads_list_ov = np.array([{} for _ in range(32)])
    run_ov(layer_idx, attention_heads_list_ov)
    with open(f'datas/OV_datas/OV_arraylayer_{layer_idx}.npy', 'wb') as f:
        np.save(f, attention_heads_list_ov)
    del attention_heads_list_ov

RuntimeError: The expanded size of the tensor (4096) must match the existing size (55296) at non-singleton dimension 1.  Target sizes: [128, 4096].  Tensor sizes: [128, 55296]

In [None]:
# 保存array变量attention_heads_list到文件中
with open('datas/QK_array.npy', 'wb') as f:
    np.save(f, attention_heads_list)


In [15]:
tmp_q_matrix = q_matrixes[0, 0, :, :].reshape(128, 4096)
tmp_k_matrix = k_matrixes[0, 0, :, :].reshape(128, 4096)
tq = tmp_q_matrix.clone().detach()
tk = tmp_q_matrix.clone().detach()

In [49]:
mat = torch.matmul(tq.t(),tk)

In [50]:
ta,tb = torch.linalg.eig(mat)

In [53]:
ta.shape

torch.Size([4096])

In [48]:
# 取第一个特征值和特征向量
first_eigenvalue = ta[:, 0]
first_eigenvector = tb[:, 0]

# 验证 Ax = λx
Ax = torch.matmul(mat, first_eigenvector)
Ax_over_lambda = Ax / first_eigenvalue

# 打印验证结果
print("验证结果:", torch.allclose(Ax_over_lambda, first_eigenvector))

# 打印特征值和特征向量
print("第一个特征值:", first_eigenvalue)
print("第一个特征向量:", first_eigenvector)

IndexError: too many indices for tensor of dimension 1

In [59]:
tb[:,0].shape

torch.Size([4096])

In [54]:
lx = ta[0]*tb[:,0]

In [60]:
tb[:,0]

tensor([-0.0082+0.j, -0.0201+0.j,  0.0175+0.j,  ...,  0.0108+0.j,  0.0203+0.j,
        -0.0167+0.j])

In [61]:
lx

tensor([-14.1930+0.j, -34.7486+0.j,  30.3795+0.j,  ...,  18.7687+0.j,  35.1911+0.j,
        -28.8638+0.j])

In [63]:
ax

tensor([-14.1924, -34.7503,  30.3795,  ...,  18.7687,  35.1912, -28.8638])

In [62]:
ax = torch.matmul(mat,tb[:,0].real)

In [42]:
print(mat.shape)
print(tb[0].shape)

torch.Size([4096, 4096])
torch.Size([4096])


In [40]:
tb[0].shape

torch.Size([4096])