[https://huggingface.co/BAAI/bge-large-zh-v1.5](https://huggingface.co/BAAI/bge-large-zh-v1.5)

# 下载模型

获取 hugging face access token：[https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)

In [1]:
import os
from dotenv import find_dotenv, load_dotenv

# 加载环境变量
load_dotenv(find_dotenv())
HUGGING_FACE_ACCESS_TOKEN = os.getenv('HUGGING_FACE_ACCESS_TOKEN')

In [2]:
from huggingface_hub import snapshot_download

repo_id = "BAAI/bge-large-zh-v1.5"  # 模型在huggingface上的名称
local_dir = f"{repo_id.split('/')[0]}/{repo_id.split('/')[1]}"  # 本地模型存储的地址
local_dir_use_symlinks = False  # 本地模型使用文件保存，而非blob形式保存

# 注意：在hugging face上生成的自己的 access token，否则模型下载会中断
token = HUGGING_FACE_ACCESS_TOKEN

# 开始下载
snapshot_download(
    repo_id=repo_id,
    local_dir=local_dir,
    local_dir_use_symlinks=local_dir_use_symlinks,
    token=token,
)

  from .autonotebook import tqdm as notebook_tqdm


'C:\\Users\\92047\\.cache\\huggingface\\hub\\models--BAAI--bge-large-zh-v1.5\\snapshots\\c11661ba3f9407eeb473765838eb4437e0f015c0'

# 测试模型

## Usage for Embedding Model

### Using Sentence-Transformers

In [3]:
from sentence_transformers import SentenceTransformer
sentences_1 = ["样例数据-1", "样例数据-2"]
sentences_2 = ["样例数据-3", "样例数据-4"]
model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)
embeddings_2 = model.encode(sentences_2, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.85533345 0.85206324]
 [0.874563   0.85579395]]


### Using Langchain

In [4]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
model_name = "BAAI/bge-large-zh-v1.5"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="为这个句子生成表示以用于检索相关文章："
)
model.query_instruction = "为这个句子生成表示以用于检索相关文章："

model.embed_query("样例数据-1")

[-0.002105984603986144,
 0.017445463687181473,
 -0.021290790289640427,
 0.02465277723968029,
 0.033632542937994,
 0.004217152949422598,
 0.047564711421728134,
 -0.06113917753100395,
 -0.027129819616675377,
 -0.022905241698026657,
 -0.031763479113578796,
 0.00768717285245657,
 -0.011819642037153244,
 -0.021503319963812828,
 -0.02679901197552681,
 -0.01015820074826479,
 -0.009691908024251461,
 0.005693728569895029,
 0.02409503422677517,
 0.011491154320538044,
 0.057180240750312805,
 0.002730622189119458,
 -0.05199157074093819,
 -0.03349972516298294,
 0.021234067156910896,
 -0.023961767554283142,
 -0.059967655688524246,
 -0.006027083843946457,
 0.04831854626536369,
 -0.05034565180540085,
 -0.025010764598846436,
 0.021168118342757225,
 -0.02665308117866516,
 0.020554307848215103,
 -0.018492521718144417,
 0.014780467376112938,
 -0.027281595394015312,
 -0.05108494311571121,
 -0.02139364928007126,
 0.005647552665323019,
 0.014192860573530197,
 0.05098239704966545,
 0.02886533923447132,
 0.025

### Using Huggingface transformers

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
# Sentences we want sentence embeddings for
sentences = ["样例数据-1", "样例数据-2"]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-large-zh-v1.5')
model.eval()

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
# for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
    # Perform pooling. In this case, cls pooling.
    sentence_embeddings = model_output[0][:, 0]
# normalize embeddings
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentence embeddings: tensor([[ 0.0015,  0.0165, -0.0281,  ..., -0.0309,  0.0297, -0.0327],
        [ 0.0151,  0.0041, -0.0157,  ..., -0.0281,  0.0408, -0.0251]])
