In [1]:
# Read the corpus file into memory: one entry per line of the file, with
# leading/trailing whitespace removed from each entry.
with open('data/all_text.txt', mode='r', encoding='utf-8') as file:
    lines = [raw_line.strip() for raw_line in file]

In [2]:
import re
# Matches an http:// or https:// prefix followed by one or more characters
# from the allowed URL character set (letters, digits, selected punctuation
# and %XX escapes).
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Boilerplate phrases that are deleted wherever they occur in the text.
_BOILERPLATE_PATTERN = re.compile(
    r'扫描下方二维码关注公众号|提取码|关注|科学上网|回复关键词|侵权|版权|致谢|引用|LICENSE'
    r'|组队打卡|任务打卡|组队学习的那些事|学习周期|开源内容|打卡|组队学习|链接'
)


def remove_urls(text):
    """Remove URLs and boilerplate phrases from ``text``.

    URLs are deleted first; the boilerplate phrases are then deleted from the
    URL-free result. Every match is replaced with the empty string, so any
    surrounding whitespace is left in place.

    Args:
        text: The input string to clean.

    Returns:
        The cleaned string.
    """
    text_without_urls = _URL_PATTERN.sub('', text)
    # The phrase removal must run on the URL-free text; running it on the
    # original ``text`` would discard the URL removal done above.
    return _BOILERPLATE_PATTERN.sub('', text_without_urls)
from langchain.schema import Document
# Turn every corpus entry into a Document. Each entry is stripped, cleaned
# with remove_urls, and only the text before the first tab is kept as content.
# The entry's position in `lines` is recorded as its id.
docs = [
    Document(
        page_content=remove_urls(raw_entry.strip()).partition("\t")[0],
        metadata={"id": idx, "source": "all_text.txt"},
    )
    for idx, raw_entry in enumerate(lines)
]

In [None]:
# Preview only the first few documents; displaying the whole corpus floods
# the cell output and bloats the saved notebook.
docs[:5]

In [4]:
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

top_k = 20  # number of documents requested from the retriever per query
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = '/mnt/workspace/.cache/modelscope/hub/maidalun/bce-embedding-base_v1'
model_kwargs = {'device': device}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    # HuggingFaceBgeEmbeddings prepends a BGE-specific English instruction to
    # every query by default. The model loaded here is bce-embedding, which is
    # documented as needing no instruction, so the prefix is disabled.
    # Document embeddings are not affected by this setting.
    query_instruction="",
)

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Embed a copy of the corpus with `hf` and build a Chroma vector store that
# uses "db_car_chroma_bce" as its persistence directory.
vectorstore = Chroma.from_documents(
    documents=list(docs),
    embedding=hf,
    persist_directory="db_car_chroma_bce",
)

In [None]:
question = "怎样加热座椅？"

# Similarity-search retriever over the Chroma store, returning top_k documents.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=dict(k=top_k))
retri_re = retriever.invoke(question)

In [None]:
from langchain_community.vectorstores import FAISS
# Build a FAISS index over the same documents, using the same embedding model.
db = FAISS.from_documents(documents=docs, embedding=hf)

In [None]:
# One path for both saving and loading, so the reload reads the index that
# was just written rather than an unrelated directory.
faiss_dir = "db_car_faiss_bce"
db.save_local(faiss_dir)

# The saved index includes a pickled docstore, so load_local requires an
# explicit opt-in to deserialize it. This is acceptable here because the
# files were written by this notebook; never enable it for untrusted indexes.
new_db = FAISS.load_local(faiss_dir, hf, allow_dangerous_deserialization=True)

# NOTE(review): this rebinds `docs` (the corpus built earlier) to the search
# hits, so re-running earlier indexing cells afterwards would index the hits
# instead of the corpus. Consider a separate name if nothing depends on it.
docs = new_db.similarity_search("怎样加热座椅？")

In [6]:
from langchain_community.chat_models import ChatTongyi
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv, then build the Tongyi chat
# client. The API key is read from the "api_key" environment variable
# (None is passed when the variable is not set).
load_dotenv()
llm = ChatTongyi(model="qwen-max-0428", api_key=os.environ.get("api_key"))

In [7]:
# Smoke test for the chat model: a system instruction asking for
# Chinese-to-English translation, followed by one user sentence.
system_turn = ("system", "你是一名专业的翻译家，可以将用户的中文翻译为英文。")
human_turn = ("human", "我喜欢编程3。")
messages = [system_turn, human_turn]
llm.invoke(messages)

AIMessage(content='I like programming.', response_metadata={'model_name': 'qwen-max-0428', 'finish_reason': 'stop', 'request_id': 'f903fe31-3a5f-9ff8-acfc-1406d4fb40ce', 'token_usage': {'input_tokens': 31, 'output_tokens': 4, 'total_tokens': 35}}, id='run-8692bf5d-22f4-4250-8f2c-48658ad755ec-0')

In [8]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import render_text_description
# System prompt asking the model to answer with a JSON blob that names a tool
# and its arguments.
# NOTE(review): the text announces a list of tool names and descriptions, but
# no tool descriptions are interpolated into it, so the model has no tools to
# choose from. Confirm whether a rendered tool list was meant to be inserted.
system_prompt = """\
You are an assistant that has access to the following set of tools. Here are the names and descriptions for each tool:

Given the user input, return the name and input of the tool to use. Return your response as a JSON blob with 'name' and 'arguments' keys.

The `arguments` should be a dictionary, with keys corresponding to the argument names and the values corresponding to the requested values.
"""

# Two-message chat template: the fixed system prompt, then the user's input.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("user", "{input}"),
    ]
)

In [9]:
# Pipe the prompt into the model and ask a single question.
chain = prompt | llm
message = chain.invoke({"input": "4的平方是多少？"})

# Show the raw model output. A plain LLM yields a str, which is printed
# directly; otherwise the result is a chat message and its .content is printed.
print(message if isinstance(message, str) else message.content)

```json
{
  "name": "calculate_square",
  "arguments": {
    "number": 4
  }
}
```
