In [1]:
import os

In [3]:
from langchain.document_loaders import DirectoryLoader

# 加载 DOCS 文档
docs_loader = DirectoryLoader('./apitable_devbook/docs', glob="**/*.md", use_multithreading=True, show_progress=True)

docs = docs_loader.load()

## 删除 DOCS 空文档
for index in [ index for index in range(0, len(docs)) if docs[index].page_content == "" ]:
    del docs[index]

for index in range(0, len(docs)):
    docs[index].metadata['source_file'] =  docs[index].metadata['source']
    # 提取出文件名称，例如从 apitable_devbook/current/list-the-team-members.md 提取出 list-the-team-members
    filename = os.path.splitext(os.path.basename(docs[index].metadata['source']))[0]
    docs[index].metadata['source'] = 'https://developers.apitable.com/api/' + filename

# 加载 API refrence 文档
reference_loader = DirectoryLoader('./apitable_devbook/reference', glob="**/*.md", use_multithreading=True, show_progress=True)
reference = reference_loader.load()

## 删除 API refrence 空文档
for index in [ index for index in range(0, len(reference)) if reference[index].page_content == "" ]:
    del reference[index]

for index in range(0, len(reference)):
    reference[index].metadata['source_file'] =  reference[index].metadata['source']
    reference[index].metadata['source'] = 'https://developers.apitable.com/api/reference'

# 将 DOCS 文档和 API refrence 文档合并
documents = docs + reference
len(documents)

100%|██████████| 34/34 [00:01<00:00, 18.00it/s]
100%|██████████| 11/11 [00:00<00:00, 12.36it/s]


45

In [5]:
# 这一步是为了将文档分割成小块，以便于后续的向量化，这主要是是因为 OpenAI 的向量化服务有长度限制
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
embed_docs = text_splitter.split_documents(documents)
len(embed_docs)

Created a chunk of size 2226, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000
Created a chunk of size 7906, which is longer than the specified 1000
Created a chunk of size 1302, which is longer than the specified 1000
Created a chunk of size 2437, which is longer than the specified 1000
Created a chunk of size 2149, which is longer than the specified 1000
Created a chunk of size 1514, which is longer than the specified 1000
Created a chunk of size 1619, which is longer than the specified 1000
Created a chunk of size 1831, which is longer than the specified 1000
Created a chunk of size 1426, which is longer than the specified 1000
Created a chunk of size 1102, which is longer than the specified 1000


169

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# 向量化
embeddings = OpenAIEmbeddings()
persist_directory = 'db'
vectordb = Chroma.from_documents(embed_docs, embeddings, persist_directory=persist_directory)

# 持久化向量化结果到本地
vectordb.persist()
vectordb = None

Using embedded DuckDB with persistence: data will be stored in: db


In [8]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain
persist_directory = 'db'
embeddings = OpenAIEmbeddings()

# 从本地加载向量化结果
docsearch = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

qa = RetrievalQAWithSourcesChain.from_chain_type(ChatOpenAI(temperature=0), chain_type="stuff", retriever=docsearch.as_retriever(search_kwargs={"k": 3}))

Using embedded DuckDB with persistence: data will be stored in: db


In [9]:
# score 是代表向量计算两个内容的距离，越低越好
query = "我怎么获取表格中的记录？"
similar_docs = docsearch.similarity_search_with_score(query)
similar_docs

[(Document(page_content='```js\nimport { APITable } from \'apitable\';\nconst apitable = new APITable({\n  token: \'Your API Token\',\n});\nconst datasheet = apitable.datasheet("dstWUHwzTHd2YQaXEE");\n// Get records in pages, return to the first page by default\ndatasheet.records.query({ viewId: "viw4mnkqkaqdh"}).then(response => {\nif (response.success) {\n  console.log(response.data.records);\n} else {\n  console.error(response);\n}\n});\n// Automatically handles paging and iteratively returns all records.\nconst recordsIter = datasheet.records.queryAll({ viewId: "viw4mnkqkaqdh"})\n// for await needs to be run in an async function and requires a browser/node version.See https://developer.mozilla.org/zh-CN/docs/Web/JavaScript/Reference/Statements/for-await... .of\nfor await (const eachPageRecords of recordsIter){\n  console.log(eachPageRecords)\n}\n```\nThe query and queryAll methods support passing in multiple parameters to customize the set of records returned.The supported paramete

In [10]:
qa({"question": "我怎么获取表格中的记录？"}, return_only_outputs=True)

{'answer': 'To get records from the table, you can use the `query` or `queryAll` methods with parameters to customize the set of records returned. The supported parameters are consistent with the query parameters in API Reference "Get Records". To update records, you can use the `update` method with the record ID and fields to be updated. You need to download and initialize the Javascript or Python SDK first before executing the commands. \n',
 'sources': 'https://developers.apitable.com/api/get-records, https://developers.apitable.com/api/update-records'}

In [11]:
# 英文提问在英文语料下的表现会更好
query = "How can I get the records from a table?"
similar_docs = docsearch.similarity_search_with_score(query)
similar_docs

[(Document(page_content='title: Get Records\n\nmdx-code-block\nimport Tabs from \'@theme/Tabs\';\nimport TabItem from \'@theme/TabItem\';\n\nThis text provides an example of how to call Get records interface.\n\nExample 1: Get all records in the specified view under the specified APITable\n\nSuppose you have a datasheet, and you want to get all the records in the "Grid View".\n\nYour action steps below:\n\nGet your API Token.(How to get it)\n\nGet the dastasheet ID(How to get it) and \'Grid view\' ID(How to get it).\n\nOpen the terminal on your computer, execute the following code and send the query request to the server (assuming datasheetId is dstWUHwzTH2YQaXEE, viewId is viw4mnkqkaqdh):\n````mdx-code-block\n<Tabs\ngroupId="get records"\ndefaultValue="cURL"\nvalues={\n    [\n    { label: "cURL", value: \'cURL\', },\n    { label: "Javascript SDK", value: \'Javascript SDK\', },\n    { label: "Python SDK", value: \'Python SDK\', },\n    ]\n}', metadata={'source': 'https://developers.api

In [12]:
qa({"question": "How can I get the records from a table?"}, return_only_outputs=True)

{'answer': 'To get the records from a table, you need to use the Get Records interface and call the query or queryAll methods with the appropriate parameters. The specific code to use depends on the SDK you are using (Javascript or Python). You also need to have your API Token and the datasheet ID and view ID for the table you want to retrieve records from. \n',
 'sources': 'https://developers.apitable.com/api/get-records'}

In [13]:
#使用中文回答
from langchain.prompts import PromptTemplate
template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
=========
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in  relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an  injunction or other relief to protect its Intellectual Property Rights.
Source: 28-pl
Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not constitute a waiver of such (or any other)  right or remedy.\n\n11.7 Severability. The invalidity, illegality or unenforceability of any term (or part of a term) of this Agreement shall not affect the continuation  in force of the remainder of the term (if any) and this Agreement.\n\n11.8 No Agency. Except as expressly stated otherwise, nothing in this Agreement shall create an agency, partnership or joint venture of any  kind between the parties.\n\n11.9 No Third-Party Beneficiaries.
Source: 30-pl
Content: (b) if Google believes, in good faith, that the Distributor has violated or caused Google to violate any Anti-Bribery Laws (as  defined in Clause 8.5) or that such a violation is reasonably likely to occur,
Source: 4-pl
=========
FINAL ANSWER: This Agreement is governed by English law.
SOURCES: 28-pl

QUESTION: What did the president say about Michael Jackson?
=========
Content: Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland.
Source: 0-pl
Content: And we won’t stop. \n\nWe have lost so much to COVID-19. Time with one another. And worst of all, so much loss of life. \n\nLet’s use this moment to reset. Let’s stop looking at COVID-19 as a partisan dividing line and see it for what it is: A God-awful disease.  \n\nLet’s stop seeing each other as enemies, and start seeing each other for who we really are: Fellow Americans.  \n\nWe can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n\nI recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n\nThey were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n\nOfficer Mora was 27 years old. \n\nOfficer Rivera was 22. \n\nBoth Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n\nI spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves.
Source: 24-pl
Content: And a proud Ukrainian people, who have known 30 years  of independence, have repeatedly shown that they will not tolerate anyone who tries to take their country backwards.  \n\nTo all Americans, I will be honest with you, as I’ve always promised. A Russian dictator, invading a foreign country, has costs around the world. \n\nAnd I’m taking robust action to make sure the pain of our sanctions  is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers. \n\nTonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world.  \n\nAmerica will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our allies.  \n\nThese steps will help blunt gas prices here at home. And I know the news about what’s happening can seem alarming. \n\nBut I want you to know that we are going to be okay.
Source: 5-pl
Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more.  \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. \n\nA unity agenda for the nation. \n\nWe can do this. \n\nMy fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. \n\nIn this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. \n\nWe have fought for freedom, expanded liberty, defeated totalitarianism and terror. \n\nAnd built the strongest, freest, and most prosperous nation the world has ever known. \n\nNow is the hour. \n\nOur moment of responsibility. \n\nOur test of resolve and conscience, of history itself. \n\nIt is in this moment that our character is formed. Our purpose is found. Our future is forged. \n\nWell I know this nation.
Source: 34-pl
=========
FINAL ANSWER: The president did not mention Michael Jackson.
SOURCES:

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER in Chinese:"""
PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])

chain_type_kwargs = {"prompt": PROMPT}
zh_qa = RetrievalQAWithSourcesChain.from_chain_type(ChatOpenAI(temperature=0), chain_type="stuff", chain_type_kwargs=chain_type_kwargs, retriever=docsearch.as_retriever(search_kwargs={"k": 3}))

In [14]:
zh_qa({"question": "我想要获取筛选后的记录，该怎么办？"}, return_only_outputs=True)

{'answer': '要获取筛选后的记录，需要使用API Handbook "Get Records"中的query方法，并传递filterByFormula和sort参数。其中，filterByFormula参数用于筛选记录，sort参数用于按照指定字段排序。具体操作步骤包括获取API Token和datasheet Id，然后使用Javascript SDK中的query方法发送请求。如果使用cURL发送请求，则需要使用encodeURIComponent()函数对查询参数的值进行转义。更多详细信息请参见https://developers.apitable.com/api/get-records。\n',
 'sources': 'https://developers.apitable.com/api/get-records'}

In [15]:
zh_qa({"question": "How can I get the filtered records?"}, return_only_outputs=True)

{'answer': '要获取筛选后的记录，需要使用API Handbook "Get Records"中的查询参数，包括filterByFormula、maxRecords和sort。具体步骤包括获取API Token和datasheet Id，然后使用终端执行相应的代码发送查询请求。如果使用cURL发送查询请求，则需要使用encodeURIComponent()函数转义查询参数的值。具体细节可以参考开发者文档中的说明。 \n',
 'sources': 'https://developers.apitable.com/api/get-records'}

In [16]:
zh_qa({"question": "报错了，401"}, return_only_outputs=True)

{'answer': '这些内容都没有提到“报错了，401”。\n', 'sources': 'N/A'}

In [17]:
query = "报错了，401"
similar_docs = docsearch.similarity_search_with_score(query)
similar_docs

[(Document(page_content='json\n   {\n    "code": 200,\n    "message": "SUCCESS",\n    "success": true,\n    "data":{\n      "member": \n      {\n        "unitId": "kD8tPcZ3fYxSjV9qWvL5X2TmQbN1nR6",\n        "name": "John",\n        "mobile": {\n            "number": "13000111000",\n            "areaCode": "+86"\n        },\n        "email":"John@apitable.com",\n        "avatar": "https://s1.apitable.com/public/2023/05/16/00a91wbb47fd0594fbc975d2d764a45q",\n        "status": 1,\n        "type": "Member",\n        "teams": [\n          {\n            "unitId": "VS1SejiywaMWbiGMEHAohh62T9EPmmlh",\n            "name": "team A",\n            "sequence": 1,\n            "parentUnitId": "0",\n            "roles": [\n              {\n                  "unitId": "zJ6TuQvH2RtNfSx9eY7XKgD1oWcE5pV",\n                  "name": "role A",\n                  "sequence": 1\n              }\n            ]\n          }\n        ],\n        "roles": [\n          {\n            "unitId": "c9EQqwN1pOEgjXqKJ