In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os

In [2]:
from langchain.document_loaders import DirectoryLoader

docs_loader = DirectoryLoader('./apitable_devbook/docs', glob="**/*", use_multithreading=True, show_progress=True)

docs = docs_loader.load()

need_del_list = [ index for index in range(0, len(docs)) if docs[index].page_content == "" ]

for index in need_del_list:
    del docs[index]

for index in range(0, len(docs)):
    # 提取出文件名称，例如从 apitable_devbook/current/list-the-team-members.md 提取出 list-the-team-members
    docs[index].metadata['source_file'] =  docs[index].metadata['source']
    filename = os.path.splitext(os.path.basename(docs[index].metadata['source']))[0]
    docs[index].metadata['source'] = 'https://developers.apitable.com/api/' + filename
    if docs[index].page_content == "":
        del docs[index]

reference_loader = DirectoryLoader('./apitable_devbook/reference', glob="**/*", use_multithreading=True, show_progress=True)
reference = reference_loader.load()
need_del_list = [ index for index in range(0, len(reference)) if reference[index].page_content == "" ]

for index in need_del_list:
    del reference[index]

for index in range(0, len(reference)):
    reference[index].metadata['source_file'] =  reference[index].metadata['source']
    reference[index].metadata['source'] = 'https://developers.apitable.com/api/reference'
documents = docs + reference
len(documents)

100%|██████████| 34/34 [00:03<00:00, 10.59it/s]
 92%|█████████▏| 11/12 [00:00<00:00, 34.62it/s]


45

In [3]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
embed_docs = text_splitter.split_documents(documents)

Created a chunk of size 2226, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000
Created a chunk of size 7906, which is longer than the specified 1000
Created a chunk of size 1302, which is longer than the specified 1000
Created a chunk of size 2437, which is longer than the specified 1000
Created a chunk of size 2149, which is longer than the specified 1000
Created a chunk of size 1514, which is longer than the specified 1000
Created a chunk of size 1619, which is longer than the specified 1000
Created a chunk of size 1831, which is longer than the specified 1000
Created a chunk of size 1426, which is longer than the specified 1000
Created a chunk of size 1102, which is longer than the specified 1000


In [4]:
embeddings = OpenAIEmbeddings()
persist_directory = 'db'
vectordb = Chroma.from_documents(embed_docs, embeddings, persist_directory=persist_directory)
vectordb.persist()
vectordb = None

Using embedded DuckDB with persistence: data will be stored in: db


In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

docsearch = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

qa = RetrievalQAWithSourcesChain.from_chain_type(ChatOpenAI(temperature=0), chain_type="stuff", retriever=docsearch.as_retriever())

Using embedded DuckDB with persistence: data will be stored in: db


In [6]:
query = "What did the president say about Ketanji Brown Jackson"
similar_docs = docsearch.similarity_search_with_score(query)
similar_docs

[(Document(page_content='The data structure of the Member is as follows.', metadata={'source': 'https://developers.apitable.com/api/reference', 'source_file': 'apitable_devbook/reference/api/members.md'}),
  0.5979182124137878),
 (Document(page_content='The data structure of the Team is as follows.', metadata={'source': 'https://developers.apitable.com/api/reference', 'source_file': 'apitable_devbook/reference/api/teams.md'}),
  0.6052681803703308),
 (Document(page_content='json\n   {\n    "code": 200,\n    "message": "SUCCESS",\n    "success": true,\n    "data":{\n      "member": \n      {\n        "unitId": "kD8tPcZ3fYxSjV9qWvL5X2TmQbN1nR6",\n        "name": "John",\n        "mobile": {\n            "number": "13000111000",\n            "areaCode": "+86"\n        },\n        "email":"John@apitable.com",\n        "avatar": "https://s1.apitable.com/public/2023/05/16/00a91wbb47fd0594fbc975d2d764a45q",\n        "status": 1,\n        "type": "Member",\n        "teams": [\n          {\n   

In [8]:
query = "我怎么获取表格中的记录？"
similar_docs = docsearch.similarity_search_with_score(query)
similar_docs

[(Document(page_content='```js\nimport { APITable } from \'apitable\';\nconst apitable = new APITable({\n  token: \'Your API Token\',\n});\nconst datasheet = apitable.datasheet("dstWUHwzTHd2YQaXEE");\n// Get records in pages, return to the first page by default\ndatasheet.records.query({ viewId: "viw4mnkqkaqdh"}).then(response => {\nif (response.success) {\n  console.log(response.data.records);\n} else {\n  console.error(response);\n}\n});\n// Automatically handles paging and iteratively returns all records.\nconst recordsIter = datasheet.records.queryAll({ viewId: "viw4mnkqkaqdh"})\n// for await needs to be run in an async function and requires a browser/node version.See https://developer.mozilla.org/zh-CN/docs/Web/JavaScript/Reference/Statements/for-await... .of\nfor await (const eachPageRecords of recordsIter){\n  console.log(eachPageRecords)\n}\n```\nThe query and queryAll methods support passing in multiple parameters to customize the set of records returned.The supported paramete

In [9]:
query = "How can I get the records from a table?"
similar_docs = docsearch.similarity_search_with_score(query)
similar_docs

[(Document(page_content='title: Get Records\n\nmdx-code-block\nimport Tabs from \'@theme/Tabs\';\nimport TabItem from \'@theme/TabItem\';\n\nThis text provides an example of how to call Get records interface.\n\nExample 1: Get all records in the specified view under the specified APITable\n\nSuppose you have a datasheet, and you want to get all the records in the "Grid View".\n\nYour action steps below:\n\nGet your API Token.(How to get it)\n\nGet the dastasheet ID(How to get it) and \'Grid view\' ID(How to get it).\n\nOpen the terminal on your computer, execute the following code and send the query request to the server (assuming datasheetId is dstWUHwzTH2YQaXEE, viewId is viw4mnkqkaqdh):\n````mdx-code-block\n<Tabs\ngroupId="get records"\ndefaultValue="cURL"\nvalues={\n    [\n    { label: "cURL", value: \'cURL\', },\n    { label: "Javascript SDK", value: \'Javascript SDK\', },\n    { label: "Python SDK", value: \'Python SDK\', },\n    ]\n}', metadata={'source': 'https://developers.api

In [10]:
qa({"question": "我怎么获取表格中的记录？"}, return_only_outputs=True)

{'answer': 'To get records from the table, you can use the `query` or `queryAll` methods with parameters to customize the set of records returned. The supported parameters are consistent with the query parameters in API Reference "Get Records". \n',
 'sources': 'https://developers.apitable.com/api/get-records'}

In [11]:
qa({"question": "How can I get the records from a table?"}, return_only_outputs=True)

{'answer': 'To get the records from a table, you need to use the Get Records interface and call the query or queryAll methods with the appropriate parameters. The specific steps and code depend on the programming language and SDK being used. Sources: https://developers.apitable.com/api/get-records',
 'sources': ''}