# 使用 Airbyte 从 GitHub 加载数据

本笔记本演示如何使用 Airbyte 从 GitHub 加载拉取请求（pull request）数据，将其嵌入并存入向量存储，然后进行语义搜索。运行前需要准备 GitHub 个人访问令牌和 OpenAI API 密钥（`OPENAI_API_KEY`）。

In [None]:
# 安装所需的包
%pip install -qU langchain-airbyte langchain_chroma

Note: you may need to restart the kernel to use updated packages.


In [None]:
import getpass

# Ask for the GitHub personal access token without echoing it to the screen.
# An explicit prompt is passed because the default "Password: " prompt does
# not tell the reader which secret is expected.
GITHUB_TOKEN = getpass.getpass("GitHub personal access token: ")

In [None]:
from langchain_airbyte import AirbyteLoader
from langchain_core.prompts import PromptTemplate

# Render each pull request as a small markdown document: a title heading, an
# author byline, then the PR body. The byline label is "by" so that the
# rendered text matches the outputs recorded in this notebook
# ("by <login>").
pr_template = PromptTemplate.from_template(
    """# {title}
by {user[login]}

{body}"""
)

# Configure the Airbyte GitHub source to read the `pull_requests` stream of
# the langchain-ai/langchain repository, authenticating with GITHUB_TOKEN.
loader = AirbyteLoader(
    source="source-github",
    stream="pull_requests",
    config={
        "credentials": {"personal_access_token": GITHUB_TOKEN},
        "repositories": ["langchain-ai/langchain"],
    },
    # Controls how each record is turned into the document text.
    template=pr_template,
    # The recorded search output shows documents carrying only page_content.
    include_metadata=False,
)

# Run the sync and collect all records as documents.
docs = loader.load()

In [None]:
# Preview one loaded document: pick the second-to-last entry and print its
# text so that newlines render as line breaks.
preview_doc = docs[-2]
print(preview_doc.page_content)

# Updated partners/ibm README
by williamdevena

## PR title
partners: changed the README file for the IBM Watson AI integration in the libs/partners/ibm folder.

## PR message
Description: Changed the README file of partners/ibm following the docs on https://python.langchain.com/docs/integrations/llms/ibm_watsonx

The README includes:

- Brief description
- Installation
- Setting-up instructions (API key, project id, ...)
- Basic usage:
  - Loading the model
  - Direct inference
  - Chain invoking
  - Streaming the model output
  
Issue: https://github.com/langchain-ai/langchain/issues/17545

Dependencies: None

Twitter handle: None


In [None]:
# Show the total number of documents that were loaded.
len(docs)

10283

In [None]:
import getpass
import os

import tiktoken
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# OpenAIEmbeddings authenticates with the OPENAI_API_KEY environment
# variable. Prompt for it only when it is not already set, so the notebook
# also runs on a fresh kernel without pre-configured credentials.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Load the cl100k_base encoding to get the set of its special tokens.
enc = tiktoken.get_encoding("cl100k_base")

# Embed every loaded document with OpenAI embeddings and index the result in
# a Chroma vector store. All special tokens remain disallowed except
# "<|endofprompt|>", which is removed from the disallowed set.
vectorstore = Chroma.from_documents(
    docs,
    embedding=OpenAIEmbeddings(
        disallowed_special=(enc.special_tokens_set - {"<|endofprompt|>"})
    ),
)

In [None]:
# Wrap the vector store in a retriever, using its default settings.
retriever = vectorstore.as_retriever()

In [None]:
# Search for pull requests related to IBM (the query text is written in
# Chinese and means "pull requests related to IBM").
retriever.invoke("与 IBM 相关的拉取请求")

[Document(page_content='# Updated partners/ibm README\nby williamdevena\n\n## PR title\r\npartners: changed the README file for the IBM Watson AI integration in the libs/partners/ibm folder.\r\n\r\n## PR message\r\nDescription: Changed the README file of partners/ibm following the docs on https://python.langchain.com/docs/integrations/llms/ibm_watsonx\r\n\r\nThe README includes:\r\n\r\n- Brief description\r\n- Installation\r\n- Setting-up instructions (API key, project id, ...)\r\n- Basic usage:\r\n  - Loading the model\r\n  - Direct inference\r\n  - Chain invoking\r\n  - Streaming the model output\r\n  \r\nIssue: https://github.com/langchain-ai/langchain/issues/17545\r\n\r\nDependencies: None\r\n\r\nTwitter handle: None'),
 Document(page_content='# Updated partners/ibm README\nby williamdevena\n\n## PR title\r\npartners: changed the README file for the IBM Watson AI integration in the `libs/partners/ibm` folder. \r\n\r\n\r\n\r\n## PR message\r\n- **Description:** Changed the README fi