In [6]:
! pip install langchain openai chromadb tiktoken rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os
from getpass import getpass

# Reuse a key that is already exported in the environment so the notebook can
# be re-run unattended; only fall back to an interactive, non-echoing prompt
# when no key is available.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
if not OPENAI_API_KEY:
    # Fail here, with a clear message, instead of exporting an empty key.
    raise ValueError("An OpenAI API key is required (set OPENAI_API_KEY or enter it at the prompt).")

# Export the key through the OPENAI_API_KEY environment variable before the
# embeddings object below is constructed without an explicit key.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

embedding = OpenAIEmbeddings()


# Load pdf
PDF_URL = "https://arxiv.org/pdf/2309.10305v2.pdf"
loader = PyPDFLoader(PDF_URL)
data = loader.load()

# Split
# Only the first six loaded documents are indexed (`data[:6]`); chunks are
# capped at chunk_size=500 with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data[:6])


 ························································


In [7]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [8]:
# Sparse (keyword-based) retriever built over the same chunks as the vector store.
bm25_retriever = BM25Retriever.from_documents(splits)
# Number of documents this retriever returns per query.
bm25_retriever.k = 4

In [9]:
# Dense retriever: embed every chunk into a Chroma vector store.
# NOTE(review): no collection name or persist directory is given; re-running this
# cell in the same kernel may insert the same chunks again — confirm.
vectordb = Chroma.from_documents(splits, embedding=embedding)

# Expose the store as a retriever that returns the top 4 matches per query.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 4},
)

In [10]:
# Hybrid retriever: combine the BM25 and vector-store retrievers,
# giving each an equal weight of 0.5.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever],
    weights=[0.5, 0.5],
)

In [12]:
# Query the hybrid retriever; the bare `docs` on the last line renders the hits.
docs = ensemble_retriever.invoke(
    "What is baichuan2 ？"
)
docs

[Document(page_content='users and tasks, the specific behavior of each\ntask is unpredictable, often leading to idle GPU\nnodes within the cluster. Considering that a single\nmachine equipped with eight A800 GPUs could\nadequately meet the memory requirements for our\nBaichuan 2-7B and Baichuan 2-13B models, the\n4https://scipy.org/primary design criterion for our training framework\nis the machine-level elasticity, which supports that\nresources for tasks can be dynamically modified', metadata={'source': 'https://arxiv.org/pdf/2309.10305v2.pdf', 'page': 5}),
 Document(page_content='In this technical report, we introduce Baichuan\n2, a series of large-scale multilingual language\nmodels. Baichuan 2 has two separate models,\nBaichuan 2-7B with 7 billion parameters and\nBaichuan 2-13B with 13 billion parameters. Both\nmodels were trained on 2.6 trillion tokens, which\nto our knowledge is the largest to date, more than\ndouble that of Baichuan 1 (Baichuan, 2023b,a).\nWith such a massive a