# Setup


### Load the data


In [1]:
import glob
from tqdm import tqdm
import ast

import os

file_path = "./arxiv_dataset"

# Each .txt file in the dataset directory holds a Python dict literal (the
# metadata), then the delimiter "\n\n\n\n\n", then the paper text.
# `papers` and `metadata` are parallel lists: entry i of each describes the
# same file. Files that cannot be read or parsed are collected in `failed`.
papers = []
metadata = []
failed = []
for file in tqdm(glob.glob(f"{file_path}/*.txt")):
    try:
        with open(file, "r", encoding="utf-8") as f:
            # NOTE(review): only data[0] and data[1] are used, so any text after
            # a second occurrence of the delimiter is dropped — confirm intended.
            data = f.read().split("\n\n\n\n\n")
        meta = ast.literal_eval(data[0])
        # Take the file name without directory or extension. os.path handles
        # the path separator of the current OS, so no manual '/' vs '\\' switch
        # is needed.
        file_id = os.path.splitext(os.path.basename(file))[0]
        meta["source"] = f"http://arxiv.org/abs/{file_id}"
        content = data[1]
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) still stops this long-running loop.
        failed.append(file)
    else:
        # Append only after everything succeeded so the two lists stay aligned.
        papers.append(content)
        metadata.append(meta)

100%|██████████| 25443/25443 [03:30<00:00, 120.64it/s]


### Split the documents into smaller chunks


#### Character splitter (option 1)


In [None]:
# # recommended one for generic text, split text in this order: ["\n\n", "\n", " ", ""]
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(
#     # Since our LLM has a context size of 8000, we can set the chunk size to a rather big number
#     chunk_size=1500,
#     chunk_overlap=20,
#     length_function=len,
#     is_separator_regex=False,
# )

# texts = text_splitter.create_documents(papers, metadatas=metadata)

#### Token splitter (option 2)


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Token-based splitter: chunk lengths are counted in tiktoken tokens.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    disallowed_special=(),
    chunk_overlap=0,
    chunk_size=400,  # upper bound of 400 tokens for every chunk
)

# Build the chunked documents, passing the per-paper metadata list alongside the papers.
texts = text_splitter.create_documents(texts=papers, metadatas=metadata)

In [3]:
import pickle

# save the split docs
# Serialize the split documents to a pickle file in the working directory.
with open("processed_arxiv_dataset.pkl", mode="wb") as pkl_file:
    pickle.Pickler(pkl_file).dump(texts)

### Create the embeddings model


In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

import torch

# Pick the device automatically: use the GPU when CUDA is available and fall
# back to the CPU otherwise, so the cell runs unmodified on either machine.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}

# Sentence-transformers model used to embed the document chunks.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs=model_kwargs,
)

### Load the documents into the vector database


In [7]:
from langchain.vectorstores import Chroma

# Chroma collection "cs_paper_store", stored on disk under ./chromadb and
# using embeddings_model to embed documents.
vector_store = Chroma(
    collection_name="cs_paper_store",
    embedding_function=embeddings_model,
    persist_directory="./chromadb",
)


def batch_add_documents(doc_list, batch_size):
    """Add documents to the module-level ``vector_store`` in fixed-size batches.

    Args:
        doc_list: Sliceable sequence of documents to insert.
        batch_size: Number of documents passed to each ``add_documents`` call;
            the final batch may be smaller.

    A tqdm progress bar is advanced by the size of each batch once it has
    been added.
    """
    doc_count = len(doc_list)
    with tqdm(total=doc_count, desc="Adding documents", unit="docs") as progress_bar:
        for start in range(0, doc_count, batch_size):
            stop = start + batch_size
            chunk = doc_list[start:stop]
            vector_store.add_documents(chunk)
            progress_bar.update(len(chunk))


# Number of documents handed to the vector store per add_documents call
batch_size = 5300

# Insert every chunk into the vector store, batch by batch, with a progress bar
batch_add_documents(doc_list=texts, batch_size=batch_size)

Adding documents: 100%|██████████| 1414155/1414155 [3:47:30<00:00, 103.60docs/s] 


### Issue with long contexts

No matter the architecture of your model, there is a substantial performance degradation when you include 10+ retrieved documents. In brief: When models must access relevant information in the middle of long contexts, they tend to ignore the provided documents. See: https://arxiv.org/abs/2307.03172

To mitigate this, you can re-order the documents after retrieval so that the most relevant ones do not end up in the middle of the context.

Retrieved from: https://python.langchain.com/docs/modules/data_connection/document_transformers/post_retrieval/long_context_reorder


### Download the LLM


In [None]:
# uncomment the line if needed
# !wget https://gpt4all.io/models/gguf/mistral-7b-openorca.Q4_0.gguf -O mistral-7b-openorca.Q4_0.gguf

# Assemble the RAG System


In [1]:
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# rag_template = """You are a helpful bot who reads texts and answers questions about them.
# Your expertise lies in the field of academic research and your main priority is ensuring that your answers are correct.
# You will be given relevant texts, where each text is enclosed in ``` triple back ticks, and you should derive your answer from the texts as far as possible.
# You must attach a citation for your answers using the texts provided.
# If you encounter a question that you do not know how to answer, just mention that you do not know the answer.

# Relevant Text: {text}

# Question: {question}

# Answer: """

# prompt = PromptTemplate(template=rag_template, input_variables=["text", "question"])

# Callback handlers passed to the LLM and the QA chain below; the single
# handler here is LangChain's streaming-stdout handler.
stdout_handler = StreamingStdOutCallbackHandler()
callbacks = [stdout_handler]

### Load the vector_store


In [2]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

import torch

# Pick the device automatically: use the GPU when CUDA is available and fall
# back to the CPU otherwise, so the cell runs unmodified on either machine.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}

# Must be the same embedding model that was used when the store was populated,
# otherwise query vectors will not match the stored document vectors.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs=model_kwargs,
)

# Chroma collection "cs_paper_store" from the ./chromadb directory,
# using embeddings_model to embed queries.
vector_store = Chroma(
    collection_name="cs_paper_store",
    embedding_function=embeddings_model,
    persist_directory="./chromadb",
)

### Instantiate the LLM


In [3]:
# source code: https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/llms/gpt4all.py

# Path to the local GGUF model file (see the download cell earlier in the notebook).
model_name = "./mistral-7b-openorca.Q4_0.gguf"

llm = GPT4All(
    model=model_name,
    device="gpu",  # use "cpu" instead when no GPU is available
    # NOTE(review): in this wrapper max_tokens is presumably the context window
    # and n_predict the cap on generated tokens — confirm against the linked source.
    max_tokens=8000,
    n_predict=4096,
    echo=True,
    # Output is streamed through the callback handlers; verbose must be set
    # for them to be passed to the callback manager.
    streaming=True,
    callbacks=callbacks,
    verbose=True,
)

### Create the retriever

We will be using the MultiQueryRetriever.


In [9]:
from langchain.retrievers.multi_query import MultiQueryRetriever

# Wrap the plain vector-store retriever; the LLM is used to produce alternative
# phrasings of each question (see the generated variants in the cell output).
multi_query_retriever = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=vector_store.as_retriever(),
)

# Smoke-test the retriever with a sample question
question = "What are the recent advancements in large language models?"
unique_docs = multi_query_retriever.get_relevant_documents(question)



1) Discussing the latest developments in AI and machine learning, particularly focusing on significant progress made in large language models.
2) Exploring the most current breakthroughs within artificial intelligence, specifically regarding major improvements in large-scale language processing systems.
3) Investigating recent advancements in natural language understanding technology, with a focus on cutting-edge innovations in large language model research.

#### Another possible retriever: WebResearchRetriever


In [None]:
# from langchain.utilities import GoogleSearchAPIWrapper
# from langchain.retrievers.web_research import WebResearchRetriever
# import logging
# from dotenv import load_dotenv
# import os

# load_dotenv()
# logging.basicConfig()
# logging.getLogger("langchain.retrievers.web_research").setLevel(logging.INFO)

# search = GoogleSearchAPIWrapper()
# web_research_retriever = WebResearchRetriever.from_llm(
#     vectorstore=vector_store,
#     llm=llm,
#     search=search,
# )
# # Test the retriever
# docs = web_research_retriever.get_relevant_documents(question)

### Ensemble Retriever: combines BM25 and semantic search (requires a large amount of RAM, >32 GB)


In [12]:
# from langchain.retrievers import BM25Retriever, EnsembleRetriever

# bm25_retriever = BM25Retriever.from_documents(texts)
# bm25_retriever.k = 2

# ensemble_retriever = EnsembleRetriever(
#     retrievers=[bm25_retriever, vector_store.as_retriever()],
#     weights=[0.5, 0.5],
# )

# # test the ensemble retriever
# docs = ensemble_retriever.get_relevant_documents(query=question)
# docs

### Use RetrievalQA to generate citations


In [11]:
from langchain.chains import RetrievalQA

# Question-answering chain over the vector store. return_source_documents=True
# makes the retrieved chunks available under result["source_documents"], which
# is what provides the citations.
qa = RetrievalQA.from_chain_type(
    retriever=vector_store.as_retriever(),
    llm=llm,
    chain_type="stuff",
    callbacks=callbacks,
    return_source_documents=True,
)

# Ask a sample question; the answer is also streamed to stdout via the callbacks.
query = "What is a support vector machine?"
result = qa(dict(query=query))

 A support vector machine (SVM) is a machine learning algorithm based on the principle of structural risk minimisation, which is based on statistical learning theory and automatically adjusts the model structure by controlling parameters to achieve empirical and structural risk minimisation. For non-linear problems, such as disk failure prediction problems, SVM uses kernel functions to map the input data into a high-dimensional space to achieve linear separability of the high-dimensional space, thus transforming non-linear problems into linear problems.

In [12]:
# Display the generated answer text returned by the QA chain.
result["result"]

' A support vector machine (SVM) is a machine learning algorithm based on the principle of structural risk minimisation, which is based on statistical learning theory and automatically adjusts the model structure by controlling parameters to achieve empirical and structural risk minimisation. For non-linear problems, such as disk failure prediction problems, SVM uses kernel functions to map the input data into a high-dimensional space to achieve linear separability of the high-dimensional space, thus transforming non-linear problems into linear problems.'

In [13]:
# Display the retrieved Document objects (content + metadata) backing the answer.
result["source_documents"]

[Document(page_content='[17] Chih-Chung Chang and Chih-Jen Lin, LIBSVM: a library for support \nvector machines. ACM Transactions on Intelligent Systems and \nTechnology, \n2:27:1- \n27:27, \n2011. \nSoftware \navailable \nat \nhttp://www.csie.ntu.edu.tw/ cjlin/libsvm.', metadata={'Authors': 'A. Elmaizi, E. Sarhrouni, A. Hammouch, C. Nacir', 'Published': '2022-10-26', 'Summary': 'The high dimensionality of hyperspectral images consisting of several bands\noften imposes a big computational challenge for image processing. Therefore,\nspectral band selection is an essential step for removing the irrelevant, noisy\nand redundant bands. Consequently increasing the classification accuracy.\nHowever, identification of useful bands from hundreds or even thousands of\nrelated bands is a nontrivial task. This paper aims at identifying a small set\nof highly discriminative bands, for improving computational speed and\nprediction accuracy. Hence, we proposed a new strategy based on joint mutual\ni