In [9]:
import os, glob

from langchain_community.document_loaders import PyPDFLoader

file_path = "/mnt/d/LLM-Project/FinalRAG-Retrieval/ARXIV_SOURCES"
files = glob.glob(file_path + "/*")
idx = 10 
pdf = os.path.join(files[idx], files[idx].split("/")[-1] + ".pdf")

loader = PyPDFLoader(pdf)

data = loader.load_and_split()

In [16]:
import arxiv
client = arxiv.Client()
search = arxiv.Search(id_list=[files[idx].split("/")[-1]])
paper = next(arxiv.Client().results(search))
meta_data = {"arxiv_id": paper.entry_id, 
            "title": paper.title, 
            "categories" : '\n'.join([f'{i+1}. {cat}' for i, cat in enumerate(paper.categories)]),
            "primary_category": paper.primary_category,
            "published": str(paper.published),
            "authors": '\n'.join([f'{i+1}. {auth.name}' for i, auth in enumerate(paper.authors)])
            }

In [17]:
print(f"""{data[0].page_content}""")

Deep-inelastic electron-deuteron scattering with spectator nucleon tagging
at the electron-ion collider. Extracting free nucleon structure
Alexander Jentsch,1Zhoudunming Tu,1, 2and Christian Weiss3
1Department of Physics, Brookhaven National Laboratory, Upton, New York 11973, USA∗
2Center for Frontiers in Nuclear Science, Stony Brook, New York 11794, USA†
3Theory Center, Jeﬀerson Lab, Newport News, Virginia 23606, USA‡
(Dated: August 20, 2021)
Background: Deep-inelastic scattering (DIS) on the deuteron with spectator nucleon tagging represents a unique
method for extracting the free neutron structure functions and exploring the nuclear modiﬁcations of bound
protons and neutrons. The detection of the spectator (with typical momenta ≲100 MeV/cin the deuteron
rest frame) controls the nuclear conﬁguration during the DIS process and enables a diﬀerential analysis of nuclear
eﬀects. At the future electron-ion collider (EIC) such measurements will be performed using far-forward detectors.
Pur

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
texts = text_splitter.create_documents([data[0].page_content], metadatas = [meta_data]+ [data[0].metadata])

In [20]:
data[0].metadata

{'source': '/mnt/d/LLM-Project/FinalRAG-Retrieval/ARXIV_SOURCES/2108.08314v1/2108.08314v1.pdf',
 'page': 0}

In [19]:
texts

[Document(page_content='Deep-inelastic electron-deuteron scattering with spectator nucleon tagging\nat the electron-ion collider. Extracting free nucleon structure\nAlexander Jentsch,1Zhoudunming Tu,1, 2and Christian Weiss3', metadata={'arxiv_id': 'http://arxiv.org/abs/2108.08314v1', 'title': 'Deep-inelastic electron-deuteron scattering with spectator nucleon tagging at the electron-ion collider. Extracting free nucleon structure', 'categories': '1. hep-ph\n2. hep-ex\n3. nucl-ex', 'primary_category': 'hep-ph', 'published': '2021-08-18 18:00:04+00:00', 'authors': '1. Alexander Jentsch\n2. Zhoudunming Tu\n3. Christian Weiss'}),
 Document(page_content='1Department of Physics, Brookhaven National Laboratory, Upton, New York 11973, USA∗\n2Center for Frontiers in Nuclear Science, Stony Brook, New York 11794, USA†', metadata={'arxiv_id': 'http://arxiv.org/abs/2108.08314v1', 'title': 'Deep-inelastic electron-deuteron scattering with spectator nucleon tagging at the electron-ion collider. Extra

In [19]:

from langchain_openai.embeddings import OpenAIEmbeddings
os.environ["OPENAI_API_KEY"] = "api_key"
embeddings = OpenAIEmbeddings()
from langchain_community.vectorstores import Chroma

# save to disk
db = Chroma.from_documents(texts, embeddings, persist_directory="./chroma_db")

In [22]:
help(db.similarity_search)

Help on method similarity_search in module langchain_community.vectorstores.chroma:

similarity_search(query: 'str', k: 'int' = 4, filter: 'Optional[Dict[str, str]]' = None, **kwargs: 'Any') -> 'List[Document]' method of langchain_community.vectorstores.chroma.Chroma instance
    Run similarity search with Chroma.
    
    Args:
        query (str): Query text to search for.
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
    
    Returns:
        List[Document]: List of documents most similar to the query text.



In [21]:

query = "What is Sartre generator?"
db.similarity_search(query)

[Document(page_content='1 Introduction\nThe Monte Carlo event generator Sar tre[1–3]was the ﬁrst model based on event-by-event ﬂuctu-', metadata={'arxiv_id': 'http://arxiv.org/abs/2108.01944v1', 'authors': '1. Tobias Toll', 'category': 'hep-ph', 'published': '2021-08-04 10:17:28+00:00', 'summary': "Sartre has been extensively used for describing photon-nuclei processes at\nthe electron-ion collider (EIC) as well as ultra-peripheral collisions (UPC) at\nLHC and RHIC. Sartre is an event generator which implements the dipole model\nfor DIS, and models the transverse geometry of the target nucleus or proton in\ncoordinate space. It uses the Good-Walker mechanism for simulating fluctuations\nwhich contribute to the incoherent cross section for which the target breaks up\nafter the interaction. With improved precision of UPC measurements in the last\nyears, a detailed test of the dipole model has become possible, and Sartre's\nmodel was found lacking. In these proceedings we add subnucleon f

In [None]:
vec = embeddings.embed_query(texts[0].page_content)

In [None]:
len(vec)

In [None]:
import chromadb, os

os.environ["ALLOW_RESET"] = "TRUE"

chroma_client = chromadb.PersistentClient(path = "test.db")

chroma_client.heartbeat()
chroma_client.reset()


In [None]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.api.types import Images
import numpy as np
from typing import cast, TypeVar, Union
from typing_extensions import Protocol

Embeddable = Union[Documents, Images]
D = TypeVar("D", bound=Embeddable, contravariant=True)
class MyEmbeddingFunction(EmbeddingFunction):
    def __call__(self, input: D) -> Embeddings:
        # embed the documents somehow
        return  [np.random.uniform(low = 0, high = 4, size = 3).tolist() for _ in range(len(input))]

collection = chroma_client.get_or_create_collection(name="new_collection", embedding_function=MyEmbeddingFunction())

In [None]:

collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"]
)

In [None]:
results = collection.query(
    query_texts=["This is a query document"],
    n_results=2
)

In [None]:
results