In [None]:
#pip install llama-index
#pip install llama-index-readers-file
#pip install llama-index-embeddings-azure-openai
#pip install llama-index-llms-azure-openai

In [1]:
import yaml

In [2]:
def read_config(file_path):
    with open(file_path, 'r') as file:
        try:
            config = yaml.safe_load(file)
            return config
        except yaml.YAMLError as e:
            print(f"Error reading YAML file: {e}")
            return None

config = read_config("../secrets/config.yaml")

In [3]:
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

llm = AzureOpenAI(
    model=config["chat"]["azure_deployment"],
    deployment_name=config["chat"]["azure_deployment"],
    api_key=config["chat"]["azure_api_key"],
    azure_endpoint=config["chat"]["azure_endpoint"],
    api_version=config["chat"]["azure_api_version"],
)

# You need to deploy your own embedding model as well as your own chat completion model
embed_model = AzureOpenAIEmbedding(
    model=config["embedding"]["azure_deployment"],              # for the moment, same as deployment
    deployment_name=config["embedding"]["azure_deployment"],
    api_key=config["embedding"]["azure_api_key"],
    azure_endpoint=config["embedding"]["azure_endpoint"],
    api_version=config["embedding"]["azure_api_version"],
)

In [4]:
from llama_index.readers.file import PyMuPDFReader

In [5]:
loader = PyMuPDFReader()
documents = loader.load(file_path="../samples/Anomaly_Detection_The_Mathematization_of.pdf")

# documents = SimpleDirectoryReader(
#     input_files=["../../data/paul_graham/paul_graham_essay.txt"]
# ).load_data()

In [6]:
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

index = VectorStoreIndex.from_documents(documents)

In [7]:
query = "liste les ouvrages mentionnés dans le document"
query_engine = index.as_query_engine()
answer = query_engine.query(query)

print(answer.get_formatted_sources())
print("query was:", query)
print("answer was:", answer)

> Source (Doc id: 010d5215-a431-4f15-a61f-6f4b9867957b): Anomaly Detection: The Mathematization of 
the Abnormal in the Metadata Society
Matteo Pasquinell...

> Source (Doc id: 67c3a1ce-bfc3-40be-bd22-0c6e3b8efd67): different  future  for  information  surplus  and  its  epistemic  potentiality.  Aside  from  th...
query was: liste les ouvrages mentionnés dans le document
answer was: Le document mentionne l'ouvrage "Before and After: Documenting the Architecture of Disaster" par Eyal et Ines Weizman.


In [8]:
from llama_index.core.node_parser import SentenceSplitter

text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_parser.split_text(doc.text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [9]:
text_chunks

['Anomaly Detection: The Mathematization of\u2028\nthe Abnormal in the Metadata Society\nMatteo Pasquinelli \n‘  \nCanonical example of apophenia: a ‘human face’ recognized on the surface of Mars\u2028\n(photo: NASA, 25 July 25, 1976, Wikipedia Commons)\n!1',
 'Introduction\nIn a book from 1890 the French sociologist and criminologist Gabriel Tarde was already \nrecording the rise of information surplus and envisioning a bright future for the discipline of \nstatistics as the new eye of mass media (that is as a new computational or algorithmic eye, \nwe would say today). In his biomorphic metaphors, he wrote: \nThe public journals will become socially what our sense organs are vitally. Every printing \noffice will become a mere central station for different bureaus of statistics just as the ear-drum \nis a bundle of acoustic nerves, or as the retina is a bundle of special nerves each of which \nregisters its characteristic impression on the brain. At present Statistics is a kind of emb

In [10]:
from llama_index.core.schema import TextNode

nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idxs[idx]]
    node.metadata = src_doc.metadata
    nodes.append(node)

In [11]:
print(text_chunk)

different  future  for  information  surplus  and  its  epistemic  potentiality.  Aside  from  the 
defense of privacy and the regulation of the algorithmic panopticon, other political strategies 
must be explored. We need maybe to invent new institutions to intervene at the same scale 
of computation of governments, to reclaim massive computing power as a basic right of 
‘civil society’ and its autonomy.
I’d like to conclude going back to the issue of enemy recognition and the perspective 
of the world from the eye of the algorithm. In a short chapter titled “Algorithmic Vision”, 
Eyal and Ines Weizman stress that “the technology of surveillance and destruction are the 
same as those used in forensics to monitor these violations”. The practice of the Forensic 
Architecture  project  has  shown  in  different  cases  that  the  same  technologies  that  are 
involved in war crimes as apparatus of vision, control and decision can be reversed into a 
political tool. They continue: 
But e

In [12]:
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

In [15]:
# from llama_index.vector_stores.postgres import PGVectorStore

# vector_store = PGVectorStore.from_params(
#     database=db_name,
#     host=host,
#     password=password,
#     port=port,
#     user=user,
#     table_name="llama2_paper",
#     embed_dim=384,  # openai embedding dimension
# )

from llama_index.core.vector_stores import SimpleVectorStore

vector_store = SimpleVectorStore()
vector_store.add(nodes)

['f3896a0c-6dc1-4fde-943c-a9772a64cc0f',
 '0a769d33-24e4-440a-8ce0-872199bd0385',
 'a0bcb101-a1f8-4cdc-b3e9-a75127d71925',
 'e728acbb-9064-461c-a421-273ae0d00441',
 '24e8293b-943a-40de-aaa4-618f6ee5d69d',
 'a49141bb-5709-49b8-b640-2ccd57a47316',
 'd6c62edd-63fb-40ee-ae1f-5e34771a207d',
 '28b0a1e7-8f93-464a-ab49-4ddf055a522d',
 'cff59ce4-c387-45cb-a27a-18e935e8067c',
 'bd34ab7e-2050-4422-98cb-30869891916b']

In [16]:
len(nodes)

10

In [20]:
query_str = "list all books mentioned in the document"
query_embedding = embed_model.get_query_embedding(query_str)
from llama_index.core.vector_stores import VectorStoreQuery

query_mode = "default"
# query_mode = "sparse"
# query_mode = "hybrid"

vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding, similarity_top_k=5, mode=query_mode
)

# returns a VectorStoreQueryResult
query_result = vector_store.query(vector_store_query)
if query_result.nodes:
    print(query_result.nodes[0].get_content())
else:
    print('No results')

No results


In [19]:
query_result

VectorStoreQueryResult(nodes=None, similarities=[0.27762520881909986, 0.2719230929370131, 0.2678803739071994, 0.2617933434172526, 0.26074597369817437], ids=['f3896a0c-6dc1-4fde-943c-a9772a64cc0f', 'cff59ce4-c387-45cb-a27a-18e935e8067c', '0a769d33-24e4-440a-8ce0-872199bd0385', '24e8293b-943a-40de-aaa4-618f6ee5d69d', 'bd34ab7e-2050-4422-98cb-30869891916b'])

In [21]:
vector_store.to_dict()

{'embedding_dict': {'f3896a0c-6dc1-4fde-943c-a9772a64cc0f': [-0.03324735537171364,
   -0.021838298067450523,
   -0.014390026219189167,
   -0.0164243932813406,
   -0.021140800788998604,
   0.006555641070008278,
   -0.0020779597107321024,
   0.03680126741528511,
   0.0006108288071118295,
   0.05666332319378853,
   -0.01316110324114561,
   0.003991924691945314,
   -0.0016347585478797555,
   -0.035871271044015884,
   -0.0013181861722841859,
   -0.03686769679188728,
   -0.01732117496430874,
   -0.018799204379320145,
   0.02652149088680744,
   -0.018151527270674706,
   -0.016515731811523438,
   -0.041019465774297714,
   -0.050850849598646164,
   0.025342389941215515,
   0.01458100788295269,
   0.03085593692958355,
   0.011940483935177326,
   -0.00021044272580184042,
   -0.0495222844183445,
   0.013617797754704952,
   0.019563129171729088,
   0.009341477416455746,
   0.016100555658340454,
   -0.03369574621319771,
   -0.0016534414608031511,
   -0.007718136068433523,
   0.022901149466633797,
  

In [None]:
CHUNK_SIZE = 1_000
CHUNK_OVERLAP = 200

embedder = AzureOpenAIEmbedding(
    model=config["embedding"]["azure_deployment"],              # for the moment, same as deployment
    deployment_name=config["embedding"]["azure_deployment"],
    api_key=config["embedding"]["azure_api_key"],
    azure_endpoint=config["embedding"]["azure_endpoint"],
    api_version=config["embedding"]["azure_api_version"],
)

vector_store = SimpleVectorStore()


def store_pdf_file(file_path: str, doc_name: str):
    """Store a pdf file in the vector store.

    Args:
        file_path (str): file path to the PDF file
    """
    loader = PyMuPDFReader()
    documents = loader.load(file_path)

    text_parser = SentenceSplitter(chunk_size=CHUNK_SIZE)
    text_chunks = []
    # maintain relationship with source doc index, to help inject doc metadata in (3)
    doc_idxs = []
    for doc_idx, doc in enumerate(documents):
        cur_text_chunks = text_parser.split_text(doc.text)
        text_chunks.extend(cur_text_chunks)
        doc_idxs.extend([doc_idx] * len(cur_text_chunks))

    nodes = []
    for idx, text_chunk in enumerate(text_chunks):
        node = TextNode(
            text=text_chunk,
        )
        print(node.id_)
        src_doc = documents[doc_idxs[idx]]
        node.metadata = src_doc.metadata
        nodes.append(node)

    for node in nodes:
        node_embedding = embedder.get_text_embedding(
            node.get_content(metadata_mode="all")
        )
        node.embedding = node_embedding

    vector_store.add(nodes)
    return


In [55]:
store_pdf_file('../samples/B4LFlaparureguydemaupassant.pdf', 'B4LFlaparureguydemaupassant.pdf')

2c4bf19b-9f06-4f42-9f71-df06268aca87
1e171907-403d-4d83-9b9b-ee839e69c22c
95a9c9bf-7e3d-463b-bd1c-3c62c5029897
b8550ecb-f8b7-4b19-9e15-575172812b12
b2134439-1a5c-4f63-9347-4b20f85ff26c
1f244413-49fb-481e-a94d-2eb8f7905320
8473aaf3-b7fe-46d0-b1fe-f80847371cd8
8c442cab-5e2a-4de5-9f3b-ac485572600e
c93868cc-6582-42c1-b02d-42858ddedc3c
a7cc7e45-fde4-4060-921d-6625e21ceb63


In [35]:
def retrieve(question: str):
    """Retrieve documents similar to a question.

    Args:
        question (str): text of the question

    Returns:
        list[TODO]: list of similar documents retrieved from the vector store
    """
    query_embedding = embedder.get_query_embedding(question)

    query_mode = "default"
    # query_mode = "sparse"
    # query_mode = "hybrid"

    vector_store_query = VectorStoreQuery(
        query_embedding=query_embedding, similarity_top_k=5, mode=query_mode
    )

    # returns a VectorStoreQueryResult
    query_result = vector_store.query(vector_store_query)
    return query_result.nodes

    # if query_result.nodes:
    #     print(query_result.nodes[0].get_content())
    # else:
    #     print('No results')


def build_qa_messages(question: str, context: str) -> list[str]:
    messages = [
    (
        "system",
        "You are an assistant for question-answering tasks.",
    ),
    (
        "system",
        """Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise.
        {}""".format(context),
    ),
    (  
        "user",
        question
    ),]
    return messages


def answer_question(question: str) -> str:
    """Answer a question by retrieving similar documents in the store.

    Args:
        question (str): text of the question

    Returns:
        str: text of the answer
    """
    docs = retrieve(question)
    docs_content = "\n\n".join(doc.get_content() for doc in docs)
    print("Question:", question)
    print("------")
    for doc in docs:
        print("Chunk:", doc.id)
        print(doc.page_content)
        print("------")
    messages = build_qa_messages(question, docs_content)
    response = llm.invoke(messages)
    return response.content


In [37]:
retrieve("comment s'appelle l'amie de madame Loisel") is None

True

In [46]:
vector_store.query(VectorStoreQuery(
        query_embedding=embedder.get_query_embedding("comment s'appelle l'amie de madame Loisel"),
        similarity_top_k=5,
        mode=query_mode
    ))

VectorStoreQueryResult(nodes=None, similarities=[0.6250752242488372, 0.6114351711499516, 0.610400154295405, 0.5965252226761711, 0.5913027844093046], ids=['fb879539-2061-4120-81e7-73fd2ed3f69f', '4aea7873-c42a-4262-ad43-f514d1b5b7c1', '47d1a460-c2c1-4250-8201-b2aa78ff82b1', '7dcbe8a4-90bc-47e4-b476-aa876aa5033f', 'bf6a77ff-2461-446d-92ec-91d2fbcc8f55'])

In [47]:
vector_store.get('fb879539-2061-4120-81e7-73fd2ed3f69f')

[-0.049263447523117065,
 0.008291070349514484,
 0.008986261673271656,
 0.027119776234030724,
 0.0008506945450790226,
 -0.006512844003736973,
 0.0026435561012476683,
 -0.01972879469394684,
 -0.01506003737449646,
 0.02296326495707035,
 -0.018616490066051483,
 -0.005612754262983799,
 0.010479093529284,
 -0.040716253221035004,
 0.042384710162878036,
 -0.027924733236432076,
 -0.008064217865467072,
 -0.01310618408024311,
 -0.009952211752533913,
 3.398776243557222e-05,
 -0.028422344475984573,
 -0.018353048712015152,
 0.00821057427674532,
 -0.025056155398488045,
 0.004705347120761871,
 -0.009037486277520657,
 -0.018572581931948662,
 0.00753001868724823,
 0.010984021238982677,
 -0.00682385079562664,
 -0.005228569731116295,
 0.006981183309108019,
 0.002444145968183875,
 -0.05909857153892517,
 -0.020109321922063828,
 -0.017079750075936317,
 0.026095284149050713,
 0.008430108428001404,
 0.039633218199014664,
 -0.006454301532357931,
 -0.03351553529500961,
 -0.005634707864373922,
 0.0013208633754402