## Mise au point de processus de RAG

### Installation

In [None]:
#pip install langchain langchain-openai langchain_community pymupdf pyyaml

### Imports

In [13]:
import os
import yaml

from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI

#from langchain.chains import RetrievalQA
#from langchain.llms import OpenAI

### Configuration

In [14]:
def read_config(file_path):
    """Load a YAML configuration file and return its parsed content.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load``, or ``None`` when the
        parser raises ``yaml.YAMLError``. In that case the error is printed
        rather than raised, so callers must check for ``None`` before
        indexing the result.

    Raises:
        OSError: If the file cannot be opened (not handled here).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which differs between machines.
    with open(file_path, "r", encoding="utf-8") as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(f"Error reading YAML file: {exc}")
            return None

CONFIG_PATH = "../secrets/config.yaml"

config = read_config(CONFIG_PATH)
# read_config returns None when the YAML cannot be parsed. Stop here with an
# explicit message instead of failing later on config["embedding"] with an
# unrelated-looking TypeError.
if config is None:
    raise ValueError(f"Could not load configuration from {CONFIG_PATH}")


### Initialisation

In [3]:
from langchain_openai import AzureOpenAIEmbeddings

# Embedding client passed to the vector stores created in this notebook.
# Endpoint, deployment, API version and key all come from the "embedding"
# section of the loaded configuration.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=config["embedding"]["azure_endpoint"],
    azure_deployment=config["embedding"]["azure_deployment"],
    openai_api_version=config["embedding"]["azure_api_version"],
    api_key=config["embedding"]["azure_api_key"]
)

In [4]:
# NOTE(review): InMemoryVectorStore is already imported in the imports cell,
# so this re-import is redundant.
from langchain_core.vectorstores import InMemoryVectorStore

# Vector store for the first sample document, built on the embedding client.
vector_store = InMemoryVectorStore(embeddings)

In [15]:
# NOTE(review): this rebinds AzureChatOpenAI, which the imports cell also
# imports from langchain.chat_models; the langchain_openai class is the one
# actually used below.
from langchain_openai import AzureChatOpenAI

# Chat model used for every question in this notebook, configured from the
# "chat" section of the loaded configuration.
llm = AzureChatOpenAI(
    azure_endpoint=config["chat"]["azure_endpoint"],
    azure_deployment=config["chat"]["azure_deployment"],
    openai_api_version=config["chat"]["azure_api_version"],
    api_key=config["chat"]["azure_api_key"],
)


### Extraction

In [None]:
# First sample document: a PDF read through PyMuPDFLoader.
file_path = "../samples/B4LFlaparureguydemaupassant.pdf"
loader = PyMuPDFLoader(file_path)


In [7]:
# Load the PDF and display the first resulting Document (page 0 in the
# output below) to check the extracted text and metadata.
docs = loader.load()
docs[0]

Document(metadata={'producer': 'PDFlib 5.0.3 (C++/Win32)', 'creator': 'Macromedia FlashPaper 2.01.2283.0', 'creationdate': '2006-01-05T17:19:06+01:00', 'source': '../samples/B4LFlaparureguydemaupassant.pdf', 'file_path': '../samples/B4LFlaparureguydemaupassant.pdf', 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2009-11-21T11:58:45+08:00', 'trapped': '', 'modDate': "D:20091121115845+08'00'", 'creationDate': "D:20060105171906+01'00'", 'page': 0}, page_content='LA\xa0PARURE\xa0de\xa0Guy\xa0de\xa0Maupassant\xa0(nouvelle\xa0parue\xa0dans\xa0le\xa0Gaulois\xa0le\xa017\xa0février\xa01884)\xa0\nC\'était\xa0une\xa0de\xa0ces\xa0jolies\xa0et\xa0charmantes\xa0filles,\xa0nées,\xa0comme\xa0par\xa0une\xa0erreur\xa0du\xa0destin,\xa0dans\xa0une\xa0famille\xa0\nd\'employés.\xa0Elle\xa0n\'avait\xa0pas\xa0de\xa0dot,\xa0pas\xa0d\'espérance,\xa0aucun\xa0moyen\xa0d\'être\xa0connue,\xa0comprise,\xa0aimée,\xa0\népousée\xa0par\xa0un\xa0homme\xa0riche

In [8]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) before indexing them.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

In [9]:
# Number of chunks produced by the splitter.
len(all_splits)

22

### Indexation

In [10]:
# Index the chunks in the vector store. The return value is bound to `_`
# so the cell does not display it.
_ = vector_store.add_documents(documents=all_splits)

### Interrogation

In [11]:
def retrieve(store, question: str, k: "int | None" = None):
    """Return the entries of ``store`` that are most similar to ``question``.

    Args:
        store: Object exposing ``similarity_search(query, ...)``, such as the
            vector stores built in this notebook.
        question: Query text to search for.
        k: Optional number of results to request. When omitted, no ``k`` is
            forwarded and the store's own default applies, exactly as in a
            plain ``store.similarity_search(question)`` call.

    Returns:
        Whatever ``store.similarity_search`` returns; this notebook iterates
        over it and reads ``page_content`` on each item.
    """
    if k is None:
        return store.similarity_search(question)
    return store.similarity_search(question, k=k)

In [42]:
# from langchain import hub

# print(hub.pull("rlm/rag-prompt").messages[0].prompt.template)

In [12]:
def build_qa_messages(question: str, context: str) -> list[tuple[str, str]]:
    """Build the chat messages for a retrieval-augmented question.

    Args:
        question: The user's question, sent as the final "user" message.
        context: Retrieved text, appended to the second "system" message.

    Returns:
        Three ``(role, content)`` tuples in order: a "system" message that
        sets the assistant role, a "system" message carrying the answering
        instructions followed by ``context``, and the "user" question.
    """
    # Continuation lines deliberately start with eight spaces: the prompt was
    # written as an indented triple-quoted literal, and that whitespace is
    # kept so the text sent to the model stays exactly the same.
    instructions = (
        "Use the following pieces of retrieved context to answer the question.\n"
        "        If you don't know the answer, just say that you don't know.\n"
        "        Use three sentences maximum and keep the answer concise.\n"
        "        {}"
    )
    return [
        ("system", "You are an assistant for question-answering tasks."),
        ("system", instructions.format(context)),
        ("user", question),
    ]


In [16]:
# Ask a question about the first document: retrieve the most similar chunks,
# join their text with blank lines, and build the chat messages from it.
question = "comment s'appelle l'amie de madame Loisel"
docs_content = "\n\n".join(doc.page_content for doc in retrieve(vector_store, question))
messages = build_qa_messages(question, docs_content)

In [14]:
# Send the prepared messages to the chat model.
response = llm.invoke(messages)

In [15]:
# Show only the text of the model's answer.
print(response.content)

L'amie de Madame Loisel s'appelle Jeanne Forestier.


### Application à un autre fichier

In [68]:
# Run the whole pipeline (load, split, index, retrieve, ask) on a second PDF,
# using a separate vector store so the first document's index is untouched.
# NOTE(review): this cell rebinds file_path, loader, docs, text_splitter,
# all_splits, question, docs_content, messages and response. Re-running an
# earlier cell afterwards (e.g. the indexation cell, which reads all_splits)
# would operate on this file's data rather than the first document's.
file_path = "../samples/Anomaly_Detection_The_Mathematization_of.pdf"
loader = PyMuPDFLoader(file_path)
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)
new_vector_store = InMemoryVectorStore(embeddings)
_ = new_vector_store.add_documents(documents=all_splits)
question = "liste les ouvrages mentionnés dans le document"
docs_content = "\n\n".join(doc.page_content for doc in retrieve(new_vector_store, question))
messages = build_qa_messages(question, docs_content)
response = llm.invoke(messages)
print(response.content)

Voici la liste des ouvrages mentionnés dans le document :

1. Manuel Delanda, "Philosophy and Simulation: The Emergence of Synthetic Reason," London: Continuum, 2011.
2. Matteo Pasquinelli, “Italian Operaismo and the Information Machine,” Theory, Culture & Society, first published on February 2, 2014.
3. Hans Belting, "Florenz und Bagdad: Eine westöstliche Geschichte des Blicks," Munich: Beck Verlag, 2008.
4. Gabriel Tarde, "The Laws of Imitation," New York: Holt, 1903 [first published in French in 1890].
5. Forensic Architecture (ed.), "Forensis: The Architecture of Public Truth," Berlin: Sternberg Press, 2014.
6. Eyal and Ines Weizman, “Before and After: Documenting the Architecture of Disaster,” Moscow and London: Strelka Press, 2013.


In [69]:
# Same retrieve -> build messages -> invoke sequence against the second
# document's store, this time asking for the authors it mentions.
question = "liste les auteurs mentionnés dans le document"
docs_content = "\n\n".join(doc.page_content for doc in retrieve(new_vector_store, question))
messages = build_qa_messages(question, docs_content)
response = llm.invoke(messages)
print(response.content)

Les auteurs mentionnés dans le document sont Manuel Delanda, Matteo Pasquinelli, Hans Belting, Clemens von Wedemeyer, Gabriel Tarde, Forensic Architecture (éditeur), et Georges Canguilhem.


In [71]:
# Same retrieve -> build messages -> invoke sequence against the second
# document's store, this time asking for the organisations it mentions.
question = "liste les organisations mentionnées dans le document"
docs_content = "\n\n".join(doc.page_content for doc in retrieve(new_vector_store, question))
messages = build_qa_messages(question, docs_content)
response = llm.invoke(messages)
print(response.content)

Les organisations mentionnées dans le document sont : 

1. Une organisation militaire (en lien avec l'utilisation du programme de détection de menaces).
2. DARPA (en lien avec le programme ADAMS: Anomaly Detection at Multiple Scale).
3. Forensic Architecture.


In [6]:
def get_meta_doc(extract: str) -> str:
    """Ask the chat model to extract bibliographic metadata from a text.

    The module-level ``llm`` is invoked with a "system" message that sets a
    librarian role and a "user" message that lists the requested fields and
    embeds ``extract`` between ``<content>`` tags.

    Args:
        extract: Text excerpt of the document to describe.

    Returns:
        The ``content`` attribute of the model's response.
    """
    system_turn = ("system", "You are a librarian extracting metadata from documents.")
    # The literal is kept exactly as written, indentation included, so the
    # prompt sent to the model is unchanged.
    instructions = """Extract from the content the following metadata.
        Answer 'unknown' if you cannot find or generate the information.
        Metadata list:
        - title
        - author
        - source
        - type of content (e.g. scientific paper, litterature, news, etc.)
        - language
        - themes as a list of keywords

        <content>
        {}
        </content>
        """
    user_turn = ("user", instructions.format(extract))
    reply = llm.invoke([system_turn, user_turn])
    return reply.content


In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader

# Third sample document: load it and split it with the same settings as the
# other documents.
file_path = "../samples/article_nature.pdf"
loader = PyMuPDFLoader(file_path)
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Only the first chunks (at most 10) are used as the excerpt for metadata
# extraction; slicing already stops at the end of a shorter list.
extract = "\n\n".join(chunk.page_content for chunk in all_splits[:10])

In [19]:
# Print the metadata the model extracted from the excerpt.
print(get_meta_doc(extract))

- title: Effective lung nodule detection using deep CNN with dual attention mechanisms
- author: Zia Ur Rehman, Yan Qiang, Long Wang, Yiwei Shi, Qianqian Yang, Saeed Ullah Khattak, Rukhma Aftab & Juanjuan Zhao
- source: Scientific Reports
- type of content: scientific paper
- language: English
- themes: ['lung cancer', 'deep learning', 'convolutional neural networks', 'dual attention mechanisms', 'medical imaging', 'cancer detection', 'computer-aided diagnosis']
