In [1]:
!pip install -U langchain-text-splitters
!pip install -U langchain-community langchain-text-splitters langchain
!pip install pypdf
!pip install -U langchain-huggingface
!pip install -qU langchain-chroma
!pip install -qU langchain
!pip install -qU langchain-groq
!pip install sentence-transformers
!pip install -q gradio

Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Downloading langchain_text_splitters-1.1.0-py3-none-any.whl (34 kB)
Installing collected packages: langchain-text-splitters
Successfully installed langchain-text-splitters-1.1.0
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain
  Downloading langchain-1.2.10-py3-none-any.whl.metadata (5.7 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain-core<2.0.0,>=1.0.1 (from langchain-community)
  Downloading langchain_core-1.2.12-

In [146]:
# DAY 1
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

In [147]:
loader = PyPDFLoader("../content/sample_data/assurance.pdf")
pdf = loader.load()

print(f"Document charg√© : {len(pdf)} pages trouv√©es.")

Document charg√© : 8 pages trouv√©es.


In [148]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,      # Taille cible de chaque morceau
    chunk_overlap = 150,    # Chevauchement
    separators = ["\n\n", "\n", " ", ""] # L'ordre de priorit√© pour couper
)

chunks = text_splitter.split_documents(pdf)

In [149]:
print(f"Nombre total de chunks pour {len(pdf)} pages : {len(chunks)}")

lengths = [len(c.page_content) for c in chunks]
import statistics
print(f"Taille moyenne : {statistics.mean(lengths)} caract√®res")

# Voir √† quelle page appartient le chunk n¬∞10
print(f"Le chunk 10 vient de la page : {chunks[10].metadata['page']}")

Nombre total de chunks pour 8 pages : 31
Taille moyenne : 803.7096774193549 caract√®res
Le chunk 10 vient de la page : 2


In [151]:
# DAY 2
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Cr√©ation de la base de donn√©es physique
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=hf_embeddings,
    persist_directory="./chroma_db1"
)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [152]:
question = "Quelles sont les garanties de mon contrat ?"
question_embedding = hf_embeddings.embed_documents([question])

# Demande les 'k' meilleurs r√©sultats
# Chroma fait l'embedding de la question et le calcul de similarit√©
docs = vectordb.similarity_search(question, k=1)

print(f"Question : {question}")
print("---")
print(f"R√©ponse: {docs[0].page_content}")

Question : Quelles sont les garanties de mon contrat ?
---
R√©ponse: Ce contrat est SANS TACITE reconduction, et √† renouveler chaque ann√©e avant le 1er septembre.
Il garantit l'assur√© en tant que locataire (ou occupant √† titre gratuit), de Appartement situ√©(e) √† l‚Äôadresse suivante :
2 route de Narbonne ‚Äì N¬∞ App: 167  - R√©sidence: R√©sidence Bordegrande ‚Äì Cplt adresse¬†:  ‚Äì Code postal et Ville¬†: 31320 AUZEVILLE-
TOLOSANE 
Il couvre les risques suivants¬†: 
- INCENDIE (sans franchise)
- DEGATS DES EAUX (sans franchise)
- PROTECTION JURIDIQUE
- INDEMNITE REDOUBLEMENT (garantie acquise uniquement pour les √©tudiants)
- INDIVIDUELLE ACCIDENT (garantie acquise uniquement pour les jeunes dipl√¥m√©s)
Pour toutes les questions relatives √† la gestion de votre contrat, √† vos d√©clarations de sinistre ou √† vos besoins d‚Äôassistance, vous pouvez
contacter le n¬∞ 03 20 33 09 33.
La compagnie atteste avoir pris connaissance du fait que les personnes assur√©es au contrat seront a

In [153]:
# DAY 3
from langchain_groq import ChatGroq
import os

llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    temperature=0.5
)

In [154]:
from langchain_classic.chains import RetrievalQA

# Transformer la base Chroma en "chercheur" (retriever)
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# Cr√©er la cha√Æne RAG
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff", # "stuff" veut dire : "donne tout le texte au LLM"
    retriever=retriever
)

In [155]:
response = qa_chain.invoke("Quelles sont les garanties de mon contrat ?")
print(response["result"])

D'apr√®s le contrat que vous avez fourni, voici les garanties qui sont mentionn√©es :

1. **INCENDIE** : sans franchise
2. **DEGATS DES EAUX** : sans franchise
3. **PROTECTION JURIDIQUE**
4. **INDEMNITE REDOUBLEMENT** : garantie acquise uniquement pour les √©tudiants
5. **INDIVIDUELLE ACCIDENT** : garantie acquise uniquement pour les jeunes dipl√¥m√©s

Il est √©galement mentionn√© que le contrat garantit les dommages que pourraient subir les biens de l'assur√© du fait du mat√©riel confi√© par son employeur, en cas de t√©l√©travail.


In [None]:
# DAY 4

In [156]:
import gradio as gr
from langchain_classic.memory import ConversationBufferMemory
from langchain_classic.chains import ConversationalRetrievalChain

# 1. Configurer la m√©moire
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key='answer'
)

In [157]:
# 2. Cr√©er la cha√Æne avec m√©moire
qa_chat = ConversationalRetrievalChain.from_llm(
    llm=llm, # LLM Groq
    retriever=retriever, # retriever Chroma
    memory=memory
)

In [158]:
# 3. Fonction pour l'interface Gradio
def chat_interactif(message, history):
    response = qa_chat.invoke({"question": message})
    return response["answer"]

In [159]:
# 4. Lancer l'interface
demo = gr.ChatInterface(
    fn=chat_interactif,
    title="üìö Mon Assistant PDF Intelligent",
    description="Posez des questions sur votre document, je m'en souviendrai !",
    examples=["Fais-moi un r√©sum√©", "Quels sont les points cl√©s ?"],
    theme="soft"
)

  self.chatbot = Chatbot(


In [160]:
demo.launch(share=True) # share=True cr√©e un lien public de 72h !

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e7666ad8c1ef28f09c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# DAY 5

In [None]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key='answer',
    input_key='question'
)

memory.clear()
# D√©finir les instructions (Prompt)
instruction_prompt = ChatPromptTemplate.from_messages([
    ("system", (
        "DOCUMENTS DE R√âF√âRENCE :\n"
        "{context}\n\n"
        "CONSIGNE:\n"
        "1. Ne r√©ponds QUE si la r√©ponse est √©crite dans les 'DOCUMENTS DE R√âF√âRENCE' ci-dessus.\n"
        "2. Si tu ne trouves pas la r√©ponse, r√©ponds 'Je ne sais pas, cette information n'est pas dans le document."
        ". Sinon, donne la r√©ponse DIRECTEMENT, sans expliquer tes consignes ni tes limites."
        "3. INTERDICTION d'utiliser tes propres connaissances.\n"
    )),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
])

qa_chat = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": instruction_prompt},
    # forcer l'historique √† rester une liste
    get_chat_history=lambda h: h
)

In [None]:
def chat_interactif(message, history):
    # Appeler la cha√Æne pour obtenir le r√©sultat complet
    result = qa_chat.invoke({"question": message})

    # Extraire le texte de la r√©ponse
    reponse_texte = result.get("answer", "Pas de r√©ponse trouv√©e.")

    if "ne sais pas" in reponse_texte.lower():
        return reponse_texte

    # Extraire les documents sources
    sources = result.get("source_documents", [])

    # V√©rifier la pr√©sence de sources pour extraire les pages
    if sources:
        # R√©cup√©rer les num√©ros de pages uniques et les trier
        pages = sorted(list(set([str(int(d.metadata.get("page", 0)) + 1) for d in sources])))

        # Formater et ajouter les pages √† la fin du texte
        suffixe_source = f"\n\n(Source : Page {', '.join(pages)})"
        reponse_texte += suffixe_source

    return reponse_texte

In [None]:
#chat_interactif("fais moi r√©sume", [])

In [None]:
demo = gr.ChatInterface(
    fn=chat_interactif,
    title="üìö Assistant PDF Pro",
    description="Analyse de documents avec citations de sources",
    examples=["Fais-moi un r√©sum√©", "Quels sont les points cl√©s ?"],
    theme="soft"
)
demo.launch(share=True)

  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d6b2094b44060d2ec4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# DAY 6

In [127]:
from langchain_groq import ChatGroq
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import shutil
import gradio as gr
from langchain_classic.memory import ConversationBufferMemory
from langchain_classic.chains import ConversationalRetrievalChain
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_classic.chains import RetrievalQA
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import re

In [128]:


llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    temperature=0.5
)

In [129]:
instruction_prompt = ChatPromptTemplate.from_messages([
    ("system", (
        "DOCUMENTS DE R√âF√âRENCE :\n"
        "{context}\n\n"
        "CONSIGNE:\n"
        "1. Ne r√©ponds QUE si la r√©ponse est √©crite dans les 'DOCUMENTS DE R√âF√âRENCE' ci-dessus.\n"
        "2. Si tu ne trouves pas la r√©ponse, r√©ponds 'Je ne sais pas, cette information n'est pas dans le document."
        ". Sinon, donne la r√©ponse DIRECTEMENT, sans expliquer tes consignes ni tes limites."
        "3. INTERDICTION d'utiliser tes propres connaissances.\n"
    )),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
])

In [171]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key='answer',
    input_key='question'
)

memory.clear()
# D√©finir les instructions (Prompt)
instruction_prompt = ChatPromptTemplate.from_messages([
    ("system", (
        "Tu es un expert en analyse de documents. Ton r√¥le est de fournir des r√©sum√©s clairs, "
        #"pr√©cis et structur√©s √† partir du contexte fourni ci-dessous.\n\n"
        #"CONTEXTE :\n{context}\n\n"
        #"DIRECTIVES :\n"
        #"1. Agis comme un analyste senior : synth√©tise les points cl√©s, les noms, les dates et les montants importants.\n"
        #"2. Si l'information demand√©e n'est pas pr√©sente dans le contexte, dis simplement que tu ne sais pas. N'invente jamais d'informations.\n"
        #"3. Ignore les √©l√©ments r√©p√©titifs de mise en page (comme les adresses de pied de page ou les mentions l√©gales r√©currentes) pour te concentrer sur le fond du document.\n"
        #"4. R√©ponds de mani√®re professionnelle en utilisant des listes √† puces pour la clart√©."
    )),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
])

# Initialisation vide au d√©marrage
qa_chat = None
vectordb = None

In [172]:
def process_pdf(file):
    global qa_chat, vectordb

    qa_chat = None
    vectordb = None

    # Lib√©rer les fichiers sur le disque
    import gc
    gc.collect()

    base_name = os.path.basename(file.name)
    clean_name = re.sub(r'[^\w\s-]', '', base_name.replace(".pdf", "")).strip()
    db_path = f"./chroma_{clean_name}"

    # 1. Charger le document upload√©
    loader = PyPDFLoader(file.name)
    documents = loader.load()

    # 2. D√©couper en segments
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150, separators = ["\n\n", "\n", " ", ""])
    chunks = text_splitter.split_documents(documents)

    # 3. Cr√©er mod√®le embeddings
    hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # 4. Cr√©er la base de donn√©es en m√©moire (ou √©craser la pr√©c√©dente)
    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=hf_embeddings,
        persist_directory=db_path
    )

    # 5. Mettre √† jour le retriever de ton qa_chat existant
    qa_chat = ConversationalRetrievalChain.from_llm(
        llm=llm,
        memory=memory,
        retriever = vectordb.as_retriever(search_kwargs={"k": 3}),
        combine_docs_chain_kwargs={"prompt": instruction_prompt},
        # forcer l'historique √† rester une liste
        get_chat_history=lambda h: h
    )

    return "PDF analys√© avec succ√®s !"

In [165]:
def chat_interactif(message, history):
    # Lancer la recherche
    result = qa_chat.invoke({"question": message, "chat_history": history})
    reponse_texte = result.get("answer", "")

    # Mettre √† jour l'historique
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": reponse_texte})

    # Renvoyer les donn√©es
    return "", history

In [166]:
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ü§ñ Mon Expert PDF Intelligent")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="T√©l√©verser votre PDF (Assurance, Contrat, etc.)")
            upload_button = gr.Button("Analyser le document", variant="primary")
            status_label = gr.Label(value="En attente de document...")

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(type="messages")
            msg = gr.Textbox(label="Posez votre question sur le document")
            clear = gr.ClearButton([msg, chatbot])

    # √âv√©nement pour l'upload
    upload_button.click(process_pdf, inputs=[file_input], outputs=[status_label])

    # √âv√©nement pour le chat (ta fonction chat_interactif actuelle)
    msg.submit(chat_interactif, [msg, chatbot], [msg, chatbot])


demo.launch()

  with gr.Blocks(theme=gr.themes.Soft()) as demo:
  chatbot = gr.Chatbot(type="messages")


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5aa6e477ade67234dd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [135]:
# DAY 7

In [167]:
def chat_interactif(message, history):
    # 1. S√©curit√©
    if qa_chat is None:
        history.append([message, "‚ö†Ô∏è Veuillez d'abord charger un PDF."])
        return "", history

    # 2. Lancer la recherche
    # Note : LangChain Conversational Chain attend souvent une liste de tuples pour history
    result = qa_chat.invoke({"question": message, "chat_history": history})
    reponse_texte = result.get("answer", "")

    # 3. Mettre √† jour l'historique au format GRADIO [[user, bot]]
    history.append([message, reponse_texte])

    # 4. Renvoyer le message vide (pour vider la barre de texte) et l'historique
    return "", history

In [174]:
gr.close_all()

Closing server running on port: 7863
Closing server running on port: 7860
Closing server running on port: 7874
Closing server running on port: 7861
Closing server running on port: 7862
Closing server running on port: 7874
Closing server running on port: 7876
Closing server running on port: 7872
Closing server running on port: 7875
Closing server running on port: 7876
Closing server running on port: 7868
Closing server running on port: 7877
Closing server running on port: 7865
Closing server running on port: 7874
Closing server running on port: 7875
Closing server running on port: 7864
Closing server running on port: 7873
Closing server running on port: 7866
Closing server running on port: 7870
Closing server running on port: 7871
Closing server running on port: 7867
Closing server running on port: 7875
Closing server running on port: 7877
Closing server running on port: 7869
Closing server running on port: 7877


In [None]:
# INTERFACE GRADIO
# CSS pour am√©liorer les bulles de chat
css = """
footer {visibility: hidden}
.gradio-container {background-color: #f7f9fc}
#chatbot-style {height: 500px !important; overflow-y: auto;}
"""

with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), css=css) as demo:
    gr.HTML("<center><h1>üìë Assistant PDF Intelligent</h1><p>Analyse experte de vos documents</p></center>")

    with gr.Row():
        # Colonne Gauche : Upload & Statut
        with gr.Column(scale=1):
            gr.Markdown("### üì• 1. Document")
            file_input = gr.File(label="D√©posez votre PDF", file_types=[".pdf"])
            status = gr.Textbox(label="Statut syst√®me", interactive=False)

            gr.Markdown("### ‚ö° 2. Actions Rapides")
            btn_sum = gr.Button("üìù R√©sumer le document", variant="secondary")
            btn_key = gr.Button("üîë Points Cl√©s", variant="secondary")
            btn_clr = gr.ClearButton(value="üóëÔ∏è Effacer l'historique") # Bouton effacer int√©gr√©


        # Colonne Droite : Chatbot
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                elem_id="chatbot-style",
                bubble_full_width=False,
                show_label=False,
                show_copy_button=True
            )
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Posez votre question ici...",
                    label=None,
                    scale=7,
                    container=False
                )
                submit_btn = gr.Button("Envoyer", variant="primary", scale=1)

    # --- ACTIONS ---

    # Association du bouton effacer
    btn_clr.add([msg, chatbot])

    # Upload d√©clenche l'analyse
    file_input.upload(fn=process_pdf, inputs=[file_input], outputs=[status])

    # Envoi du message (Entr√©e ou Bouton)
    msg.submit(fn=chat_interactif, inputs=[msg, chatbot], outputs=[msg, chatbot])
    submit_btn.click(fn=chat_interactif, inputs=[msg, chatbot], outputs=[msg, chatbot])

    # Actions Rapides
    btn_sum.click(lambda h: chat_interactif("Peux-tu me faire un r√©sum√© concis de ce document ?", h),
                  inputs=[chatbot], outputs=[msg, chatbot])

    btn_key.click(lambda h: chat_interactif("Quelles sont les dates importantes et les chiffres cl√©s ?", h),
                  inputs=[chatbot], outputs=[msg, chatbot])

# Lancement
demo.launch(debug=True, share=True)

  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), css=css) as demo:
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), css=css) as demo:
  chatbot = gr.Chatbot(
  chatbot = gr.Chatbot(
  chatbot = gr.Chatbot(
  chatbot = gr.Chatbot(
  btn_clr.add([msg, chatbot])


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://91fb9e9679e2921ce0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/protocols/http/httptools_impl.py", line 416, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/fastapi/applications.py", line 1138, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/applications.py", line 107, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/error

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/gradio/queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 2191, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 1698, 