In [None]:
!pip install -U langchain-text-splitters
!pip install -U langchain-community langchain-text-splitters langchain
!pip install pypdf
!pip install -U langchain-huggingface
!pip install -qU langchain-chroma
!pip install -qU langchain
!pip install -qU langchain-groq
!pip install sentence-transformers
!pip install -q gradio

In [None]:
# DAY 1
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Load the insurance PDF from the sample_data folder.
# The list returned by load() is reported below as one entry per page.
loader = PyPDFLoader("../content/sample_data/assurance.pdf")
pdf = loader.load()

# User-facing message: accents restored (the literal was mis-encoded).
print(f"Document chargé : {len(pdf)} pages trouvées.")

In [None]:
# Chunking configuration for the loaded PDF.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],  # tried in this priority order when cutting
    chunk_overlap=150,                   # overlap between consecutive chunks
    chunk_size=1000,                     # target size of each chunk
)

# Split every loaded document into chunks.
chunks = text_splitter.split_documents(pdf)

In [None]:
import statistics

print(f"Nombre total de chunks pour {len(pdf)} pages : {len(chunks)}")

# statistics.mean() raises StatisticsError on an empty sequence, so the
# average is only reported when at least one chunk exists.
lengths = [len(c.page_content) for c in chunks]
if lengths:
    print(f"Taille moyenne : {statistics.mean(lengths)} caractères")

# Show which page chunk n°10 comes from, but only when the document actually
# produced that many chunks (a short PDF would otherwise raise IndexError).
if len(chunks) > 10:
    print(f"Le chunk 10 vient de la page : {chunks[10].metadata['page']}")
else:
    print(f"Le chunk 10 n'existe pas : seulement {len(chunks)} chunks.")

In [None]:
# DAY 2
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Embedding model used to vectorise the chunks (and reused for queries later).
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build the on-disk vector store in ./chroma_db.
# Stable, index-based ids are supplied so that re-running this cell rewrites
# the same records rather than appending a second copy of every chunk to the
# persisted collection (without ids, fresh random ones are generated each run).
# NOTE(review): relies on langchain-chroma upserting on existing ids — confirm
# for the installed version. If the document is later re-split into fewer
# chunks, records with higher indexes stay in ./chroma_db until it is deleted.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=hf_embeddings,
    ids=[f"assurance-chunk-{i}" for i in range(len(chunks))],
    persist_directory="./chroma_db"
)

In [None]:
question = "Quelles sont les garanties de mon contrat ?"

# Illustration only (the search below does not use this value).
# embed_query() is the single-query entry point of the embeddings object and
# returns one vector, whereas embed_documents() is the batch API for stored texts.
question_embedding = hf_embeddings.embed_query(question)

# Ask for the k best matches.
# Chroma embeds the question itself and performs the similarity computation.
docs = vectordb.similarity_search(question, k=1)

print(f"Question : {question}")
print("---")
# Guard against an empty result (e.g. an empty collection) before indexing.
if docs:
    print(f"Réponse: {docs[0].page_content}")
else:
    print("Réponse: aucun passage trouvé.")

In [None]:
# DAY 3
from langchain_groq import ChatGroq
import os


# No api_key is passed to ChatGroq, so it has to come from the environment.
# When GROQ_API_KEY is not set, ask for it interactively so the secret is
# never written into the notebook.
# NOTE(review): assumes ChatGroq reads GROQ_API_KEY — confirm in its API docs.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY : ")

# Low temperature to keep answers close to the retrieved context.
llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    temperature=0.2
)

In [None]:
from langchain_classic.chains import RetrievalQA

# Expose the Chroma store as a retriever that returns the 3 best chunks.
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

# Build the RAG chain. chain_type "stuff" means: hand all the retrieved text
# to the LLM.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="stuff",
    llm=llm,
)

In [None]:
# Run the RAG chain on one question, passing the input under its explicit
# "query" key, then print the generated answer.
response = qa_chain.invoke(
    {"query": "Quelles sont les garanties de mon contrat ?"}
)
print(response["result"])

In [None]:
# DAY 4

In [None]:
import gradio as gr
from langchain_classic.memory import ConversationBufferMemory
from langchain_classic.chains import ConversationalRetrievalChain

# 1. Set up the conversation memory.
# The stored history is exposed under the "chat_history" key, with
# return_messages=True.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

In [None]:
# 2. Build the conversational chain: Groq LLM + Chroma retriever + the
# `memory` object holding the chat history.
qa_chat = ConversationalRetrievalChain.from_llm(
    memory=memory,
    retriever=retriever,
    llm=llm,
)

In [None]:
# 3. Callback for the Gradio interface
def chat_interactif(message, history):
    """Answer one chat message with the conversational RAG chain.

    Args:
        message: Text typed by the user in the Gradio chat box.
        history: Previous turns as supplied by Gradio. Only used to detect
            the start of a new conversation; the chain keeps its own memory.

    Returns:
        The chain's answer as a string, or a short prompt when the message
        is blank.
    """
    # Skip the retrieval + LLM round-trip for blank input.
    if not message or not message.strip():
        return "Veuillez poser une question."

    # `memory` is a single module-level object. An empty Gradio history means
    # a new conversation is starting (first message or cleared chat), so drop
    # the turns left over from the previous one instead of leaking them into
    # this conversation.
    # NOTE(review): the memory is still shared between concurrent sessions;
    # per-session memory would require restructuring the chain setup.
    if not history:
        memory.clear()

    response = qa_chat.invoke({"question": message})
    return response["answer"]

In [None]:
# 4. Build the chat interface around the callback.
# User-facing strings: the emoji and accented characters were mis-encoded
# and are restored here.
demo = gr.ChatInterface(
    fn=chat_interactif,
    title="📚 Mon Assistant PDF Intelligent",
    description="Posez des questions sur votre document, je m'en souviendrai !",
    examples=["Fais-moi un résumé", "Quels sont les points clés ?"],
    theme="soft"
)

In [None]:
# Start the Gradio app.
# NOTE(review): the public link lets anyone who has it use this chatbot; the
# 72h lifetime quoted below comes from the original note — confirm for the
# installed Gradio version.
demo.launch(share=True) # share=True creates a public link (stated lifetime: 72h)