In [None]:
# Installation des packages
import os
import sys
! pip install langchain 
! pip install pypdf
!pip install sentence-transformers
!pip install chromadb
!{sys.executable} -m pip install --upgrade pip setuptools wheel
!{sys.executable} -m pip install --disable-pip-version-check torch torchdata
!{sys.executable} -m pip install transformers==4.27.2 
!pip install -U datasets==2.14.6
!pip install fsspec==2023.9.2
!pip install lark

In [None]:
# Importation
import torch
import transformers
from datasets import load_dataset
from langchain import HuggingFacePipeline
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
from transformers import pipeline

In [None]:
# Partie split

In [None]:
# Load the source text file; `load()` returns a list of documents.
# NOTE(review): no encoding is passed to TextLoader - confirm that the
# platform default decodes "bdc.txt" (accented characters) correctly.
loader = TextLoader("bdc.txt")
pages_txt = loader.load()

In [None]:
# (markdown marker, metadata key) pairs handed to the header splitter.
# NOTE(review): in Markdown "#" is the top-level heading, yet here it is
# labelled "Header 3" while "###" is labelled "Header 1" - confirm this
# matches how bdc.txt is actually structured.
headers_to_split_on = list(
    zip(
        ("###", "##", "#"),
        ("Header 1", "Header 2", "Header 3"),
    )
)

In [None]:
# Splitter configured with the (marker, metadata key) pairs defined above.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [None]:
# Split the text of the first loaded document; any other entries of
# `pages_txt` are not used here.
md_header_splits = markdown_splitter.split_text(
    pages_txt[0].page_content
)

In [None]:
# Number of chunks produced by the header split.
print(len(md_header_splits))

In [None]:
# Print the character length of every chunk, one line per chunk.
for position, chunk in enumerate(md_header_splits):
    print(f'Contenu n° {position} : {len(chunk.page_content)}')

In [None]:
# Partie embedding

In [None]:
# Embedding model built with the library defaults (no arguments are passed).
# NOTE(review): the questions asked later are in French - confirm that the
# default embedding model is suitable for French text.
embeddings = HuggingFaceEmbeddings()

In [None]:
# Directory handed to Chroma as its on-disk location (relative path).
persist_directory = "chroma/"

In [None]:
# Vider le dossier à la main ou utiliser cette ligne si besoin
!rm -rf ./docs/chroma  # remove old database files if any

In [None]:
# Build the Chroma vector store from the header-split chunks, embedding them
# with `embeddings` and using `persist_directory` as the storage location.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embeddings,
    documents=md_header_splits,
)

In [None]:
# Number of entries currently in the underlying collection.
# NOTE(review): `_collection` is a private attribute of the vector store and
# may change between library versions.
print(vectordb._collection.count())

In [None]:
# Sample user question used for the similarity and MMR searches below.
question = "Je n'arrive pas à obtenir mon avis de situation"

In [None]:
# Plain similarity search: ask for the 5 closest chunks to the question.
docs = vectordb.similarity_search(
    question,
    k=5,
)

In [None]:
# Persist the vector store (created with `persist_directory`) so it can be
# reused later.
vectordb.persist()

In [None]:
# Use MMR: a balance between relevance and diversity of the retrieved
# documents. 3 candidates are fetched and 2 of them are returned.
docs_mmr = vectordb.max_marginal_relevance_search(
    question,
    k=2,
    fetch_k=3,
)

In [None]:
# Automatically use the information stored in the metadata:
# `SelfQueryRetriever` uses an LLM to extract:
# 1. The `query` string to use for the vector search
# 2. A metadata filter to pass in as well

In [None]:
# It is important to fill in "description" carefully, since the LLM reads it.
# The attribute names are the same metadata keys used in
# `headers_to_split_on`; every attribute is declared as a string.
metadata_field_info = [
    AttributeInfo(name=header_key, description=header_description, type="string")
    for header_key, header_description in (
        ("Header 1", "Le thème général auquel la question se rattache"),
        ("Header 2", "La catégorie au sein du thème général"),
        ("Header 3", "La sous-catégorie à laquelle la question est rattachée"),
    )
]

In [None]:
# Build the LLM and the self-querying retriever.
# `HuggingFacePipeline` is imported in the notebook's import cell.
# model='gpt2' was tried first --> it exceeded the max token limit.
document_content_description = "Questions les plus fréquentes"

# Text-generation pipeline wrapped so LangChain can use it as an LLM.
# NOTE(review): trust_remote_code=True lets transformers run code shipped
# with the model repository - keep it only for repositories you trust.
llm = HuggingFacePipeline(
    pipeline=pipeline(
        'text-generation',
        model='databricks/dolly-v2-3b',
        torch_dtype=torch.bfloat16,
        max_length=2000,
        trust_remote_code=True,
    )
)

# The retriever gets the LLM, the vector store, a description of the stored
# content and the metadata attribute descriptions.
# NOTE(review): confirm that `handle_parsing_errors` is an option the
# installed SelfQueryRetriever actually honours.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True,
    handle_parsing_errors=True,
)

In [None]:
# New sample question for the self-query retriever; this rebinds the
# `question` variable assigned earlier in the notebook.
question = "Je n'arrive pas à obtenir ma fiche Insee"

In [None]:
# Retrieve documents through the self-query retriever; this rebinds `docs`,
# replacing the earlier similarity-search result.
docs = retriever.get_relevant_documents(question)