In [1]:
#inputs
file_path = "data/Code_penal_small.pdf"


In [2]:
#Loading documents
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader(file_path)
docs = loader.load()
print(len(docs))


2


In [3]:
#Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

len(all_splits)

3

In [4]:
#Embeddings api creds
import getpass
import os

if not os.environ.get("GOOGLE_API_KEY"):
  os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

### import db and only add to db if not exists

In [5]:
#Hashing chunks
import hashlib
def get_id(split):
    text = str(split)
    hash_md5 = hashlib.md5(text.encode()).hexdigest()
    return(hash_md5)

In [6]:
#initialize db
from langchain_chroma import Chroma
#input
db_path = "./chroma_langchain_db"
collection_name = 'chunk_test'
#body
vector_store = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    persist_directory=db_path,  # Where to save data locally, remove if not necessary
)

In [7]:
#import db from local
import chromadb

# # Connexion à ChromaDB (par défaut en local)
# client = chromadb.Client()

# Ou pour une base persistante
client = chromadb.PersistentClient(path=db_path)

# Récupérer ou créer une collection
collection = client.get_or_create_collection(collection_name)

In [8]:
def add_to_db(collection, list_of_documents):
    all_data = collection.get()
    existing_ids = set(all_data['ids'])
    for e in list_of_documents:
        e_id = get_id(e)
        if e_id in existing_ids:
            print("Cet ID existe déjà .... skipping")
        else:
            # Extraire le contenu texte ET les métadonnées
            collection.add(
                documents=[e.page_content],  # Le texte
                ids=[e_id],
                metadatas=[e.metadata]       # Les métadonnées (optionnel)
            )    

In [9]:
# Supprimer l'ancienne collection
client.delete_collection(collection_name)

# Recréer avec le bon modèle d'embedding
collection = client.get_or_create_collection(collection_name)

In [10]:
#usage add to db a list of files
add_to_db(collection=collection, list_of_documents=all_splits)

In [11]:
#dump into a pickle
import pickle

# Récupérer toutes les données de la collection
all_data = collection.get(include=['documents', 'metadatas', 'embeddings'])

# Sauver en pickle
with open('ma_db.pkl', 'wb') as f:
    pickle.dump(all_data, f)

In [12]:
import pickle
import chromadb

def merge_pickles_to_db(pickle_files, output_collection):
    """Merge plusieurs pickles dans une collection ChromaDB"""
    
    all_docs = []
    all_ids = []
    all_metadatas = []
    all_embeddings = []
    
    for pickle_file in pickle_files:
        print(f"Chargement de {pickle_file}...")
        
        with open(pickle_file, 'rb') as f:
            data = pickle.load(f)
        
        # Combiner les données
        all_docs.extend(data['documents'])
        all_ids.extend(data['ids'])
        all_metadatas.extend(data['metadatas'] or [None] * len(data['documents']))
        
        if data['embeddings']:
            all_embeddings.extend(data['embeddings'])
    
    # Ajouter tout à la collection finale
    if all_embeddings:
        output_collection.add(
            documents=all_docs,
            ids=all_ids,
            metadatas=all_metadatas,
            embeddings=all_embeddings
        )
    else:
        output_collection.add(
            documents=all_docs,
            ids=all_ids,
            metadatas=all_metadatas
        )
    
    print(f"Mergé {len(all_docs)} documents!")


In [13]:

# Usage
pickle_files = ['db1.pkl', 'db2.pkl', 'db3.pkl']
client = chromadb.PersistentClient(path="merged_db")
merged_collection = client.get_or_create_collection("merged_collection")

merge_pickles_to_db(pickle_files, merged_collection)

Chargement de db1.pkl...


FileNotFoundError: [Errno 2] No such file or directory: 'db1.pkl'