Importer les librairies dans le notebook

In [32]:
pip install ollama

Collecting ollama
  Using cached ollama-0.6.1-py3-none-any.whl.metadata (4.3 kB)
Collecting httpx>=0.27 (from ollama)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting anyio (from httpx>=0.27->ollama)
  Downloading anyio-4.12.0-py3-none-any.whl.metadata (4.3 kB)
Collecting httpcore==1.* (from httpx>=0.27->ollama)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx>=0.27->ollama)
  Using cached h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Using cached ollama-0.6.1-py3-none-any.whl (14 kB)
Using cached httpx-0.28.1-py3-none-any.whl (73 kB)
Using cached httpcore-1.0.9-py3-none-any.whl (78 kB)
Downloading anyio-4.12.0-py3-none-any.whl (113 kB)
   ---------------------------------------- 0.0/113.4 kB ? eta -:--:--
   --- ------------------------------------ 10.2/113.4 kB ? eta -:--:--
   ---------- ---------------------------- 30.7/113.4 kB 660.6 kB/s eta 0:00:01
   ------------------------ -------------- 7


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [59]:
import ollama
import psycopg
from psycopg import Cursor
import os
import glob

Configuration

In [60]:
data_dir = os.path.join(os.path.dirname(os.getcwd()), "data")

OLLAMA_MODEL = "mistral:latest"

db_connection_str = "postgresql://postgres:postgres@localhost:5433/ai"

print(f"‚úì Dossier de donn√©es : {data_dir}")
print(f"‚úì Mod√®le Ollama : {OLLAMA_MODEL}")
print(f"‚úì Configuration charg√©e")


‚úì Dossier de donn√©es : c:\Users\ahmwa\OneDrive\Bureau\Projets\AI\Chatbot\data
‚úì Mod√®le Ollama : mistral:latest
‚úì Configuration charg√©e


Test de connexion Ollama

In [61]:
try:
    models = ollama.list()
    
    # Extraire correctement tous les noms disponibles
    if isinstance(models, dict) and 'models' in models:
        model_names = [m.get('name') or m.get('model') for m in models['models']]
    elif isinstance(models, list):
        model_names = [m.get('name') or m.get('model') for m in models]
    else:
        model_names = []

    if OLLAMA_MODEL in model_names:
        print(f"‚úì Mod√®le {OLLAMA_MODEL} disponible")
    else:
        print(f"‚ö†Ô∏è  Mod√®le {OLLAMA_MODEL} non trouv√©")
        print("   Mod√®les disponibles :", ", ".join(model_names))

    # Test embeddings
    print("\nüß™ Test d'embedding...")
    test_response = ollama.embeddings(model=OLLAMA_MODEL, prompt="test")
    print(f"‚úì Ollama fonctionne correctement")

except Exception as e:
    print(f"‚ùå Erreur de connexion √† Ollama : {e}")


‚ö†Ô∏è  Mod√®le mistral:latest non trouv√©
   Mod√®les disponibles : 

üß™ Test d'embedding...
‚úì Ollama fonctionne correctement


Fonctions utilitaires

In [None]:
def create_conversation_list(file_path: str) -> list[str]:
    """Lit le fichier avec le bon encodage et filtre les lignes"""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
    except UnicodeDecodeError:
        print(f"‚ö†Ô∏è  Encodage UTF-8 invalide pour {file_path} ‚Äî r√©essayage avec cp1252")
        with open(file_path, "r", encoding="cp1252", errors="replace") as file:
            text = file.read()

    text_list = text.split("\n")
    filtered_list = [
        chaine.removeprefix("     ")
        for chaine in text_list
        if not chaine.startswith("<") and chaine.strip()
    ]
    print(f"‚úì {len(filtered_list)} lignes extraites")
    return filtered_list


def calculate_embeddings(corpus: str) -> list[float]:
    """Calcule les embeddings avec Ollama"""
    if not corpus or not corpus.strip():
        raise ValueError("Le corpus ne peut pas √™tre vide")
    
    # Appel √† Ollama pour g√©n√©rer l'embedding
    response = ollama.embeddings(
        model=OLLAMA_MODEL,
        prompt=corpus
    )
    return response["embedding"]


def save_embedding(corpus: str, embedding: list[float], cursor: Cursor) -> None:
    """Sauvegarde le corpus et son embedding"""
    cursor.execute(
        '''INSERT INTO embeddings (corpus, embedding) VALUES (%s, %s)''',
        (corpus, embedding)
    )


def similar_corpus(input_corpus: str, db_connection_str: str, top_k: int = 5) -> list[tuple]:
    """
    Recherche les textes similaires dans la base de donn√©es
    """
    query_embedding = calculate_embeddings(input_corpus)
    
    with psycopg.connect(db_connection_str) as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT id, corpus, 
                       1 - (embedding <=> %s::vector) as similarity
                FROM embeddings
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (query_embedding, query_embedding, top_k))
            
            return cur.fetchall()

print("‚úì Fonctions d√©finies")

‚úì Fonctions d√©finies


Cr√©ation de la base de donn√©es

In [63]:
print("=" * 70)
print("üöÄ CR√âATION DE LA BASE D'EMBEDDINGS")
print("=" * 70)

with psycopg.connect(db_connection_str) as conn:
    conn.autocommit = True
    with conn.cursor() as cur:
        # Supprimer la table si elle existe
        cur.execute("""DROP TABLE IF EXISTS embeddings""")
        print("‚úì Table existante supprim√©e")
        
        # Cr√©er l'extension pgvector
        cur.execute("""CREATE EXTENSION IF NOT EXISTS vector""")
        print("‚úì Extension pgvector cr√©√©e")
        
        # Cr√©er la table avec VECTOR(768)
        cur.execute("""
            CREATE TABLE IF NOT EXISTS embeddings (
                id SERIAL PRIMARY KEY, 
                corpus TEXT,
                embedding VECTOR(768)
            )
        """)
        print("‚úì Table embeddings cr√©√©e avec VECTOR(768)")
        
        # Cr√©er un index pour acc√©l√©rer les recherches
        cur.execute("""
            CREATE INDEX IF NOT EXISTS embeddings_embedding_idx 
            ON embeddings USING ivfflat (embedding vector_cosine_ops)
            WITH (lists = 100)
        """)
        print("‚úì Index de recherche cr√©√©")

üöÄ CR√âATION DE LA BASE D'EMBEDDINGS
‚úì Table existante supprim√©e
‚úì Extension pgvector cr√©√©e
‚úì Table embeddings cr√©√©e avec VECTOR(768)
‚úì Index de recherche cr√©√©


Chargement des fichiers

In [None]:
print("\n" + "=" * 70)
print("üìÇ CHARGEMENT DES DONN√âES")
print("=" * 70)

text_files = sorted(glob.glob(os.path.join(data_dir, "*.txt")))

if not text_files:
    print(f"‚ö†Ô∏è  Aucun fichier .txt trouv√© dans le dossier {data_dir}")
else:
    print(f"‚úì {len(text_files)} fichier(s) trouv√©(s)")
    for i, file in enumerate(text_files[:5], 1):  # Afficher les 5 premiers
        print(f"  {i}. {os.path.basename(file)}")
    if len(text_files) > 5:
        print(f"  ... et {len(text_files) - 5} autre(s)")


üìÇ CHARGEMENT DES DONN√âES
‚úì 41 fichier(s) trouv√©(s)
  1. 017_00000012.txt
  2. 018_00000013.txt
  3. 019_00000014.txt
  4. 020_00000015.txt
  5. 022_00000017.txt
  ... et 36 autre(s)


Traitement des embeddings

In [65]:
print("\n" + "=" * 70)
print("‚öôÔ∏è  TRAITEMENT DES EMBEDDINGS")
print("=" * 70)

success_count = 0
error_count = 0

with psycopg.connect(db_connection_str) as conn:
    conn.autocommit = True
    with conn.cursor() as cur:
        total_files = len(text_files)
        
        for file_idx, file_path in enumerate(text_files, 1):
            print(f"\nüî∏ Traitement du fichier [{file_idx}/{total_files}] : {os.path.basename(file_path)}")
            corpus_list = create_conversation_list(file_path=file_path)

            for i, corpus in enumerate(corpus_list, 1):
                try:
                    embedding = calculate_embeddings(corpus)
                    save_embedding(corpus=corpus, embedding=embedding, cursor=cur)
                    success_count += 1
                
                    # Afficher un aper√ßu
                    preview = corpus[:50] + "..." if len(corpus) > 50 else corpus
                    print(f"‚úì [{i}/{len(corpus_list)}] {preview}")
                
                except Exception as e:
                    error_count += 1
                    print(f"‚úó [{i}/{len(corpus_list)}] ERREUR: {e}")


‚öôÔ∏è  TRAITEMENT DES EMBEDDINGS

üî∏ Traitement du fichier [1/41] : 017_00000012.txt
‚ö†Ô∏è  Encodage UTF-8 invalide pour c:\Users\ahmwa\OneDrive\Bureau\Projets\AI\Chatbot\data\017_00000012.txt ‚Äî r√©essayage avec cp1252
‚úì 43 lignes extraites
‚úó [1/43] ERREUR: expected 768 dimensions, not 4096
‚úó [2/43] ERREUR: expected 768 dimensions, not 4096
‚úó [3/43] ERREUR: expected 768 dimensions, not 4096
‚úó [4/43] ERREUR: expected 768 dimensions, not 4096
‚úó [5/43] ERREUR: expected 768 dimensions, not 4096
‚úó [6/43] ERREUR: expected 768 dimensions, not 4096
‚úó [7/43] ERREUR: expected 768 dimensions, not 4096
‚úó [8/43] ERREUR: expected 768 dimensions, not 4096
‚úó [9/43] ERREUR: expected 768 dimensions, not 4096
‚úó [10/43] ERREUR: expected 768 dimensions, not 4096
‚úó [11/43] ERREUR: expected 768 dimensions, not 4096
‚úó [12/43] ERREUR: expected 768 dimensions, not 4096
‚úó [13/43] ERREUR: expected 768 dimensions, not 4096
‚úó [14/43] ERREUR: expected 768 dimensions, not 4096
‚úó

KeyboardInterrupt: 