In [4]:
# rag.py file

from dotenv import load_dotenv
import psycopg2
import os

# Load environment variables
load_dotenv()

# Open the PostgreSQL connection; every setting is read from an environment variable.
conn = psycopg2.connect(
    **{
        option: os.getenv(env_name)
        for option, env_name in (
            ("database", "PGDATABASE"),
            ("user", "PGUSER"),
            ("password", "PGPASSWORD"),
            ("host", "PGHOST"),
            ("port", "PGPORT"),
        )
    }
)

# Shared cursor used by later cells to run SQL statements.
cur = conn.cursor()

In [5]:
# Health check: run a trivial query to confirm the database connection works.
# A dedicated cursor name is used on purpose: `with conn.cursor() as ...` closes
# the cursor when the block exits, so binding it to `cur` would leave the shared
# cursor closed for every later cell.
try:
    with conn.cursor() as health_cur:
        health_cur.execute("SELECT 1")
    print({"status": "healthy", "database": "connected"})
except Exception as e:
    # Report the failure instead of raising so the status is visible in the output.
    print({"status": "unhealthy", "database": str(e)})


{'status': 'healthy', 'database': 'connected'}


In [6]:
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector

# Embedding model client; the API key and base URL are read from SCW_* environment variables.
embeddings = OpenAIEmbeddings(
    **{
        "model": "sentence-transformers/sentence-t5-xxl",
        "tiktoken_enabled": False,
        "openai_api_base": os.getenv("SCW_GENERATIVE_APIs_ENDPOINT"),
        "openai_api_key": os.getenv("SCW_SECRET_KEY"),
    }
)

In [7]:
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector

In [8]:
# Build the SQLAlchemy URL from the parameters of the live psycopg2 connection.
# The user and password are percent-encoded so that characters such as "@",
# ":", "/" or "%" in the credentials cannot be mistaken for URL delimiters.
# `quote(..., safe="")` is used (rather than `quote_plus`) so spaces become
# "%20" and a literal "+" becomes "%2B", which decode unambiguously.
from urllib.parse import quote

connection_string = (
    "postgresql+psycopg2://"
    f"{quote(conn.info.user or '', safe='')}:{quote(conn.info.password or '', safe='')}"
    f"@{conn.info.host}:{conn.info.port}/{conn.info.dbname}"
)

# Vector store backed by the same PostgreSQL database, using the embedding model defined earlier.
vector_store = PGVector(connection=connection_string, embeddings=embeddings)

vector_db = Chroma(...)
docs = vector_db.get()
documents = docs["documents"]

vector_retriever = vector_db.as_retriever(...)
keyword_retriever = BM25Retriever.from_texts(documents)
ensemble_retriever = EnsembleRetriever(retrievers=[keyword_retriever, vector_retriever], ...)

In detail:

Import the libraries:

from langchain_chroma import Chroma # Langchain
import chromadb # Chroma

Instantiate the Chroma vector DB:

chroma_client = chromadb.HttpClient(host=CHROMA_SERVER_HOST, port=CHROMA_SERVER_PORT)
vector_db = Chroma(embedding_function=embedding_model, collection_name=COLLECTION_NAME, client=chroma_client)
docs = vector_db.get()
documents = docs["documents"]

RAG hybrid search (same code for Chroma or PostgreSQL):

vector_retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": VECTORDB_MAX_RESULTS}) # Semantic search
keyword_retriever = BM25Retriever.from_texts(documents) # Keyword search
ensemble_retriever = EnsembleRetriever(retrievers=[keyword_retriever, vector_retriever], weights=[0.5, 0.5]) # Combining the two searches

In [9]:
# rag.py

import boto3
from langchain_community.document_loaders import S3FileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [10]:
# rag.py

# S3 client for the bucket; endpoint and credentials come from environment
# variables, falling back to an empty string when a variable is unset.
session = boto3.session.Session()
client_s3 = session.client(
    "s3",
    aws_secret_access_key=os.getenv("SCW_SECRET_KEY", ""),
    aws_access_key_id=os.getenv("SCW_ACCESS_KEY", ""),
    endpoint_url=os.getenv("SCW_BUCKET_ENDPOINT", ""),
)

# Paginated listing of the bucket's objects, consumed by the ingestion loop.
paginator = client_s3.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket=os.getenv("SCW_BUCKET_NAME", ""))

In [11]:
def create_context_prompt(document_content, chunk_text):
    """
    Build the prompt asking the model to situate a chunk within its document.

    Both arguments are stripped of surrounding whitespace, then placed inside
    the <document> and <chunk_to_analyze> sections. Returns the prompt string.
    """
    document = document_content.strip()
    chunk = chunk_text.strip()

    # One entry per prompt line; joined with newlines below.
    prompt_lines = (
        "Here is the chunk we want to situate within the whole document ",
        "<document>",
        document,
        "</document>",
        "",
        "<chunk_to_analyze>",
        chunk,
        "</chunk_to_analyze>",
        "Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else. ",
        ".",
    )
    return "\n".join(prompt_lines)

In [20]:
# Display-only cell: echo the page iterator object created above.
page_iterator

<botocore.paginate.PageIterator at 0x122ad1a50>

In [23]:
# rag.py
from openai import OpenAI
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chat-completions client: fixed base URL, API key read from the environment
# (empty string when SCW_SECRET_KEY is unset).
client = OpenAI(
    api_key=os.getenv("SCW_SECRET_KEY", ""),
    base_url="https://api.scaleway.ai/v1",
)

# Splitter producing 1000-character chunks that overlap by 300 characters;
# length is measured with `len` and separators are treated as plain strings.
text_splitter = RecursiveCharacterTextSplitter(
    **{
        "chunk_size": 1000,
        "chunk_overlap": 300,
        "length_function": len,
        "is_separator_regex": False,
        "add_start_index": True,
    }
)

# Ingest bucket objects into the vector store.
# For each matching object not yet recorded in `object_loaded`: load it, split
# it into chunks, ask the chat model for a short context per chunk, embed
# "context + chunk", store the vectors with the chunk text, then record the key.

# Only the object with this key is processed by the loop below.
TARGET_OBJECT_KEY = "arrivee_scw/bienvenue.txt"

# The shared cursor may already be closed (a `with conn.cursor() as cur:` block
# closes it on exit), so reopen it once before the loop.
if cur.closed:
    cur = conn.cursor()

for page in page_iterator:
    for obj in page.get("Contents", []):
        if obj["Key"] != TARGET_OBJECT_KEY:
            continue

        cur.execute(
            "SELECT object_key FROM object_loaded WHERE object_key = %s",
            (obj["Key"],),
        )
        print(obj["Key"])
        response = cur.fetchone()
        if response is not None:
            # Already recorded as loaded; skip re-indexing.
            continue

        file_loader = S3FileLoader(
            bucket=os.getenv("SCW_BUCKET_NAME", ""),
            key=obj["Key"],
            endpoint_url=os.getenv("SCW_BUCKET_ENDPOINT", ""),
            aws_access_key_id=os.getenv("SCW_ACCESS_KEY", ""),
            aws_secret_access_key=os.getenv("SCW_SECRET_KEY", ""),
        )
        file_to_load = file_loader.load()
        doc = file_to_load[0].page_content
        chunks = text_splitter.split_text(doc)
        context_chunks = []

        # Prefix the object key so the prompt tells the model which page the text is from.
        doc = "Page title: " + obj["Key"] + "\n" + doc

        for chunk in chunks:
            completion = client.chat.completions.create(
                model="llama-3.1-8b-instruct",
                messages=[
                    {
                        "role": "user",
                        "content": create_context_prompt(
                            document_content=doc, chunk_text=chunk
                        ),
                    }
                ],
                temperature=0.1,
                max_tokens=100,
            )
            # The message content can be None; fall back to an empty context
            # instead of failing on `None + chunk`.
            context = completion.choices[0].message.content or ""
            print(context)
            # Keep a blank line between the generated context and the chunk so
            # the two texts do not run together in the embedded input.
            context_chunks.append(f"{context}\n\n{chunk}" if context else chunk)

        try:
            embeddings_list = [
                embeddings.embed_query(context_chunk)
                for context_chunk in context_chunks
            ]
            vector_store.add_embeddings(chunks, embeddings_list)
            cur.execute(
                "INSERT INTO object_loaded (object_key) VALUES (%s)",
                (obj["Key"],),
            )
            # Commit per object so a failure on a later object cannot discard
            # the "loaded" marker of one whose vectors are already stored.
            conn.commit()
        except Exception as e:
            # Roll back so the connection is not left in an aborted transaction,
            # which would make every following statement fail.
            conn.rollback()
            print(f"An error occurred: {e}")

# Close any transaction still open (e.g. from the SELECT on a skipped object).
conn.commit()

arrivee_scw/bienvenue.txt
This chunk is situated within the "Bienvenue" section of the document, specifically within the subsection titled "# Programme d'onboarding", which outlines the onboarding process for new employees at Scaleway.
This chunk appears to be a section of the onboarding process for new employees at Scaleway, specifically detailing the first day of orientation and providing an overview of the company's HR department and resources.
This chunk appears to be a section within the "Bienvenue" document, specifically under the "Qui sommes nous?" and "Notre architecture humaine" sections, providing an overview of Scaleway's internal structure, culture, and benefits for employees.
This chunk appears to be part of the "Qui sommes nous?" section, which is a subsection of the "Scaleway" introduction, likely located near the beginning of the document.
This chunk describes the organizational structure and human resources management system of Scaleway, including its departments, lead