In [2]:
# rag.py file

from dotenv import load_dotenv
import psycopg2
import os

# Load environment variables
load_dotenv()

# Establish connection to PostgreSQL database using environment variables
conn = psycopg2.connect(
    database=os.getenv("PGDATABASE"),
    user=os.getenv("PGUSER"),
    password=os.getenv("PGPASSWORD"),
    host=os.getenv("PGHOST"),
    port=os.getenv("PGPORT"),
)


# Create a cursor to execute SQL commands
cur = conn.cursor()

In [3]:
try:
    with conn.cursor() as cur:
        cur.execute("SELECT 1")
    print({"status": "healthy", "database": "connected"})
except Exception as e:
    print({"status": "unhealthy", "database": str(e)})


{'status': 'healthy', 'database': 'connected'}


In [4]:
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector

embeddings = OpenAIEmbeddings(
    openai_api_key=os.getenv("SCW_SECRET_KEY"),
    openai_api_base=os.getenv("SCW_GENERATIVE_APIs_ENDPOINT"),
    model="sentence-transformers/sentence-t5-xxl",
    tiktoken_enabled=False,
)

In [5]:
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector

In [6]:
connection_string = f"postgresql+psycopg2://{conn.info.user}:{conn.info.password}@{conn.info.host}:{conn.info.port}/{conn.info.dbname}"
vector_store = PGVector(connection=connection_string, embeddings=embeddings)

In [7]:
# rag.py

import boto3
from langchain_community.document_loaders import S3FileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [8]:
# rag.py

session = boto3.session.Session()
client_s3 = session.client(
    service_name="s3",
    endpoint_url=os.getenv("SCW_BUCKET_ENDPOINT", ""),
    aws_access_key_id=os.getenv("SCW_ACCESS_KEY", ""),
    aws_secret_access_key=os.getenv("SCW_SECRET_KEY", ""),
)
paginator = client_s3.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket=os.getenv("SCW_BUCKET_NAME", ""))

In [9]:
def create_context_prompt(document_content, chunk_text):
    """
    Creates a well-structured prompt for context generation.
    """

    prompt_template = """Here is the chunk we want to situate within the whole document 
<document>
{document}
</document>

<chunk_to_analyze>
{chunk}
</chunk_to_analyze>
Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else. 
."""

    return prompt_template.format(
        document=document_content.strip(), chunk=chunk_text.strip()
    )

In [10]:
page_iterator

<botocore.paginate.PageIterator at 0x15735c6d0>

In [48]:
# rag.py
from openai import OpenAI
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from datetime import datetime
import json

# Initialize the client with your base URL and API key
client = OpenAI(
    base_url="https://api.scaleway.ai/v1", api_key=os.getenv("SCW_SECRET_KEY", "")
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=300,
    add_start_index=True,
    length_function=len,
    is_separator_regex=False,
)

for page in page_iterator:
    for obj in page.get("Contents", []):
        if obj["Key"] == "arrivee_scw/bienvenue.txt":
            # Check if cursor is closed and reopen if necessary
            if cur.closed:
                cur = conn.cursor()

            cur.execute(
                "SELECT object_key FROM object_loaded WHERE object_key = %s",
                (obj["Key"],),
            )
            print(obj["Key"])
            response = cur.fetchone()
            if response is None:
                file_loader = S3FileLoader(
                    bucket=os.getenv("SCW_BUCKET_NAME", ""),
                    key=obj["Key"],
                    endpoint_url=os.getenv("SCW_BUCKET_ENDPOINT", ""),
                    aws_access_key_id=os.getenv("SCW_ACCESS_KEY", ""),
                    aws_secret_access_key=os.getenv("SCW_SECRET_KEY", ""),
                )
                file_to_load = file_loader.load()
                doc = file_to_load[0].page_content
                chunks = text_splitter.split_text(doc)
                context_chunks = []

                doc = "Page title: " + obj["Key"] + "\n" + doc

                for chunk in chunks:
                    """ completion = client.chat.completions.create(
                        model="llama-3.1-8b-instruct",
                        messages=[
                            {
                                "role": "user",
                                "content": create_context_prompt(
                                    document_content=doc, chunk_text=chunk
                                ),
                            }
                        ],
                        temperature=0.1,
                        max_tokens=100,
                    ) 
                    print(completion.choices[0].message.content)
                    context_chunks.append(completion.choices[0].message.content + chunk)"""
                    context_chunks.append(chunk)

                try:
                    metadata_list = [
                        {
                            "chunk_id": idx,
                            "source": obj["Key"],
                            "timestamp": datetime.now().isoformat(),
                            "chunk_size": len(chunk),
                            "url": "https://confluence.infra.online.net/pages/viewpage.action?pageId=232395837",
                            # Add any other metadata fields you need
                            "position": idx * len(chunk)
                        }
                        for idx, chunk in enumerate(context_chunks)
                    ]
                    embeddings_list = [
                        embeddings.embed_query(chunk) for chunk in context_chunks
                    ]
                    vector_store.add_embeddings(chunks,embeddings=embeddings_list, metadatas=metadata_list)
                    cur.execute(
                        "INSERT INTO object_loaded (object_key, metadata) VALUES (%s, %s)",
                        (obj["Key"],json.dumps(metadata_list)),
                    )
                except Exception as e:
                    print(f"An error occurred: {e}")

conn.commit()

arrivee_scw/bienvenue.txt


In [12]:
from typing import List
from typing_extensions import Annotated, TypedDict

class AnswerWithSources(TypedDict):
    """An answer to the question, with sources."""

    answer: str
    sources: Annotated[
        List[str],
        ...,
        "List of sources (author + year) used to answer the question",
    ]



In [134]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_openai import OpenAIEmbeddings
#rag.py

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_openai import ChatOpenAI
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


doc_list_1 = [
    "I like apples",
    "I like oranges",
    "Apples and oranges are fruits",
    """🎨 Branding resources 
Ultraviolet provides with all the resources, guidelines, elements, and principles that are needed to create assets and communicate around our brand. It ensures consistency in how we present ourselves to the world and supports effective communication across our teams!


  [Ultraviolet](https://ultraviolet.scaleway.com/6dd9b5c45/p/425c81-overview)

Nos valeurs 
Singularité

Nous sommes tous des êtres singuliers, nos équipes le sont aussi. Cette richesse qui fait notre force alimente notre aventure et façonne notre histoire."""
]

# initialize the bm25 retriever and faiss retriever
bm25_retriever = BM25Retriever.from_texts(
    doc_list_1, metadatas=[{"source": 1}] * len(doc_list_1)
)
bm25_retriever.k = 2


#rag.py

llm = ChatOpenAI(
        base_url=os.getenv("SCW_GENERATIVE_APIs_ENDPOINT"),
        api_key=os.getenv("SCW_SECRET_KEY"),
        model="llama-3.1-8b-instruct",
        )

prompt = hub.pull("rlm/rag-prompt")


@chain
def retriever(query: str) -> List[Document]:
    docs, scores = zip(*vector_store.similarity_search_with_score(query))
    for doc, score in zip(docs, scores):
        doc.metadata["score"] = score

    return docs
# retriever = vector_store.as_retriever()

# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever], weights=[0.5, 0.5]
)
ensemble_retriever = retriever
rag_chain = (
        {"context": ensemble_retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

for r in rag_chain.stream("Quelle est le lien de Ultraviolet ?"):
    print(r, end="", flush=True)




Je ne sais pas quel lien est associé à "Ultraviolet".

In [135]:
import time

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)


rag_chain_with_source = RunnableParallel(
    {"context": ensemble_retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)



In [139]:
rag_chain_with_source.invoke("Quelle est le programme du premier jour ?")["answer"]

"Le programme du premier jour est le suivant : Petit-déjeuner à 9h00, présentation par l'équipe IT Help Desk à 10h00, pause à 10h55, présentation de Scaleway et DRH à 11h00, tour des locaux et déjeuner avec ton manager à 12h00, et enfin présentation par l'équipe Product Management, Design et Product Documentation à 14h00."

In [137]:

for chunk in rag_chain_with_source.stream("Quelle est le programme du premier jour ?"):
    if "answer" in chunk.keys() : 
        print (chunk["answer"], end="")

    if "context" in chunk.keys() : 
        for e in chunk["context"] :
            print ("metadata : ", e.metadata)

    time.sleep(0.02)

metadata :  {'url': 'https://confluence.infra.online.net/pages/viewpage.action?pageId=232395837', 'source': 'arrivee_scw/bienvenue.txt', 'chunk_id': 1, 'position': 996, 'timestamp': '2024-10-24T11:06:04.504623', 'chunk_size': 996, 'score': 0.16478410854292203}
metadata :  {'url': 'https://confluence.infra.online.net/pages/viewpage.action?pageId=232395837', 'source': 'arrivee_scw/bienvenue.txt', 'chunk_id': 1, 'position': 996, 'timestamp': '2024-10-24T11:01:39.043720', 'chunk_size': 996, 'score': 0.16478410854292203}
metadata :  {'url': 'https://confluence.infra.online.net/pages/viewpage.action?pageId=232395837', 'source': 'arrivee_scw/bienvenue.txt', 'chunk_id': 1, 'position': 996, 'timestamp': '2024-10-24T11:00:13.865363', 'chunk_size': 996, 'score': 0.16478410854292203}
metadata :  {'score': 0.18299622434355167}
Le programme du premier jour, détaillé dans la section "Ton premier jour", comprend notamment un petit-déjeuner à 9h00, une présentation par l'équipe IT Help Desk à 10h00, un

# Metadata fetching 