In [None]:
!pip3 install langchain_postgres langchain psycopg2 python-dotenv langchainhub boto3 unstructured transformers


In [29]:
# rag.py file

import os
from urllib.parse import quote_plus

import psycopg2
from dotenv import load_dotenv

# Read the .env file so the SCW_* settings below can be looked up with os.getenv
load_dotenv()

# Open the PostgreSQL connection; every psycopg2 keyword is filled from its
# matching environment variable
conn = psycopg2.connect(
    **{
        keyword: os.getenv(env_var)
        for keyword, env_var in (
            ("database", "SCW_DB_NAME"),
            ("user", "SCW_DB_USER"),
            ("password", "SCW_DB_PASSWORD"),
            ("host", "SCW_DB_HOST"),
            ("port", "SCW_DB_PORT"),
        )
    }
)

# Cursor used to run SQL statements on that connection
cur = conn.cursor()

In [30]:
# rag.py

from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector

In [31]:
# rag.py

# Embedding client: endpoint and key come from the SCW_* environment variables,
# and tiktoken-based tokenization is switched off.
embeddings = OpenAIEmbeddings(
    model="sentence-transformers/sentence-t5-xxl",
    openai_api_base=os.environ.get("SCW_GENERATIVE_APIs_ENDPOINT"),
    openai_api_key=os.environ.get("SCW_SECRET_KEY"),
    tiktoken_enabled=False,
)

In [32]:
# rag.py

# Build the SQLAlchemy URL from the open psycopg2 connection. The user name and
# password are percent-encoded so that characters such as "@", ":" or "/" in
# the credentials cannot be misread as URL delimiters.
connection_string = (
    f"postgresql+psycopg2://{quote_plus(conn.info.user)}:{quote_plus(conn.info.password)}"
    f"@{conn.info.host}:{conn.info.port}/{conn.info.dbname}"
)

# Vector store that uses the embedding client defined above
vector_store = PGVector(connection=connection_string, embeddings=embeddings)

In [33]:
#rag.py

import boto3
from langchain_community.document_loaders import S3FileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [34]:
# rag.py

# S3 client pointed at the bucket endpoint, authenticated with the SCW_* keys
session = boto3.session.Session()
client_s3 = session.client(
    's3',
    aws_secret_access_key=os.getenv("SCW_SECRET_KEY", ""),
    aws_access_key_id=os.getenv("SCW_ACCESS_KEY", ""),
    endpoint_url=os.getenv("SCW_BUCKET_ENDPOINT", ""),
)

# Page-by-page listing of the objects in the bucket named by SCW_BUCKET_NAME
paginator = client_s3.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(
    Bucket=os.getenv("SCW_BUCKET_NAME", ""),
)

In [35]:
# rag.py

# Ingest every bucket object that is not yet recorded in `object_loaded`:
# load it, split it into chunks, embed the chunks, store them in the vector
# store, then record the object key.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, add_start_index=True, length_function=len, is_separator_regex=False)
for page in page_iterator:
    for obj in page.get('Contents', []):
        print(obj['Key'])
        cur.execute("SELECT object_key FROM object_loaded WHERE object_key = %s", (obj['Key'],))
        response = cur.fetchone()
        if response is not None:
            # Already ingested on a previous run
            continue

        file_loader = S3FileLoader(
                bucket=os.getenv("SCW_BUCKET_NAME", ""),
                key=obj['Key'],
                endpoint_url=os.getenv("SCW_BUCKET_ENDPOINT", ""),
                aws_access_key_id=os.getenv("SCW_ACCESS_KEY", ""),
                aws_secret_access_key=os.getenv("SCW_SECRET_KEY", "")
            )
        file_to_load = file_loader.load()
        chunks = text_splitter.split_text(file_to_load[0].page_content)
        try:
            embeddings_list = [embeddings.embed_query(chunk) for chunk in chunks]
            vector_store.add_embeddings(chunks, embeddings_list)
            # Record the key only after the embeddings were stored, and only
            # once: inserting before embedding would mark a failed object as
            # loaded and would insert successful objects twice.
            cur.execute("INSERT INTO object_loaded (object_key) VALUES (%s)", (obj['Key'],))
            # Commit per object so the bookkeeping is persisted even if a later
            # object interrupts the run.
            conn.commit()
        except Exception as e:
            # Reset the transaction so that a failed statement does not make
            # every following query on this connection fail.
            conn.rollback()
            print(f"An error occurred: {e}")

# Close the transaction left open by the last SELECT
conn.commit()

bienvenue.txt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


test.txt


In [36]:
#rag.py

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI


In [43]:
#rag.py

# Chat model; endpoint and key are read from the SCW_* environment variables
llm = ChatOpenAI(
    model="llama-3.1-8b-instruct",
    api_key=os.environ.get("SCW_SECRET_KEY"),
    base_url=os.environ.get("SCW_GENERATIVE_APIs_ENDPOINT"),
)

# Prompt pulled from the hub, and a retriever backed by the vector store
prompt = hub.pull("rlm/rag-prompt")
retriever = vector_store.as_retriever()

# Chain: the retriever fills "context", the input is passed through unchanged as
# "question", then prompt -> model -> string output parser
rag_chain = {
    "context": retriever,
    "question": RunnablePassthrough(),
} | prompt | llm | StrOutputParser()

# Print the answer piece by piece as it is streamed
for piece in rag_chain.stream("Qui es tu ?"):
    print(piece, end="", flush=True)



Je ne vois pas clairement dans le texte qui tu es. Cependant, il y a un document qui mentionne "Qui sommes nous?" mais sans donner plus d'informations sur la question.

In [None]:

# Re-create the vector store and retriever from the open psycopg2 connection.
# The user name and password are percent-encoded so that characters such as
# "@", ":" or "/" in the credentials cannot be misread as URL delimiters.
connection_string = (
    f"postgresql+psycopg2://{quote_plus(conn.info.user)}:{quote_plus(conn.info.password)}"
    f"@{conn.info.host}:{conn.info.port}/{conn.info.dbname}"
)
vector_store = PGVector(connection=connection_string, embeddings=embeddings)
retriever = vector_store.as_retriever()
