In [None]:
!pip3 install langchain_postgres langchain psycopg2 python-dotenv langchainhub boto3 unstructured transformers


In [58]:
# rag.py file

from dotenv import load_dotenv
import psycopg2
import os

# Load environment variables
load_dotenv()

# Establish connection to PostgreSQL database using environment variables
conn = psycopg2.connect(
       database=os.getenv("SCW_DB_NAME"),
       user=os.getenv("SCW_DB_USER"),
       password=os.getenv("SCW_DB_PASSWORD"),
       host=os.getenv("SCW_DB_HOST"),
       port=os.getenv("SCW_DB_PORT")
   )

# Create a cursor to execute SQL commands
cur = conn.cursor()

In [59]:
# rag.py

from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector

In [60]:
# rag.py

embeddings = OpenAIEmbeddings(
                openai_api_key=os.getenv("SCW_SECRET_KEY"),
                openai_api_base=os.getenv("SCW_GENERATIVE_APIs_ENDPOINT"),
                model="sentence-transformers/sentence-t5-xxl",
                tiktoken_enabled=False
            )

In [61]:
# rag.py

connection_string = f"postgresql+psycopg2://{conn.info.user}:{conn.info.password}@{conn.info.host}:{conn.info.port}/{conn.info.dbname}"
vector_store = PGVector(connection=connection_string, embeddings=embeddings)

In [62]:
#rag.py

import boto3
from langchain_community.document_loaders import S3FileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [63]:
# rag.py

session = boto3.session.Session()
client_s3 = session.client(service_name='s3', endpoint_url=os.getenv("SCW_BUCKET_ENDPOINT", ""),
                               aws_access_key_id=os.getenv("SCW_ACCESS_KEY", ""),
                               aws_secret_access_key=os.getenv("SCW_SECRET_KEY", ""))
paginator = client_s3.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=os.getenv("SCW_BUCKET_NAME", ""))

In [64]:
# rag.py

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, add_start_index=True, length_function=len, is_separator_regex=False)
for page in page_iterator:
    for obj in page.get('Contents', []):
        print (obj['Key'])
        cur.execute("SELECT object_key FROM object_loaded WHERE object_key = %s", (obj['Key'],))
        response = cur.fetchone()
        if response is None:

            file_loader = S3FileLoader(
                    bucket=os.getenv("SCW_BUCKET_NAME", ""),
                    key=obj['Key'],
                    endpoint_url=os.getenv("SCW_BUCKET_ENDPOINT", ""),
                    aws_access_key_id=os.getenv("SCW_ACCESS_KEY", ""),
                    aws_secret_access_key=os.getenv("SCW_SECRET_KEY", "")
                )
            file_to_load = file_loader.load()
            cur.execute("INSERT INTO object_loaded (object_key) VALUES (%s)", (obj['Key'],))
            chunks = text_splitter.split_text(file_to_load[0].page_content)
            try:
                embeddings_list = [embeddings.embed_query(chunk) for chunk in chunks]
                vector_store.add_embeddings(chunks, embeddings_list)
                cur.execute("INSERT INTO object_loaded (object_key) VALUES (%s)", (obj['Key'],))
            except Exception as e:
                print(f"An error occurred: {e}")

conn.commit()

arrivee_scw/bienvenue.txt
arrivee_scw/buddies.txt
arrivee_scw/reglements.txt
carriere/entretiens.txt
carriere/formation.txt
carriere/mobilite.txt
carriere/relation_manager.txt
carriere/talent_review.txt
dialogue_social/accord_entreprise.txt
An error occurred: (psycopg2.errors.NotNullViolation) null value in column "id" of relation "langchain_pg_embedding" violates not-null constraint
DETAIL:  Failing row contains (null, null, null, null, null).

[SQL: INSERT INTO langchain_pg_embedding DEFAULT VALUES ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata]
(Background on this error at: https://sqlalche.me/e/20/gkpj)
dialogue_social/cse.txt
dialogue_social/doc_legal.txt
engagements/comm_interne.txt
engagements/marque_employeur.txt
engagements/rse.txt
quotidien/avantage.txt
quotidien/conges.txt
quotidien/contact.txt
quotidien/frais.txt
quotidien/maison.txt
quotidien/oranisation_travail.txt
quotidien/outils_hr.txt
quotidi

In [52]:
#rag.py

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI


In [54]:
#rag.py

llm = ChatOpenAI(
        base_url=os.getenv("SCW_GENERATIVE_APIs_ENDPOINT"),
        api_key=os.getenv("SCW_SECRET_KEY"),
        model="llama-3.1-8b-instruct",
        )

prompt = hub.pull("rlm/rag-prompt")
retriever = vector_store.as_retriever()


rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

for r in rag_chain.stream("Quelle est la politique RSE de scaleway?"):
    print(r, end="", flush=True)



Je n'ai pas trouvé d'information spécifique sur la politique RSE (Responsabilité Sociale des Entreprises) de Scaleway dans le contexte fourni.

arrivee_scw/bienvenue.txt
 
arrivee_scw/buddies.txt

arrivee_scw/reglements.txt

bienvenue.txt

carriere/entretiens.txt

carriere/formation.txt

carriere/mobilite.txt

carriere/relation_manager.txt

carriere/talent_review.txt

dialogue_social/accord_entreprise.txt

An error occurred: (psycopg2.errors.NotNullViolation) null value in column "id" of relation "langchain_pg_embedding" violates not-null constraint
DETAIL:  Failing row contains (null, null, null, null, null).

[SQL: INSERT INTO langchain_pg_embedding DEFAULT VALUES ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

dialogue_social/cse.txt

dialogue_social/doc_legal.txt

engagements/comm_interne.txt

engagements/marque_employeur.txt

engagements/rse.txt

quotidien/avantage.txt

In [1]:
import os
import boto3

# Initialize the S3 client
session = boto3.session.Session()
client_s3 = session.client(
    service_name='s3',
    endpoint_url=os.getenv("SCW_BUCKET_ENDPOINT", ""),
    aws_access_key_id=os.getenv("SCW_ACCESS_KEY", ""),
    aws_secret_access_key=os.getenv("SCW_SECRET_KEY", "")
)

# Define the local folder and S3 bucket name
local_folder = 'data_context'

# Walk through all files in the folder and upload them to S3
for root, dirs, files in os.walk(local_folder):
    for file in files:
        file_path = os.path.join(root, file)
        s3_key = os.path.relpath(file_path, local_folder)  # S3 object key
        
        try:
            client_s3.upload_file(file_path, os.getenv("SCW_BUCKET_NAME", ""), s3_key)
            print(f'Successfully uploaded {file_path} to {os.getenv("SCW_BUCKET_NAME", "")}/{s3_key}')
        except Exception as e:
            print(f'Failed to upload {file_path}. Error: {str(e)}')


Successfully uploaded data_context/test.txt to rag-poc-1/test.txt
Successfully uploaded data_context/dialogue_social/accord_entreprise.txt to rag-poc-1/dialogue_social/accord_entreprise.txt
Successfully uploaded data_context/dialogue_social/cse.txt to rag-poc-1/dialogue_social/cse.txt
Successfully uploaded data_context/dialogue_social/doc_legal.txt to rag-poc-1/dialogue_social/doc_legal.txt
Successfully uploaded data_context/engagements/comm_interne.txt to rag-poc-1/engagements/comm_interne.txt
Successfully uploaded data_context/engagements/rse.txt to rag-poc-1/engagements/rse.txt
Successfully uploaded data_context/engagements/marque_employeur.txt to rag-poc-1/engagements/marque_employeur.txt
Successfully uploaded data_context/carriere/entretiens.txt to rag-poc-1/carriere/entretiens.txt
Successfully uploaded data_context/carriere/mobilite.txt to rag-poc-1/carriere/mobilite.txt
Successfully uploaded data_context/carriere/formation.txt to rag-poc-1/carriere/formation.txt
Successfully upl