In [None]:
from urllib.parse import quote
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
import os
import psycopg
import uuid

# Run python-dotenv's loader before any of the os.getenv() lookups in this notebook.
load_dotenv()

#### Environment Variables

In [82]:
# Database connection settings, read from the environment.
_REQUIRED_DB_VARS = ("USER_DB", "PASSWORD_DB", "HOST_DB", "DATABASE_DB", "PORT_DB")
_missing_db_vars = [name for name in _REQUIRED_DB_VARS if os.getenv(name) is None]
if _missing_db_vars:
    # Fail here with an explicit message instead of an opaque TypeError from
    # quote(None) or a connection URL that contains the literal text "None".
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_db_vars)
    )

username = os.getenv("USER_DB")
# Percent-encode the password so it can be embedded in a connection URL.
# safe="" also encodes "/", which quote() leaves untouched by default and which
# would otherwise be read as a URL delimiter.
password = quote(os.getenv("PASSWORD_DB"), safe="")
host = os.getenv("HOST_DB")
database = os.getenv("DATABASE_DB")
port = os.getenv("PORT_DB")

### Loading the data

In [83]:
# OpenAI API client built with no explicit arguments.
# NOTE(review): presumably it picks up its API key from the environment — confirm.
client = OpenAI()

# Schema of the structured response requested from the model; it is passed as
# `response_format` to the chat-completions parse call.
class ExtractedData(BaseModel):
    # Question prefixes found in the submitted text, one string per prefix.
    questions_prefix: list[str]

async def extract_question_prefixes(pages):
    """Ask the OpenAI model for the question prefixes found in a document.

    Args:
        pages: Iterable of objects exposing a ``page_content`` string. Their
            contents are joined with newlines and sent as a single prompt.

    Returns:
        list[str]: The extracted prefixes, stripped of surrounding whitespace
        and with empty entries removed. An empty list is returned when the
        response carries no parsed payload (e.g. the model refused).

    Note:
        The client call below is synchronous, so awaiting this coroutine still
        blocks the event loop for the duration of the request.
    """
    page_content = "\n".join(page.page_content for page in pages)

    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Extract question prefixes."},
            {"role": "user", "content": f"Here is the text:\n\n{page_content}"},
        ],
        response_format=ExtractedData,
    )

    extracted_data = completion.choices[0].message.parsed
    if extracted_data is None:
        # No structured payload to read from; report "no prefixes" rather than
        # failing with an AttributeError on None.
        return []

    # Drop blank prefixes: an empty string is a substring of every text, so it
    # would later be attached to every chunk.
    stripped_prefixes = (prefix.strip() for prefix in extracted_data.questions_prefix)
    return [prefix for prefix in stripped_prefixes if prefix]

In [None]:
# Name of the folder that holds the PDF corpus.
root_dir = "data"

def _metadata_from_path(file_path, root_dir):
    """Derive document metadata from a PDF's location below ``root_dir``.

    The path relative to ``root_dir`` must have at least six components, read
    as ``<contest>/<year>/<levels>/<subjects>/<type>/<exam>.pdf``.

    Raises:
        ValueError: If the relative path has fewer than six components.
    """
    # Work relative to root_dir and split on the platform separator, so the
    # result does not depend on Windows-style paths or on the depth of root_dir.
    parts = os.path.relpath(file_path, root_dir).split(os.sep)
    if len(parts) < 6:
        raise ValueError(
            f"Unexpected folder layout for {file_path!r}: expected "
            "<contest>/<year>/<levels>/<subjects>/<type>/<exam>.pdf under the root directory"
        )

    contest, year, levels, subject, doc_type, exam = parts[:6]
    return {
        "contest": contest,
        "year": year,
        "levels": levels,
        # Only these two subject folders are recorded; any other name is blanked.
        "subjects": subject if subject in ("Informatique", "Mathématiques") else "",
        "type": doc_type,
        # removesuffix drops exactly ".pdf"; rstrip(".pdf") would strip any
        # trailing run of the characters ".", "p", "d", "f" ("map.pdf" -> "ma").
        "exam": exam.removesuffix(".pdf"),
    }


async def process_pdfs(root_dir):
    """Load every ``.pdf`` under ``root_dir`` and annotate its pages.

    For each PDF the documents returned by ``PyPDFLoader.load_and_split()``
    are collected, the question prefixes are extracted through
    ``extract_question_prefixes``, and path-derived metadata (contest, year,
    levels, subjects, type, exam) is written into each page's ``metadata``.

    Args:
        root_dir: Directory that is walked recursively for PDF files.

    Returns:
        tuple: ``(all_pages, prefix_by_docs)`` where ``all_pages`` is the flat
        list of loaded pages and ``prefix_by_docs`` maps each PDF path to the
        list of prefixes extracted for it.

    Raises:
        ValueError: If a PDF is not nested deeply enough below ``root_dir``.
    """
    all_pages = []
    prefix_by_docs = {}

    for dirpath, _, filenames in os.walk(root_dir):
        for file in filenames:
            if not file.endswith(".pdf"):
                continue

            file_path = os.path.join(dirpath, file)
            print(f"Loading: {file_path}")

            # Validate the folder layout first so a misplaced file fails
            # before any model request is made for it.
            path_metadata = _metadata_from_path(file_path, root_dir)

            pages = PyPDFLoader(file_path).load_and_split()

            questions_prefix = await extract_question_prefixes(pages)
            prefix_by_docs[file_path] = questions_prefix
            print(file_path, questions_prefix)

            for page in pages:
                page.metadata.update(path_metadata)
                print(file_path, page.metadata)

            all_pages.extend(pages)

    return all_pages, prefix_by_docs
            

all_pages, prefix_by_docs = await process_pdfs("data")

In [None]:
# Uncomment any of the following lines to inspect the first loaded page:
# print(all_pages[0])
# all_pages[0].page_content
# all_pages[0].metadata

# Show the question prefixes recorded for each PDF path.
print(prefix_by_docs)

### Create the vector db

Llama 3.1 embeddings have 4096 dimensions.

In [None]:
# Connectivity check: try to open a PostgreSQL connection and report the outcome.
try:
    connection = f'postgresql://{username}:{password}@{host}:{port}/{database}'
    with psycopg.connect(conninfo=connection) as conn:
        print("Connection successful!")
except Exception as err:
    # Operational errors are labelled as such; anything else is reported generically.
    if isinstance(err, psycopg.OperationalError):
        print(f"OperationalError: {err}")
    else:
        print(f"Connection failed: {err}")

In [87]:
# SQLAlchemy-style URL for the vector store. The "+psycopg" suffix selects the
# psycopg 3 driver (the one imported by this notebook); a bare "postgresql://"
# URL makes SQLAlchemy fall back to psycopg2, which may not be installed.
connection = f'postgresql+psycopg://{username}:{password}@{host}:{port}/{database}'
collection_name = "rapports_jurys"
embeddings = OllamaEmbeddings(model="llama3.1")

# Vector store for the "rapports_jurys" collection; use_jsonb=True is passed so
# metadata can be filtered with JSON operators.
# NOTE(review): confirm JSONB storage against the langchain-postgres docs.
vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

### Data preprocessing

Llama 3.1 is used here with an input limit of 4096 tokens, so the pages are split into smaller chunks before being embedded. Llama uses byte-pair encoding (BPE) tokenization.

In [None]:
# Split the loaded pages into overlapping, character-based chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=300,
    separators=["\n\n", "\n", " ", ""],
)
chunks = text_splitter.split_documents(all_pages)

# Tag every chunk with the prefixes of its source document that literally
# occur in the chunk text, joined into a single " | "-separated string.
for chunk in chunks:
    source_prefixes = prefix_by_docs[chunk.metadata["source"]]
    present_prefixes = [prefix for prefix in source_prefixes if prefix in chunk.page_content]
    chunk.metadata["questions_prefix"] = " | ".join(present_prefixes)

print(len(chunks))

### Inserting data

In [None]:
# Store the chunks one at a time, printing each index as a progress marker.
for index, document in enumerate(chunks):
    print(index)
    # Id = source path plus a fresh random UUID; it is also kept in the metadata.
    document_id = f"{document.metadata['source']}_{uuid.uuid4()}"
    document.metadata["id"] = document_id
    vector_store.add_documents([document], ids=[document_id])

In [90]:
# Query to check in psql

# SELECT *
# FROM public.langchain_pg_embedding
# WHERE cmetadata->>'exam' = 'MATHEMATIQUES A'
#   AND cmetadata->>'type' = 'écrit'
#   AND cmetadata->>'year' = '2022'
#   AND cmetadata->>'levels' = 'MP'
#   AND cmetadata->>'contest' = 'X-ENS'
#   AND cmetadata->>'subjects' = 'Mathématiques'
#   -- questions_prefix is stored as a single " | "-joined string, so match it as text:
#   AND cmetadata->>'questions_prefix' LIKE '%12%';