## This file creates the embeddings for the data source files

In [9]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import json
import initialize_openai_env as initialize

from langchain.text_splitter import CharacterTextSplitter

#### initialize the environment variables and OpenAI

In [4]:
# Run the project's setup routine before anything else; the code below reads
# its configuration (file paths, deployment names, endpoint) from attributes
# of this same module.
initialize.initialize()

In [None]:
#### define the function to fetch the first entry from the JSON
###### for the purpose of this simple example the JSON file contains a single entry for
###### a PDF file

In [5]:
def get_first_pdf_file():
    """Return the full path of the first file listed in the data-sources JSON.

    The JSON file named by ``initialize.DATA_SOURCE_JSON_FILE`` is read and
    its ``filenames`` list is inspected. The ``location`` and ``filename``
    values of the first entry are concatenated as-is (no path separator is
    inserted), so ``location`` is assumed to already end with one.

    Returns:
        str: ``location + filename`` of the first entry, or ``''`` when the
        JSON file is missing, cannot be parsed, or lists no entries. A
        message describing the problem is printed in each of those cases.
    """
    data_sources_json_file_path = initialize.DATA_SOURCE_JSON_FILE

    # Keep the try body limited to the statements that can raise the
    # exceptions handled below.
    try:
        with open(data_sources_json_file_path, 'r') as json_file:
            data = json.load(json_file)
    except FileNotFoundError:
        # The message must use the variable that actually holds the path;
        # referencing an undefined name here would raise NameError instead.
        print(f"The file '{data_sources_json_file_path}' was not found.")
        return ''
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return ''

    file_entries = data.get('filenames', [])
    if not file_entries:
        # Without this guard, indexing the first entry raises IndexError.
        print(f"No file entries found in '{data_sources_json_file_path}'.")
        return ''

    first_entry = file_entries[0]
    filename = first_entry.get('filename', '')
    file_location = first_entry.get('location', '')
    return file_location + filename

In [None]:
#### create the FAISS vector store for the PDF document

In [11]:
if __name__ == "__main__":
    # Build the embedding client from the settings exposed by the
    # `initialize` module.
    # NOTE(review): chunk_size=1 presumably limits each embedding request to
    # a single text, as some Azure OpenAI deployments require -- confirm.
    embeddings = OpenAIEmbeddings(
        deployment=initialize.OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
        model=initialize.OPENAI_ADA_EMBEDDING_MODEL_NAME,
        openai_api_base=initialize.OPENAI_DEPLOYMENT_ENDPOINT,
        openai_api_type=initialize.OPENAI_API_TYPE,
        chunk_size=1,
    )

    file_name = get_first_pdf_file()

    # get_first_pdf_file() can return an empty string when no path could be
    # resolved; stop here with a clear message instead of handing an empty
    # path to the PDF loader.
    if not file_name:
        raise ValueError(
            "No PDF file path could be resolved from the data sources JSON file."
        )

    # Use the LangChain PDF loader.
    loader = PyPDFLoader(file_name)

    # Split the document into chunks: first with the loader's own
    # load_and_split(), then again on newlines into pieces of at most 1000
    # characters with a 200-character overlap.
    pages = loader.load_and_split()
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(pages)

    # Use LangChain to create the embeddings and index them in FAISS.
    db = FAISS.from_documents(documents=docs, embedding=embeddings)

    # Persist the FAISS vector store to the configured location.
    db.save_local(initialize.VECTOR_DB_STORE_LOCATION)