Process the data and save it in a vector store

# Embedding and vector store

* Data source: SEC filing reports

* Azure OpenAI - embedding

* FAISS

* Azure AI Search (Azure Cognitive Search) - vector store and vector search, semantic search, or both

* LangChain framework - Azure OpenAI, Azure AI Search


## Configure OpenAI Settings

In [None]:
import os
import openai
from dotenv import load_dotenv
# Set up Azure OpenAI
# Pull settings from a local .env file (if any) into the process environment
# so the os.getenv() lookups below can see them.
load_dotenv()

openai.api_type = "azure"

# No validation here: this is None when the variable is not set.
AZURE_OPENAI_API_VERSION = os.getenv("AAG_AZURE_OPENAI_API_VERSION")
openai.api_version = AZURE_OPENAI_API_VERSION

# Default to "" so that a missing variable reaches the assert below and fails
# with its explicit message, instead of raising AttributeError on None.strip().
AZURE_OPENAI_API_KEY = os.getenv("AAG_AZURE_OPENAI_API_KEY", "").strip()
assert AZURE_OPENAI_API_KEY, "ERROR: Azure OpenAI Key is missing"
openai.api_key = AZURE_OPENAI_API_KEY

AZURE_OPENAI_ENDPOINT = os.getenv("AAG_AZURE_OPENAI_ENDPOINT", "").strip()
assert AZURE_OPENAI_ENDPOINT, "ERROR: Azure OpenAI Endpoint is missing"
openai.api_base = AZURE_OPENAI_ENDPOINT

# Deployment for Chat
# DEPLOYMENT_NAME_CHAT = os.getenv('DEPLOYMENT_NAME_CHAT')
DEPLOYMENT_NAME_CHAT = os.getenv('AAG_DEPLOYMENT_NAME_CHAT_16K')

# Deployment for embedding
DEPLOYMENT_NAME_EMBEDDING = os.getenv("AAG_DEPLOYMENT_NAME_EMBEDDING")
# The same deployment name is reused as the embedding model identifier.
model: str = DEPLOYMENT_NAME_EMBEDDING

# Azure AI Search (Cognitive vector store)
# NOTE: both values are None when the corresponding variable is not set.
vector_store_address: str = os.getenv("AAG_AZURE_SEARCH_SERVICE_ENDPOINT")
vector_store_password: str = os.getenv("AAG_AZURE_SEARCH_ADMIN_KEY")
# index_name: str = "langchain-vector-arxiv-physics"

# Bing subscription key
BING_SUBSCRIPTION_KEY = os.getenv("BING_SUBSCRIPTION_KEY")

## Load travel itinerary

* Parking, Flight, Car Rental, Hotel ...

#### Load PDF files from a directory

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader

# Loader pointed at the ./data_source/ directory, which holds the PDF files.
loader = PyPDFDirectoryLoader("./data_source/")

# Read the PDFs into LangChain documents
# (presumably one document per PDF page - confirm against the loader's docs).
loaded_documents = loader.load()

In [None]:
# Display the loaded documents to check what was read from the PDFs.
loaded_documents

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split the documents into chunks.
# chunk_size=2000 with no overlap between consecutive chunks; the size is
# presumably measured in characters - confirm against CharacterTextSplitter's docs.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
splitted_docs = text_splitter.split_documents(loaded_documents)


In [None]:
# Display the resulting chunks to check how the documents were split.
splitted_docs

### Create embeddings and vector store instances

#### FAISS vector store

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Get Azure OpenAI embedding
# The embedding deployment name is passed as both `deployment` and `model`;
# endpoint and key are the values read from the environment earlier in the notebook.
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    deployment=model,
    model=model,
    chunk_size=1,   # despite its name, this is not a text length: it refers to the input text strings sent for embedding, not to words or characters (confirm exact semantics in the LangChain docs)
    openai_api_base=AZURE_OPENAI_ENDPOINT,
    openai_api_type="azure",
    api_key=AZURE_OPENAI_API_KEY,
)

# Create the vector index: embed every chunk and store the vectors in FAISS
db = FAISS.from_documents(splitted_docs, embeddings)
# Query the index with a sample question about the itinerary
query = "What does the travel starts? Which terminal is it?"
docs = db.similarity_search(query)
# Print the text of the first returned document
print(docs[0].page_content)

In [None]:
# Persist the travel-itinerary FAISS index locally as "faiss_index_travel"
# so it can be reused without re-embedding the documents.
db.save_local("faiss_index_travel")