Process data and save it in a vector store

# Embedding and vector store

* Data source: SEC filing reports

* Azure OpenAI - embedding

* FAISS

* Azure AI Search (Azure Cognitive Search) - vector store and vector search, semantic search, or both

* LangChain framework - Azure OpenAI, Azure AI Search


## Import Langchain libraries and environment variables

In [2]:
# Import required libraries  
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from azure.search.documents.indexes.models import (
    SemanticSettings,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField
)

## Configure OpenAI Settings

In [3]:
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI.
# load_dotenv() runs first so that the os.getenv() calls below can see any
# values it provides.
load_dotenv()

openai.api_type = "azure"

AZURE_OPENAI_API_VERSION = os.getenv("AAG_AZURE_OPENAI_API_VERSION")
openai.api_version = AZURE_OPENAI_API_VERSION

# Default to "" (same as the endpoint below) so that a missing variable reaches
# the assert and fails with the intended message, instead of raising
# AttributeError from calling .strip() on None.
AZURE_OPENAI_API_KEY = os.getenv("AAG_AZURE_OPENAI_API_KEY", "").strip()
assert AZURE_OPENAI_API_KEY, "ERROR: Azure OpenAI Key is missing"
openai.api_key = AZURE_OPENAI_API_KEY

AZURE_OPENAI_ENDPOINT = os.getenv("AAG_AZURE_OPENAI_ENDPOINT", "").strip()
assert AZURE_OPENAI_ENDPOINT, "ERROR: Azure OpenAI Endpoint is missing"
openai.api_base = AZURE_OPENAI_ENDPOINT

# Deployment for chat (the 16K variable is the one currently read)
# DEPLOYMENT_NAME_CHAT = os.getenv('DEPLOYMENT_NAME_CHAT')
DEPLOYMENT_NAME_CHAT = os.getenv('AAG_DEPLOYMENT_NAME_CHAT_16K')

# Deployment for embedding; `model` is the name the embedding cells pass on
DEPLOYMENT_NAME_EMBEDDING = os.getenv("AAG_DEPLOYMENT_NAME_EMBEDDING")
model: str = DEPLOYMENT_NAME_EMBEDDING

# Azure AI Search (Azure Cognitive Search) vector store: endpoint and admin key.
# These are not validated here; os.getenv() yields None when a variable is unset.
vector_store_address: str = os.getenv("AAG_AZURE_SEARCH_SERVICE_ENDPOINT")
vector_store_password: str = os.getenv("AAG_AZURE_SEARCH_ADMIN_KEY")
# index_name: str = "langchain-vector-arxiv-physics"

# Bing subscription key
BING_SUBSCRIPTION_KEY = os.getenv("BING_SUBSCRIPTION_KEY")

## Load SEC data

* 10-K, 10-Q, 8-K

In [10]:
from langchain.document_loaders import PyPDFLoader

# Load the SEC filing PDF from the local data_source directory.
# The 10-K report is the active source; the 10-Q loader below is kept as an
# alternative and is currently disabled.
loader = PyPDFLoader("./data_source/zbra-20221231_10-K.pdf")
# loader = PyPDFLoader("./data_source/zbra-20230401_10-Q.pdf")
loaded_documents = loader.load()

In [None]:
# Inspect the loaded documents (shown as the cell output)
loaded_documents

In [13]:
from langchain.text_splitter import CharacterTextSplitter

# Split the loaded documents into chunks.
# The splitter is configured with chunk_size=2000 and no overlap between
# consecutive chunks (chunk_overlap=0).
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
splitted_docs = text_splitter.split_documents(loaded_documents)


In [None]:
# Inspect the resulting chunks (shown as the cell output)
splitted_docs

## Create embeddings and vector store instances

### Option 1: FAISS vector store

In [104]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Embedding client for the Azure OpenAI deployment named by `model`.
# Earlier hard-coded variant, kept for reference:
# embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002", chunk_size=16)
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    deployment=model,
    model=model,
    chunk_size=1,
    openai_api_base=AZURE_OPENAI_ENDPOINT,
    openai_api_type="azure",
    api_key=AZURE_OPENAI_API_KEY,
)

# Build the FAISS vector index from the split documents using that client
db = FAISS.from_documents(splitted_docs, embeddings)

# Example of querying the index (disabled):
# query = "What did the president say about Ketanji Brown Jackson"
# docs = db.similarity_search(query)
# # Print the results
# print(docs[0].page_content)

In [101]:
# Persist the FAISS index under the local path "faiss_index"
db.save_local("faiss_index")
# To reload the saved index and query it later (disabled):
# new_db = FAISS.load_local("faiss_index", embeddings)
# docs = new_db.similarity_search(query)

### Option 2: Azure AI Search (Cognitive search)

* TODO: Indexing will be done later. Need to watch the cost, so this is on hold for now.

In [32]:
# Embedding client for the Azure OpenAI deployment named by `model`
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    deployment=model,
    model=model,
    chunk_size=1,
    openai_api_base=AZURE_OPENAI_ENDPOINT,
    openai_api_type="azure",
    api_key=AZURE_OPENAI_API_KEY,
)

# Name of the Azure AI Search index that backs the vector store
index_name: str = "langchain-vector-zebra-10k-10q-8k"

# Vector store bound to that index. A single semantic configuration named
# 'config' is declared and selected as the default: 'content' serves as both
# the title field and the prioritized content field, and 'metadata' as the
# prioritized keywords field.
azure_ai_search_vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    semantic_configuration_name='config',
    semantic_settings=SemanticSettings(
        default_configuration='config',
        configurations=[
            SemanticConfiguration(
                name='config',
                prioritized_fields=PrioritizedFields(
                    title_field=SemanticField(field_name='content'),
                    prioritized_content_fields=[SemanticField(field_name='content')],
                    prioritized_keywords_fields=[SemanticField(field_name='metadata')],
                ),
            )
        ],
    ),
)

#### Insert text and embeddings into vector store - need to watch the cost, so on hold for now

In [None]:
# Executing the following line will start embedding ...
# azure_ai_search_vector_store.add_documents(documents=splitted_docs)