In [1]:
import os
from dotenv import load_dotenv
from openai import AzureOpenAI
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
import azure.search.documents.indexes as indexes
import azure.search.documents.models as models
from azure.storage.blob import BlobServiceClient

load_dotenv(override=True)

True

Setup OpenAI Client

In [2]:
oai_key = os.getenv("AZURE_OPENAI_API_KEY")
oai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

openai_client = AzureOpenAI(
    api_key = oai_key,
    api_version = "2024-02-01",
    azure_endpoint = oai_endpoint
    )

print('API: ', oai_key, 'ENDPOINT: ', oai_endpoint)

API:  97c0e51f82664c5087879c951ab9ab81 ENDPOINT:  https://my-chatbox-1.openai.azure.com


Setup SearchClient

In [3]:
search_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
search_api = os.getenv("AZURE_SEARCH_API_KEY")
credential = AzureKeyCredential(search_api)
index_name = "my-search-index"

search_client = SearchClient(endpoint=search_endpoint, index_name=index_name, credential=credential)

print('API: ', search_api, 'ENDPOINT: ', search_endpoint)

API:  MBkuu4JSork3x8Bxn2OMSKsKpGjlMZvW1RfCIItLB1AzSeAjGIKs ENDPOINT:  https://my-search-123.search.windows.net


In [None]:
'''
    fields = [
        indexes.models.SimpleField(name="id", type=indexes.models.SearchFieldDataType.String, key=True),
        indexes.models.SearchableField(name="content", filterable=True),
        indexes.models.SearchableField(name="pages", filterable=True), 
                            #type=indexes.models.SearchFieldDataType.String,
                            #searchable=True),
        indexes.models.SearchField(name="contentVector",
                            type=indexes.models.SearchFieldDataType.Collection(indexes.models.SearchFieldDataType.Single),
                            searchable=True,
                            vector_search_dimensions=1536,
                            vector_search_profile_name='my_search_profile'),
        
    ],'''

Setup SearchIndexClient

In [36]:
index_client = indexes.SearchIndexClient(endpoint=search_endpoint, credential=credential)

search_index = indexes.models.SearchIndex(
    name = index_name,
    fields = [indexes.models.SearchField(
                name="chunk_id",
                type=indexes.models.SearchFieldDataType.String,
                key=True,
                hidden=False,
                filterable=True,
                sortable=True,
                facetable=False,
                searchable=True,
                analyzer_name="keyword"
            ),
            indexes.models.SearchField(
                name="parent_id",
                type=indexes.models.SearchFieldDataType.String,
                hidden=False,
                filterable=True,
                sortable=True,
                facetable=False,
                searchable=True
            ),
            indexes.models.SearchField(
                name="chunk",
                type=indexes.models.SearchFieldDataType.String,
                hidden=False,
                filterable=False,
                sortable=False,
                facetable=False,
                searchable=True
            ),
            indexes.models.SearchField(
                name="title",
                type=indexes.models.SearchFieldDataType.String,
                hidden=False,
                filterable=False,
                sortable=False,
                facetable=False,
                searchable=True
            ),
            indexes.models.SearchField(
                name="vector",
                type=indexes.models.SearchFieldDataType.Collection(indexes.models.SearchFieldDataType.Single),
                hidden=False,
                filterable=False,
                sortable=False,
                facetable=False,
                searchable=True,
                vector_search_dimensions=1536,
                vector_search_profile_name="my_search_profile"
            )],
    vector_search = indexes.models.VectorSearch(
        algorithms = [
            indexes.models.HnswAlgorithmConfiguration(
                name = 'my_hnsw',
                #kind = indexes.models.VectorSearchAlgorithmKind.HNSW,
                #parameters = indexes.models.HnswParameters(metric = 'cosine')
            )
        ],
        profiles = [
            indexes.models.VectorSearchProfile(
                name = 'my_search_profile',
                algorithm_configuration_name = 'my_hnsw'
            )
        ],
        #vectorizers=[  
            #indexes.models.AzureOpenAIVectorizer(  
                #name="myOpenAI",  
                #kind="azureOpenAI",  
                #azure_open_ai_parameters=indexes.models.AzureOpenAIParameters(  
                    #resource_uri=oai_endpoint,  
                    #deployment_id='text-embedding-ada-002',
                    #model_name='text-embedding-ada-002',
                    #api_key=oai_key,
                #),
            #),  
        #],  
    )
)

index_client.create_or_update_index(search_index)

<azure.search.documents.indexes.models._index.SearchIndex at 0x19df6cf04d0>

Setup BlobStorageClient

In [5]:
connect_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
blob_client = BlobServiceClient.from_connection_string(connect_string)

container_name = "mycontainer"
blob_client.create_container(container_name)

<azure.storage.blob._container_client.ContainerClient at 0x19df6cd0d50>

Upload document in the database to Blob Storage

In [29]:
database = './database'
for file_name in os.listdir(database):
    blob_obj = blob_client.get_blob_client(container=container_name, blob=file_name)
    with open(os.path.join(database, file_name), "rb") as data:
        try:
            blob_obj.upload_blob(data)
        except:
            pass

Link AI Search to Blob Storage

In [37]:
indexer_client = indexes.SearchIndexerClient(endpoint=search_endpoint, credential=credential)
indexer_data = indexes.models.SearchIndexerDataContainer(name = container_name)

data_connection = indexes.models.SearchIndexerDataSourceConnection(
    name = "my-data-connection",
    type = indexes.models.SearchIndexerDataSourceType.azure_blob,
    container = indexer_data,
    connection_string = connect_string
)

indexer_client.create_or_update_data_source_connection(data_connection)

<azure.search.documents.indexes.models._models.SearchIndexerDataSourceConnection at 0x19df7b9fc10>

Set Indexer skillset

In [38]:
split_skill = indexes.models.SplitSkill(
    name = "Split Skill",
    default_language_code="en",
    context = '/document',
    text_split_mode = 'pages',
    maximum_page_length = 500,
    page_overlap_length = 100,
    inputs = [
        indexes.models.InputFieldMappingEntry(
            name = 'text',
            source = '/document/content'
        )
    ],
    outputs = [
        indexes.models.OutputFieldMappingEntry(
            name = 'textItems',
            target_name = 'pages'
        )
    ]
)
embedding_skill = indexes.models.AzureOpenAIEmbeddingSkill(
    name = "OpenAI Embedding Skill",
    context = '/document/pages/*',
    resource_uri = oai_endpoint,
    api_key = oai_key,
    deployment_id = 'text-embedding-ada-002',
    model_name = 'text-embedding-ada-002',
    dimensions = 1536,
    inputs = [
        indexes.models.InputFieldMappingEntry(
            name = 'text',
            source = '/document/pages/*'
        )
    ],
    outputs = [
        indexes.models.OutputFieldMappingEntry(
            name = 'embedding',
            target_name = 'vector'
        )
    ]
)
index_projections = indexes.models.SearchIndexerIndexProjections(  
        selectors=[  
            indexes.models.SearchIndexerIndexProjectionSelector(  
                target_index_name=index_name,  
                parent_key_field_name="parent_id",  
                source_context="/document/pages/*",  
                mappings=[
                    indexes.models.InputFieldMappingEntry(name="chunk", source="/document/pages/*"),  
                    indexes.models.InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
                    indexes.models.InputFieldMappingEntry(name="title", source="/document/metadata_storage_name")
                ]
            )
        ],  
        parameters=indexes.models.SearchIndexerIndexProjectionsParameters(  
            projection_mode=indexes.models.IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS  
        )  
    )

skillset = indexes.models.SearchIndexerSkillset(
    name = 'my-skillset',
    description = 'Skillset for OpenAI Embedding',
    skills = [split_skill, embedding_skill],
    index_projections = index_projections
)

indexer_client.create_or_update_skillset(skillset)

<azure.search.documents.indexes.models._models.SearchIndexerSkillset at 0x19df6d67510>

Setup SearchIndexerClient

In [43]:
indexer_name = "my-search-indexer"
search_indexer = indexes.models.SearchIndexer(
    name = indexer_name,
    description = "Indexer for my-search-index",
    data_source_name = data_connection.name,
    target_index_name = search_index.name,
    skillset_name = skillset.name,
    schedule = indexes.models.IndexingSchedule(interval = "PT5M"),
)

indexer_client.create_or_update_indexer(search_indexer)

<azure.search.documents.indexes._generated.models._models_py3.SearchIndexer at 0x19df7385d50>

In [28]:
indexer_client.run_indexer(indexer_name)

In [44]:
def generate_embeddings(text, model):
    # Generate embeddings for the provided text using the specified model
    embeddings_response = openai_client.embeddings.create(model=model, input=text)
    # Extract the embedding data from the response
    embedding = embeddings_response.data[0].embedding
    return embedding

In [49]:
def chat(query):
    embed_model = "text-embedding-ada-002"

    vector_query = models.VectorizedQuery(vector=generate_embeddings(query, embed_model), 
                                        k_nearest_neighbors=3, 
                                        fields="vector")
    #print(vector_query)
    results = search_client.search(  
        search_text=query,  
        vector_queries = [vector_query]
    )
    docs = ''
    for result in results:
        docs = docs + result['chunk'] + '\n'

    prompt = '''INSTRUCTIONS: Answer the question using the information in the document provided.\n
    QUESTION: {query}.\n
    DOCUMENT: {document}'''.format(query=query, document=docs)

    response = openai_client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {"role": "system", "content": "You are a HR manager at a tech company."},
            {"role": "user", "content": prompt}
        ]
    )

    #print(response)
    #print(response.model_dump_json(indent=2))
    print(response.choices[0].message.content)

Prompt input

In [52]:
chat("What is the most valueable skill that Hai have?")

The most valuable skill that Hai possesses is proficiency in developing machine learning models, particularly the ability to create optimized solutions that meet real-time speed and accuracy requirements. This is evidenced by Hai's work on the "Product Data Retrieval System," which involves both real-time recognition and the integration of a recommendation system to enhance user experience in retail environments. Additionally, Hai has demonstrated competencies in other machine learning applications such as customer segmentation and Q&A chatbot development using the RAG technique.
