# Email Content Classification with Azure Cognitive Search and Azure OpenAI

This code demonstrates how to use Azure Cognitive Search with Azure OpenAI and the Azure Python SDK to classify documents.

## Prerequisites
1. To run the code, install the following packages. `azure-search-documents` must be the pinned pre-release version (`11.4.0b6`), because the vector search calls below use that pre-release API.


- > ! pip install azure-search-documents==11.4.0b6
- > ! pip install openai

## Load all the AOAI API keys and model parameters

In [1]:
import aoai

# Azure OpenAI connection settings: resource endpoint, API version, and the
# engine names for the GPT model and the embedding model.
MY_AOAI_ENDPOINT = 'https://tr-non-prod-gpt4.openai.azure.com/'
MY_AOAI_VERSION = '2023-07-01-preview'
MY_GPT_ENGINE = 'tr-gpt4'
MY_AOAI_EMBEDDING_ENGINE = 'tr-embedding-ada'

# Configure the client through the project helper; a positive status is
# treated as success, anything else as failure.
status = aoai.setupOpenai(
    aoai_version=MY_AOAI_VERSION,
    aoai_endpoint=MY_AOAI_ENDPOINT,
)
print("AOAI setup succeeded" if status > 0 else "AOAI setup failed")


Got OPENAI API Key from environment variable
AOAI setup succeeded


## Create the embeddings

In [5]:
import json
from tenacity import retry, wait_random_exponential, stop_after_attempt

#### Set the parameters

In [6]:
# TODO: Read from Blob Store
# Assuming you are running notebook from the notebook folder
import os

# Build the paths from their components so the platform's own separator is
# used and no backslash escape sequences end up inside string literals.
# The trailing '' keeps a separator at the end of MY_PROJECT_ROOT, as before.
MY_PROJECT_ROOT = os.path.join('..', '..', '..', 'data', 'sample-auto-insurance-emails', '')

# Labelled emails used to populate the search index.
MY_INPUT_DATA_FILE = os.path.join(MY_PROJECT_ROOT, 'cleaned-emails-with-classes-for-training.json')
# Same emails after the embedding vector has been added to every record.
MY_INPUT_DATA_AND_VECTOR_FILE = os.path.join(MY_PROJECT_ROOT, 'cleaned-emails-with-classes-and-vectors.json')

# Held-out emails used as queries against the index.
MY_TEST_DATA_FILE = os.path.join(MY_PROJECT_ROOT, 'cleaned-emails-with-classes-for-test.json')

# The different classes; each one is also a key in every email record.
categories = [
    "PolicyCancellation", "IncisoCancellation", "PersonChange",
    "VINNumberChange", "CoverageChange", "SubsequenteRegister",
    "PaymentMethodChange", "UseChange", "DiscountChange", "VehicleChange",
    "BillingChange", "VehicleDataChange", "Transactionoutofscope",
]

## Create the embeddings

In [57]:
# Load the labelled training emails (a JSON list of records).
with open(MY_INPUT_DATA_FILE, 'r', encoding='utf-8') as file:
    input_data = json.load(file)


# Calls that raise are retried with a randomized exponential wait
# (bounded by min=1 / max=20) and abandoned after 6 attempts.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def add_embedding(text_data):
    """Return the embedding of ``text_data`` from the configured embedding engine."""
    return aoai.generate_embedding(the_engine=MY_AOAI_EMBEDDING_ENGINE,
                                   the_text=text_data)


for item in input_data:
    # Drop the 'Index' field; tolerate records that do not carry it.
    item.pop('Index', None)
    # FileName is the key field of the search index defined later in this
    # notebook; it is stored without the '.pdf' extension.
    item['FileName'] = item['FileName'].removesuffix('.pdf')
    # Store every class label as its text form wrapped in double quotes.
    for category in categories:
        item[category] = f'"{item[category]}"'
    content_embeddings = add_embedding(item['EmailBody'])
    item['contentVector'] = content_embeddings

# Write the records, now including their vectors, for the upload step.
with open(MY_INPUT_DATA_AND_VECTOR_FILE, "w", encoding='utf-8') as f:
    json.dump(input_data, f)

## Authenticate to Azure Cognitive Search

In [58]:
import cog_search

# Credential passed to every Cognitive Search client created in this notebook.
# NOTE(review): the helper's printed output suggests it reads the admin API key
# from an environment variable - confirm in the cog_search module.
cogSearchCredential = cog_search.getCogSearchCredential()


Got Azure Cognitive Search ADMIN API Key from environment variable


In [59]:
from azure.search.documents.indexes import SearchIndexClient

# Cognitive Search service endpoint and the name of the index that this
# notebook creates, fills and queries.
MY_COG_SEARCH_ENDPOINT = 'https://tr-docai-cog-search.search.windows.net'
MY_COG_SEARCH_INDEX_NAME = 'sample-auto-insurance-emails-index'

#### Import all Cognitive Search Packages

In [60]:
from azure.search.documents import SearchClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    VectorSearchAlgorithmConfiguration,  
)

#### Create the search index

In [74]:
# Index schema: FileName is the document key, EmailBody holds the text, every
# entry of `categories` becomes a filterable string field, and contentVector
# holds the embedding. Deriving the category fields from `categories` keeps
# the schema in step with the keys written into each document.
fields = [
    SimpleField(name="FileName", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="EmailBody", type=SearchFieldDataType.String),
    *[
        SearchableField(name=category, type=SearchFieldDataType.String, filterable=True)
        for category in categories
    ],
    # NOTE(review): 1536 must equal the length of the vectors returned by the
    # embedding engine - confirm for the deployed model.
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config"),
]

# HNSW configuration referenced by name from the contentVector field above.
vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            hnsw_parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 500,
                "metric": "cosine"
            }
        )
    ]
)

# Semantic configuration: file name as title, class labels as keyword fields,
# email text as content.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="FileName"),
        prioritized_keywords_fields=[
            SemanticField(field_name=category) for category in categories
        ],
        prioritized_content_fields=[SemanticField(field_name="EmailBody")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])


# create an index client connection
index_client = SearchIndexClient(endpoint=MY_COG_SEARCH_ENDPOINT,
                                 credential=cogSearchCredential)
# delete any existing index first to have a clean slate
index_client.delete_index(MY_COG_SEARCH_INDEX_NAME)
# Create the search index with the semantic settings
index = SearchIndex(name=MY_COG_SEARCH_INDEX_NAME, fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 sample-auto-insurance-emails-index created


#### Insert Email Content and Vectors into Cognitive Search Index

In [75]:
# Load the documents (emails plus their vectors) written by the embedding step.
with open(MY_INPUT_DATA_AND_VECTOR_FILE, 'r', encoding='utf-8') as file:
    documents = json.load(file)
search_client = SearchClient(
                    endpoint=MY_COG_SEARCH_ENDPOINT,
                    index_name=MY_COG_SEARCH_INDEX_NAME,
                    credential=cogSearchCredential)
result = search_client.upload_documents(documents)
# upload_documents returns one IndexingResult per document; count only the
# ones that were accepted and list the rejected keys so failures are visible.
failed_results = [entry for entry in result if not entry.succeeded]
print(f"Uploaded {len(documents) - len(failed_results)} documents")
for failed_result in failed_results:
    print(f"Failed to upload document [{failed_result.key}]: {failed_result.error_message}")

Uploaded 330 documents


## Vector Search

In [92]:
# Load the held-out test emails; each one is used as a query.
with open(MY_TEST_DATA_FILE, 'r', encoding='utf-8') as file:
    test_data = json.load(file)

# The client does not depend on the query, so build it once for all queries.
search_client = SearchClient(
                    endpoint=MY_COG_SEARCH_ENDPOINT,
                    index_name=MY_COG_SEARCH_INDEX_NAME,
                    credential=cogSearchCredential)

for item in test_data:
    query = item['EmailBody']
    fileName = item['FileName']

    # Pure vector query (no search text): return the single indexed email whose
    # contentVector is nearest to the embedding of the test email.
    results = search_client.search(
        search_text=None,
        vector=aoai.generate_embedding(
                        the_engine=MY_AOAI_EMBEDDING_ENGINE,
                        the_text=query),
        top_k=1,
        vector_fields="contentVector",
        select=["FileName", "EmailBody", *categories],
    )

    for result in results:
        print(f"----Test File[{fileName}] :: Nearest Neighbor File[{result['FileName']}]----")
        print(f"Score: {result['@search.score']}\n")
        # Print the neighbour's class labels, four per line, followed by two
        # blank lines to separate consecutive test files.
        label_lines = [
            "; ".join(f"{name}[{result[name]}]" for name in categories[start:start + 4])
            for start in range(0, len(categories), 4)
        ]
        print("\n".join(label_lines) + "\n\n")


----Test File[7d8b0441-5018-40a9-852d-48f32d1acc79.pdf] :: Nearest Neighbor File[9d6306c3-a034-4d08-bcf0-82dc4540195b]----
Score: 0.9339431

PolicyCancellation["False"]; IncisoCancellation["False"]; PersonChange["False"]; VINNumberChange["False"]
CoverageChange["False"]; SubsequenteRegister["True"]; PaymentMethodChange["False"]; UseChange["True"]
DiscountChange["False"]; VehicleChange["False"]; BillingChange["False"]; VehicleDataChange["False"]
Transactionoutofscope["False"]


----Test File[ab8f5654-61b2-4eea-83a7-dccacdf52022.pdf] :: Nearest Neighbor File[0434a779-56a7-4a0a-8b07-68543dbcb73b]----
Score: 0.92652446

PolicyCancellation["False"]; IncisoCancellation["True"]; PersonChange["False"]; VINNumberChange["True"]
CoverageChange["False"]; SubsequenteRegister["False"]; PaymentMethodChange["False"]; UseChange["False"]
DiscountChange["True"]; VehicleChange["False"]; BillingChange["False"]; VehicleDataChange["False"]
Transactionoutofscope["False"]


----Test File[724b4f48-5e97-48c5-b05

## Semantic Hybrid Search to get the nearest vectors for the query

The categories of the returned entry are the nearest categories for the query.

In [78]:
# Load the held-out test emails; each one is used as a query.
with open(MY_TEST_DATA_FILE, 'r', encoding='utf-8') as file:
    test_data = json.load(file)

# The client does not depend on the query, so build it once for all queries.
search_client = SearchClient(
                    endpoint=MY_COG_SEARCH_ENDPOINT,
                    index_name=MY_COG_SEARCH_INDEX_NAME,
                    credential=cogSearchCredential)

for item in test_data:
    query = item['EmailBody']
    print(f'\n\n----Email text----\n{item["FileName"]}')

    # Hybrid query: the email text is sent both as search text and as an
    # embedding, using the semantic configuration defined on the index.
    results = search_client.search(
        search_text=query,
        vector=aoai.generate_embedding(
                        the_engine=MY_AOAI_EMBEDDING_ENGINE,
                        the_text=query),
        top_k=1,
        vector_fields="contentVector",
        select=["EmailBody"],
        query_type="semantic",
        query_language="en-us",
        semantic_configuration_name='my-semantic-config',
        query_caption="extractive",
        query_answer="extractive",
        top=1
    )
    #TODO Get the Category name instead of the key
    # get_answers() may yield nothing when the service returns no semantic
    # answer; treat that as an empty list instead of failing in the loop.
    semantic_answers = results.get_answers() or []
    for answer in semantic_answers:
        print(f"Semantic Answer: {answer}")
        print(f"Semantic Answer Score: {answer.score}\n")
        print(f"Category: {answer.key}\n")



----Email text----
7d8b0441-5018-40a9-852d-48f32d1acc79.pdf


----Email text----
ab8f5654-61b2-4eea-83a7-dccacdf52022.pdf


----Email text----
724b4f48-5e97-48c5-b05e-2ae99eb4da34.pdf


----Email text----
6f6c1353-d3a7-4ccf-a3df-7ac45de7abbb.pdf


----Email text----
e7014199-7b44-49b4-9c66-49a17d2d2c81.pdf
