Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Integrate the AML Skill with Azure Search



This notebook details the steps to add an AML skill that refers to the NER ML model deployed to AKS.

Prerequisites
1. Enable the AML skill in your subscription. To do this, open this [link](https://docs.microsoft.com/en-us/azure/search/whats-new) and, under the feature announcements for June 2020, click the link “To sign up for the AML skill preview, please [fill out the form.](https://forms.office.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR0jK7x7HQYdDm__YfEsbtcZUMTFGTFVTOE5XMkVUMFlDVFBTTlYzSlpLTi4u)”
2. AML model deployed to AKS


In [None]:
import requests
import json

Note: You can get all of the required values from 02_Initialize_Cognitive_Search.ipynb


In [None]:
# Configuration for Cognitive Search. The services referenced here are created
# as part of the initial resource deployment step.

# Name of the Search service
search_service_name = "XXXXXXXXXXXXXXXXXXXX"
# Search service admin key; replace with the value from the Azure portal
api_key = "XXXXXXXXXXXXXXXXXXXX"
search_service = f"https://{search_service_name}.search.windows.net"
api_version = "2019-05-06-Preview"
content_type = "application/json"

# Cognitive Services key: required, but it will not be billed. The key
# unlocks the free tier restrictions.
cog_svcs_key = "XXXXXXXXXXXXXXXXXXXX"

# Name and key of the storage account created during the resource deployment step
STORAGEACCOUNTNAME = "XXXXXXXXXXXXXXXXXXXX"
STORAGEACCOUNTKEY = "XXXXXXXXXXXXXXXXXXXX"

# Blob storage container holding the sample data used to generate the labeled
# data set. In this example it is named labelgendata.
datasource_container = "labelgendata"

# URL of the label skill container's web app
webapp = "https://XXXX.azurewebsites.net"
# URL of the Markdown skill. Copy it from the "Get Function Url" button of the
# function in the App Service in the Azure portal.
markdown_skill_fn = "https://<app service name>.azurewebsites.net/api/<Function Name>?code=XXXXXXXXXXXXXXXXXXXXXXXXXXXX=="

# Knowledge store settings. This sample assumes the same storage account is
# used for the data source, the knowledge store and the indexer cache.
know_store_cache = f"DefaultEndpointsProtocol=https;AccountName={STORAGEACCOUNTNAME};AccountKey={STORAGEACCOUNTKEY};EndpointSuffix=core.windows.net"
DS_STORAGE_ACCT_CONN_STR = know_store_cache

# Sanity check: stop here if the function URL contains no "==".
# NOTE(review): presumably this catches a URL pasted without its code= key -- confirm.
assert "==" in markdown_skill_fn

In [None]:
def construct_Url(service, resource, resource_name, action, api_version):
    """Build the REST URL for a search service resource.

    Args:
        service: Base URL of the search service, without a trailing slash.
        resource: Resource collection, e.g. "indexes" or "indexers".
        resource_name: Name of a specific resource; falsy to address the
            whole collection.
        action: Action on the named resource, e.g. "run"; falsy for none.
            Ignored when ``resource_name`` is falsy.
        api_version: Value for the ``api-version`` query parameter.

    Returns:
        The URL as a string.
    """
    segments = [service, resource]
    if resource_name:
        segments.append(resource_name)
        # An action is only appended to a named resource.
        if action:
            segments.append(action)
    return '/'.join(segments) + '?api-version=' + api_version


# Headers sent with the REST calls to the search service
headers = {"api-key": api_key, "Content-Type": content_type}

# Print a few sample URLs to check that the configuration works
print(
    *(
        construct_Url(search_service, resource, name, action, api_version)
        for resource, name, action in (
            ("indexes", "custom-ner", "analyze"),
            ("indexes", "custom-ner", None),
            ("indexers", None, None),
        )
    ),
    sep="\n",
)


## 1.0 Update the Skillset
Update the skillset created earlier to include the AML skill. On the next run of the indexer, the indexer will identify the skillset update resulting in executing only the AML skill on all documents. The indexer cache will be used to re-hydrate the state of all documents.


###  To add the AML skill, navigate to the Azure portal:
1. Navigate to the search service
2. Click on the skillsets tab
3. Select the skillset created in step 2
4. Select the "Skillset Definition (JSON)" tab
5. From the add skill template, select the AML skill 
6. Select the workspace 
7. Select the endpoint
8. Copy the generated template, this should include your "uri" and "key" 

Edit the copied skill so the inputs and outputs look like this:

```json
{
  "@odata.type": "#Microsoft.Skills.Custom.AmlSkill",
  "uri": "RESTEndPoint for AKS deployed AML service",
  "key": "Key to the AML Service endpoint",
  "timeout": "PT30S",
  "degreeOfParallelism": 0,
  "name": "",
  "description": "",
  "context": "",
  "inputs": [
        {
            "name": "raw_data",
            "sourceContext": "/document/contentmark",
            "inputs": [
                {
                    "name": "text",
                    "source": "/document/contentmark"
                }
            ]
        }
    ],
  "outputs": [
    {
      "name": "tags",
      "targetName": "tags"
    }
  ]
}
```

## 1.1 Add the skill to the skillset

Edit the skillset definition in the next cell to add the AML skill. Simply add the skill as the first skill in the skills array. The position of the skill is not important. Now edit the skill input and output to match the inputs and outputs as described in the sample above. 


In [None]:
# Build the skillset definition and create/update it on the search service.
# The skillset is named after the data source container with a "-ss" suffix.
skillset_name = f"{datasource_container}-ss"
skillset_def = {
    "name": skillset_name,
    "description": "km-aml solution accelerator-v1. A sample skillset to label and classify named entities",
    "skills": [
        # Add AML Skill here
        # Custom Web API skill: sends /document/content to the Markdown skill
        # function and stores the returned text as /document/contentmark.
        {
            "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
            "name": "MarkdownToText",
            "description": "Skill to clean markdown",
            "uri": f"{markdown_skill_fn}",
            "context": "/document",
            "inputs": [
                {"name": "doc", "source": "/document/content"},
            ],
            "outputs": [
                {"name": "text", "targetName": "contentmark"},
            ],
        },
        # Split skill: splits /document/contentmark into pages (maximum page
        # length 500), stored as /document/pages.
        {
            "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
            "name": "SplitSkill",
            "description": None,
            "context": "/document",
            "defaultLanguageCode": "en",
            "textSplitMode": "pages",
            "maximumPageLength": 500,
            "inputs": [
                {"name": "text", "source": "/document/contentmark"},
            ],
            "outputs": [
                {"name": "textItems", "targetName": "pages"},
            ],
        },
        # Custom Web API skill: posts /document/contentmark to the web app's
        # /label endpoint and stores the result as /document/results.
        {
            "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
            "name": "#2",
            "description": "Skill to generate labels and POS data",
            "uri": f"{webapp}/label",
            "context": "/document",
            "inputs": [
                {"name": "doc", "source": "/document/contentmark"},
            ],
            "outputs": [
                {"name": "result", "targetName": "results"},
            ],
        },
        # Conditional skill: for each annotation, `keep` is null when the
        # label equals 'O' and 'hasLabel' otherwise.
        {
            "@odata.type": "#Microsoft.Skills.Util.ConditionalSkill",
            "name": "#3",
            "description": "Filter sentences with a label",
            "context": "/document/results/*/annotations/*",
            "inputs": [
                {
                    "name": "condition",
                    "source": "= $(/document/results/*/annotations/*/label) == 'O'",
                },
                {"name": "whenTrue", "source": "= null"},
                {"name": "whenFalse", "source": "= 'hasLabel'"},
            ],
            "outputs": [
                {"name": "output", "targetName": "keep"},
            ],
        },
        # Shaper skill: gathers the per-annotation `keep` values of each
        # result under `hasLabels`.
        {
            "@odata.type": "#Microsoft.Skills.Util.ShaperSkill",
            "name": "#4",
            "description": "Aggregating the label at a senetence level",
            "context": "/document/results/*",
            "inputs": [
                {"name": "keeps", "source": "/document/results/*/annotations/*/keep"},
            ],
            "outputs": [
                {"name": "output", "targetName": "hasLabels"},
            ],
        },
        # Conditional skill: `labeled` is the result itself when the first
        # entry of hasLabels/keeps equals 'hasLabel', and null otherwise.
        {
            "@odata.type": "#Microsoft.Skills.Util.ConditionalSkill",
            "name": "#5",
            "description": "Filtering out the senetences with no labels",
            "context": "/document/results/*",
            "inputs": [
                {
                    "name": "condition",
                    "source": "= $(/document/results/*/hasLabels/keeps/0) == 'hasLabel'",
                },
                {"name": "whenTrue", "source": "/document/results/*"},
                {"name": "whenFalse", "source": "= null"},
            ],
            "outputs": [
                {"name": "output", "targetName": "labeled"},
            ],
        },
    ],
    "cognitiveServices": {
        "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey",
        "description": "DemosCS",
        "key": f"{cog_svcs_key}",
    },
    # Knowledge store: a single object projection into the "labeled-data"
    # container of the configured storage account.
    "knowledgeStore": {
        "storageConnectionString": f"{know_store_cache}",
        "projections": [
            {
                "tables": [],
                "objects": [
                    {
                        "storageContainer": "labeled-data",
                        "referenceKeyName": None,
                        "generatedKeyName": "label-result",
                        "source": None,
                        "sourceContext": "/document/results",
                        "inputs": [
                            {
                                "name": "labeled-data",
                                "source": "/document/results/*/labeled",
                            },
                            {
                                "name": "document_id",
                                "source": "/document/metadata_storage_path",
                            },
                        ],
                    },
                ],
                "files": [],
            },
        ],
    },
}


# PUT the skillset definition to the search service.
r = requests.put(
    construct_Url(search_service, "skillsets", skillset_name, None, api_version),
    data=json.dumps(skillset_def),
    headers=headers,
)
print(r.status_code)
# The status code alone does not say why a definition was rejected (for
# example a malformed AML skill pasted above), so print the response body
# for error responses as well.
if not r.ok:
    print(r.text)


## 2.0 Update Indexer
Update the indexer to map the new products and features properties generated by the AML skill to the products and features fields in the index.

The next cell rebuilds the complete indexer definition, including the new outputFieldMappings, and PUTs it to the search service.

In [None]:
# Build the indexer definition and create/update it on the search service.
# The indexer, data source, skillset and index names are all derived from the
# data source container name.
indexername = f"{datasource_container}-idxr"
indexer_def = {
    "name": indexername,
    "description": "km-aml solution accelerator-v1. Indexer version to label named entities",
    "dataSourceName": f"{datasource_container}-ds",
    "skillsetName": f"{datasource_container}-ss",
    "targetIndexName": f"{datasource_container}-idx",
    "disabled": None,
    "schedule": {
        "interval": "PT1H",
        "startTime": "0001-01-01T00:00:00Z",
    },
    "parameters": {
        "batchSize": None,
        "maxFailedItems": 0,
        "maxFailedItemsPerBatch": 0,
        "base64EncodeKeys": None,
        "configuration": {
            "dataToExtract": "contentAndMetadata",
            "parsingMode": "default",
            "imageAction": "generateNormalizedImages",
        },
    },
    # metadata_storage_path is base64-encoded on its way into the index.
    "fieldMappings": [
        {
            "sourceFieldName": "metadata_storage_path",
            "targetFieldName": "metadata_storage_path",
            "mappingFunction": {
                "name": "base64Encode",
                "parameters": None,
            },
        },
    ],
    # Map the products and features found under /document/tags to the
    # products and features index fields.
    "outputFieldMappings": [
        {
            "sourceFieldName": "/document/tags/products/*",
            "targetFieldName": "products",
        },
        {
            "sourceFieldName": "/document/tags/features/*",
            "targetFieldName": "features",
        },
    ],
    # Indexer cache, stored in the same storage account as the knowledge store.
    "cache": {
        "enableReprocessing": True,
        "storageConnectionString": f"{know_store_cache}",
    },
}

# PUT the indexer definition to the search service.
r = requests.put(
    construct_Url(search_service, "indexers", indexername, None, api_version),
    data=json.dumps(indexer_def),
    headers=headers,
)
# Report the outcome like the other cells do; previously the response was
# dropped, so a rejected indexer update went unnoticed.
print(r.status_code)
if not r.ok:
    print(r.text)

In [None]:
# Run the indexer to reindex your dataset. The indexer will only execute the
# AML skill the next time it executes.
r = requests.post(
    construct_Url(search_service, "indexers", indexername, "run", api_version),
    data=None,
    headers=headers,
)
print(r.status_code)
# Print the service's error message when the run request is rejected; the
# status code alone does not explain the failure.
if not r.ok:
    print(r.text)
