Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Initialize the Cognitive Search Service

The process of labelling data, training a model, and then using the model to extract entities from documents requires us to take multiple passes and in turn we'll create two separate indexes. In this first pass, the labeled data is projected into the knowledge store and is then imported as a dataset into AML to train a model.

The generalized model is then added to the skillset to extract entities so that the next index you create in step 4 will leverage the model to automatically populate the search index with the entities extracted.

# Prerequisites

1.	[Label Skill and Markdown Skill deployed](../01_Deploy_Skills)
2.	If you plan on using the sample data provided with this solution, follow the steps in the "Create a dataset" section on this [page](./)
 
The "datasource_container" variable defined below refers to the container (labelgendata) that holds the sample data used to generate the labeled data.


## Add Label Skill
This notebook will generate the enrichment pipeline with the label skill and knowledge store configured. The last cell runs the indexer to start enriching documents and produce the labeled dataset.

In [None]:
import requests
import json

## 1.0 Define the required variables

In [1]:
# Configure all required variables for Cognitive Search. The required services
# are created as part of the initial resource deployment step.

# Name of the Search Service
search_service_name = 'XXXXXXXXXXXXXXXXXXXX'
# Replace with the Search Service admin key from the Azure portal
api_key = "XXXXXXXXXXXXXXXXXXXX"
# Base endpoint of the search service, derived from its name
search_service = f"https://{search_service_name}.search.windows.net"
api_version = '2019-05-06-Preview'
content_type = 'application/json'

# Replace with a Cognitive Services key. It is required but will not be billed;
# the key unlocks the free tier restrictions.
cog_svcs_key = 'XXXXXXXXXXXXXXXXXXXX'
# Name of the storage account created during the resource deployment step, and its key
STORAGEACCOUNTNAME = 'XXXXXXXXXXXXXXXXXXXX'
STORAGEACCOUNTKEY = 'XXXXXXXXXXXXXXXXXXXX'


# datasource_container should point to the blob storage container that has the
# sample data used to generate the labeled data set. In this example we named
# it labelgendata.
datasource_container = 'labelgendata'
# This sample assumes you will use the same storage account for the datasource,
# knowledge store and indexer cache.


# URL of the Label Skill container's web app.
webapp = 'https://XXXX.azurewebsites.net'
# URL of the Markdown skill. You can get this from the "Get Function Url"
# button in the App Service Functions in the Azure portal.
markdown_skill_fn = 'https://<app service name>.azurewebsites.net/api/<Function Name>?code=XXXXXXXXXXXXXXXXXXXXXXXXXXXX=='
# Knowledge store related variables: one storage connection string is shared by
# the knowledge store, the indexer cache and the datasource.
know_store_cache = f'DefaultEndpointsProtocol=https;AccountName={STORAGEACCOUNTNAME};AccountKey={STORAGEACCOUNTKEY};EndpointSuffix=core.windows.net'
DS_STORAGE_ACCT_CONN_STR = know_store_cache

# Sanity check on the function URL. An explicit exception is raised instead of
# using `assert`, because assertions are stripped when Python runs with -O and
# give no hint about what to fix.
if "==" not in markdown_skill_fn:
    raise ValueError(
        "markdown_skill_fn does not contain '=='; paste the full function URL, "
        "including its code query parameter, from the Azure portal"
    )

In [None]:
def construct_Url(service, resource, resource_name, action, api_version):
    """Build a REST URL of the form ``service/resource[/name[/action]]?api-version=...``.

    Args:
        service: Base URL of the search service (no trailing slash expected).
        resource: Resource collection, e.g. ``"indexes"`` or ``"indexers"``.
        resource_name: Name of a specific resource, or a falsy value to
            address the collection itself.
        action: Action to append after the resource name, or a falsy value
            for none. Ignored when ``resource_name`` is falsy.
        api_version: Value for the ``api-version`` query parameter.

    Returns:
        The assembled URL string.
    """
    segments = [service, resource]
    if resource_name:
        segments.append(resource_name)
        # An action is only appended when it targets a named resource.
        if action:
            segments.append(action)
    return '/'.join(segments) + '?api-version=' + api_version


# Request headers sent with every REST call made below.
headers = {'api-key': api_key, 'Content-Type': content_type}

# Test out the URLs to ensure that the configuration works: print one URL for
# each shape construct_Url can produce (name + action, name only, collection).
for resource, resource_name, action in (
    ("indexes", "custom-ner", "analyze"),
    ("indexes", "custom-ner", None),
    ("indexers", None, None),
):
    print(construct_Url(search_service, resource, resource_name, action, api_version))


## 2.0 Create Datasource
This will extract the data that needs to be labelled. 

In [None]:
container = datasource_container  # Replace with the container name

# Definition of the blob datasource that the indexer will read from.
datsource_def = {
    'name': f'{datasource_container}-ds',
    'description': f'km-aml solution accelerator- Datasource to label and extract custom entities from a corpus in {STORAGEACCOUNTNAME}, container {container}',
    'type': 'azureblob',
    'subtype': None,
    'credentials': {'connectionString': DS_STORAGE_ACCT_CONN_STR},
    'container': {'name': datasource_container},
}

# POST the definition to the datasources collection, then echo the HTTP
# response and its JSON body.
datasources_url = construct_Url(search_service, "datasources", None, None, api_version)
r = requests.post(datasources_url, data=json.dumps(datsource_def), headers=headers)
print(r)
res = r.json()
print(json.dumps(res, indent=2))

## 3.0 Create SkillSet

Create the skillset to label your data. The skillset calls Markdown Parskel skill, the label skill, then shapes the data, then finally uses a condition skill to filter out sentences without any labels.

In [None]:
skillset_name = f'{datasource_container}-ss'


def _skill_input(name, source):
    """Return one skill/projection input entry mapping `name` to an enrichment path or expression."""
    return {'name': name, 'source': source}


def _skill_output(name, target_name):
    """Return one skill output entry that publishes `name` under `target_name`."""
    return {'name': name, 'targetName': target_name}


# Custom web API skill: sends the raw document content to the markdown function.
markdown_skill = {
    '@odata.type': '#Microsoft.Skills.Custom.WebApiSkill',
    'name': 'MarkdownToText',
    'description': 'Skill to clean markdown',
    'uri': markdown_skill_fn,
    'context': '/document',
    'inputs': [_skill_input('doc', '/document/content')],
    'outputs': [_skill_output('text', 'contentmark')],
}

# Built-in split skill: breaks the cleaned text into pages of at most 500 characters.
split_skill = {
    '@odata.type': '#Microsoft.Skills.Text.SplitSkill',
    'name': 'SplitSkill',
    'description': None,
    'context': '/document',
    'defaultLanguageCode': 'en',
    'textSplitMode': 'pages',
    'maximumPageLength': 500,
    'inputs': [_skill_input('text', '/document/contentmark')],
    'outputs': [_skill_output('textItems', 'pages')],
}

# Custom web API skill: calls the label web app on the cleaned text.
label_skill = {
    '@odata.type': '#Microsoft.Skills.Custom.WebApiSkill',
    'name': '#2',
    'description': 'Skill to generate labels and POS data',
    'uri': f'{webapp}/label',
    'context': '/document',
    'inputs': [_skill_input('doc', '/document/contentmark')],
    'outputs': [_skill_output('result', 'results')],
}

# Conditional skill per annotation: null when the label is 'O', 'hasLabel' otherwise.
annotation_filter_skill = {
    '@odata.type': '#Microsoft.Skills.Util.ConditionalSkill',
    'name': '#3',
    'description': 'Filter sentences with a label',
    'context': '/document/results/*/annotations/*',
    'inputs': [
        _skill_input('condition', "= $(/document/results/*/annotations/*/label) == 'O'"),
        _skill_input('whenTrue', '= null'),
        _skill_input('whenFalse', "= 'hasLabel'"),
    ],
    'outputs': [_skill_output('output', 'keep')],
}

# Shaper skill per result: gathers the per-annotation 'keep' markers.
shaper_skill = {
    '@odata.type': '#Microsoft.Skills.Util.ShaperSkill',
    'name': '#4',
    'description': 'Aggregating the label at a senetence level',
    'context': '/document/results/*',
    'inputs': [_skill_input('keeps', '/document/results/*/annotations/*/keep')],
    'outputs': [_skill_output('output', 'hasLabels')],
}

# Conditional skill per result: passes the result through only when its first
# 'keeps' entry is 'hasLabel'; otherwise emits null.
sentence_filter_skill = {
    '@odata.type': '#Microsoft.Skills.Util.ConditionalSkill',
    'name': '#5',
    'description': 'Filtering out the senetences with no labels',
    'context': '/document/results/*',
    'inputs': [
        _skill_input('condition', "= $(/document/results/*/hasLabels/keeps/0) == 'hasLabel'"),
        _skill_input('whenTrue', '/document/results/*'),
        _skill_input('whenFalse', '= null'),
    ],
    'outputs': [_skill_output('output', 'labeled')],
}

# Knowledge store object projection that writes the labeled results, together
# with the source document path, to the "labeled-data" container.
labeled_data_projection = {
    'storageContainer': 'labeled-data',
    'referenceKeyName': None,
    'generatedKeyName': 'label-result',
    'source': None,
    'sourceContext': '/document/results',
    'inputs': [
        _skill_input('labeled-data', '/document/results/*/labeled'),
        _skill_input('document_id', '/document/metadata_storage_path'),
    ],
}

skillset_def = {
    'name': skillset_name,
    'description': 'km-aml solution accelerator-v1. A sample skillset to label and classify named entities',
    'skills': [
        markdown_skill,
        split_skill,
        label_skill,
        annotation_filter_skill,
        shaper_skill,
        sentence_filter_skill,
    ],
    'cognitiveServices': {
        '@odata.type': '#Microsoft.Azure.Search.CognitiveServicesByKey',
        'description': 'DemosCS',
        'key': cog_svcs_key,
    },
    'knowledgeStore': {
        'storageConnectionString': know_store_cache,
        'projections': [
            {
                'tables': [],
                'objects': [labeled_data_projection],
                'files': [],
            },
        ],
    },
}

# PUT the skillset under its name, then echo the HTTP response and its JSON body.
skillset_url = construct_Url(search_service, "skillsets", skillset_name, None, api_version)
r = requests.put(skillset_url, data=json.dumps(skillset_def), headers=headers)
print(r)
res = r.json()
print(json.dumps(res, indent=2))

## 4.0 Create Index

In [None]:
indexname = f'{datasource_container}-idx'


def _index_field(name, edm_type="Edm.String", *, searchable=False,
                 retrievable=False, key=False, analyzer=None):
    """Return one index field definition.

    Only the attributes that differ between fields in this index are
    parameters; every field is non-filterable, non-sortable and non-facetable,
    with no index/search analyzers and no synonym maps.
    """
    return {
        "name": name,
        "type": edm_type,
        "searchable": searchable,
        "filterable": False,
        "retrievable": retrievable,
        "sortable": False,
        "facetable": False,
        "key": key,
        "indexAnalyzer": None,
        "searchAnalyzer": None,
        "analyzer": analyzer,
        "synonymMaps": [],
    }


def _searchable_collection(name):
    """Return a searchable, retrievable string-collection field using standard.lucene."""
    return _index_field(
        name,
        "Collection(Edm.String)",
        searchable=True,
        retrievable=True,
        analyzer="standard.lucene",
    )


index_fields = [
    # Full document text.
    _index_field("content", searchable=True, retrievable=True, analyzer="standard.lucene"),
    # Blob metadata fields; stored but neither searchable nor retrievable.
    _index_field("metadata_storage_content_type"),
    _index_field("metadata_storage_size", "Edm.Int64"),
    _index_field("metadata_storage_last_modified", "Edm.DateTimeOffset"),
    _index_field("metadata_storage_content_md5"),
    _index_field("metadata_storage_name"),
    # Document key of the index.
    _index_field("metadata_storage_path", retrievable=True, key=True),
    _index_field("metadata_content_type"),
    _index_field("metadata_language"),
    _index_field("metadata_author"),
    _index_field("metadata_title"),
]
# Searchable string collections.
index_fields.extend(
    _searchable_collection(field_name)
    for field_name in ("entities", "products", "features", "text", "layoutText")
)

index_def = {
    "name": indexname,
    "defaultScoringProfile": "",
    "fields": index_fields,
    "scoringProfiles": [],
    "corsOptions": None,
    "suggesters": [],
    "analyzers": [],
    "tokenizers": [],
    "tokenFilters": [],
    "charFilters": [],
    "encryptionKey": None,
    "similarity": None,
}

# POST the definition to the indexes collection, then echo the HTTP response
# and its JSON body.
indexes_url = construct_Url(search_service, "indexes", None, None, api_version)
r = requests.post(indexes_url, data=json.dumps(index_def), headers=headers)
print(r)
res = r.json()
print(json.dumps(res, indent=2))

## 5.0 Create Indexer
Create the indexer. The indexer will automatically ingest all of the documents in your data source, run them through the enrichment pipeline, and then populate the search index.

In [None]:
indexername = f'{datasource_container}-idxr'

# Hourly schedule for the indexer.
indexer_schedule = {
    "interval": "PT1H",
    "startTime": "0001-01-01T00:00:00Z",
}

# Execution parameters: -1 for both failure limits, and extract content plus
# metadata (with normalized images) using the default parsing mode.
indexer_parameters = {
    "batchSize": None,
    "maxFailedItems": -1,
    "maxFailedItemsPerBatch": -1,
    "base64EncodeKeys": None,
    "configuration": {
        "dataToExtract": "contentAndMetadata",
        "parsingMode": "default",
        "imageAction": "generateNormalizedImages",
    },
}

# Map the blob path onto the index field of the same name, base64-encoded.
storage_path_mapping = {
    "sourceFieldName": "metadata_storage_path",
    "targetFieldName": "metadata_storage_path",
    "mappingFunction": {
        "name": "base64Encode",
        "parameters": None,
    },
}

indexer_def = {
    "name": indexername,
    "description": "km-aml solution accelerator-v1. Indexer version to label named entities",
    "dataSourceName": f'{datasource_container}-ds',
    "skillsetName": f'{datasource_container}-ss',
    "targetIndexName": f'{datasource_container}-idx',
    "disabled": None,
    "schedule": indexer_schedule,
    "parameters": indexer_parameters,
    "fieldMappings": [storage_path_mapping],
    "outputFieldMappings": [],
    # The enrichment cache uses the same storage account as the knowledge store.
    "cache": {
        "enableReprocessing": True,
        "storageConnectionString": know_store_cache,
    },
}

# POST the definition to the indexers collection, then echo the HTTP response
# and its JSON body.
indexers_url = construct_Url(search_service, "indexers", None, None, api_version)
r = requests.post(indexers_url, data=json.dumps(indexer_def), headers=headers)
print(r)
res = r.json()
print(json.dumps(res, indent=2))

## 6.0 Run the Indexer
Finally, run the indexer to generate the labeled data in the knowledge store.

In [None]:
# Request an on-demand run of the indexer created above.
run_url = construct_Url(search_service, "indexers", indexername, "run", api_version)
r = requests.post(run_url, data=None, headers=headers)
# Only the HTTP response object is printed; the body is intentionally not parsed here.
print(r)

## 7.0 Get the Indexer Status

### Indexer will complete with status warning

The warning message "Skill executed but may have unexpected results because one or more skill input was invalid." can be ignored. The message is generated when sentences without any matching labels are sent to the shaper skill.

In [None]:
# Fetch the indexer's current status and pretty-print the JSON response.
status_url = construct_Url(search_service, "indexers", indexername, "status", api_version)
r = requests.get(status_url, data=None, headers=headers)
print(r)
res = r.json()
print(json.dumps(res, indent=2))