# Aggregate Knowledge Store Documents

The documents projected to the knowledge store contain only the sentences that have a label. This notebook aggregates the sentences from all of those documents into a single document that will be loaded into AML as a dataset.

In [None]:
import json
import pandas as pd
from pandas.io.json import json_normalize 
import os
import base64
from posixpath import basename, dirname
from urllib.parse import urlparse
from azure.storage.blob import BlockBlobService
import sys

The storage container for the knowledge store is "labeled-data"; this is defined in the knowledgeStore skill. This script reads the data from "labeled-data" and generates a labeled dataset for training. You can configure how many sentences you need in total with the total_sentences variable below; sentences_per_file defines the maximum number of sentences per file. If both are the same, the script will generate a single file. Otherwise it will generate multiple files, which need to be aggregated.

In [None]:
# Credentials of the storage account that backs the knowledge store.
azure_storage_account_name = 'XXXXXXXXXXXXX'  # Knowledge store -- same as storage acct
azure_storage_account_key = 'XXXXXXXXXXXXX'

# Output sizing, both measured in sentences.
sentences_per_file = 100000
total_sentences = 100000

# NOTE(review): the placeholder strings above are not None, so this guard only
# trips when one of the two values is explicitly set to None.
if any(value is None for value in (azure_storage_account_name, azure_storage_account_key)):
    raise Exception("Provide your specific name and key for your Azure Storage account--see the Prerequisites section earlier.")

In [None]:
def transform_to_csv(file_name):
    """Flatten one knowledge-store JSON document into a token-per-row DataFrame.

    The JSON is read with ``labeled-data -> annotations`` as the record path
    and ``labeled-data.sentence_count`` plus ``document_id`` as metadata, so
    every annotation becomes one row.

    Args:
        file_name: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        A ``(df, lines)`` tuple. ``df`` holds one row per annotation with the
        ``token``/``label`` fields renamed to ``Word``/``Tag`` and a
        ``sentence`` column of the form ``"<document_id>-<sentence_count>"``.
        ``lines`` is the number of distinct sentences in the document. When
        the document has no annotations the result is ``(empty df, 0)``.
    """
    with open(file_name, encoding="utf8") as data_file:
        data = json.load(data_file)

    # pandas 1.0 exposed json_normalize at the top level and deprecated the
    # pandas.io.json import path; prefer the public name, fall back for older
    # installations.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    df = normalize(
        data,
        ['labeled-data', 'annotations'],
        [['labeled-data', 'sentence_count'], 'document_id'],
    )
    if df.empty:
        # Keep the first element a DataFrame so both return paths agree.
        return (df, 0)

    df = df.rename(columns={"token": "Word", "label": "Tag"})
    # Cast both parts to str so a numeric document_id cannot break the key.
    df['sentence'] = (
        df['document_id'].astype(str)
        + '-'
        + df['labeled-data.sentence_count'].astype(str)
    )
    df = df.drop(columns=['labeled-data.sentence_count', 'document_id'])
    lines = df['sentence'].nunique()
    return (df, lines)


In [None]:
def _upload_then_delete(blob_service, container, blob_name, local_path):
    """Upload ``local_path`` as ``blob_name`` and always delete the local copy."""
    try:
        blob_service.create_blob_from_path(container, blob_name, local_path)
    finally:
        # Remove the scratch file even when the upload raises.
        os.remove(local_path)


def save_file(df, file_path, file_name, file_num, blob_service, labelleddata_container):
    """Upload ``df`` to blob storage as a CSV blob and a "Word Tag" text blob.

    Two blobs are created in ``labelleddata_container``:
    ``<file_name><file_num>.csv`` (the frame without its index) and
    ``<file_name><file_num>.txt`` (one ``"<Word> <Tag>"`` line per row).
    Rows whose ``POS`` is ``"."`` are written as ``". O"`` followed by an
    empty line.

    Args:
        df: Frame with at least ``Word``, ``Tag`` and ``POS`` columns. It is
            not modified.
        file_path: Local scratch path, overwritten for each blob and removed
            after each upload.
        file_name: Blob base name.
        file_num: Number appended to ``file_name``.
        blob_service: Object exposing
            ``create_blob_from_path(container, blob_name, path)``.
        labelleddata_container: Name of the destination container.
    """
    blob_stem = file_name + str(file_num)

    # Overwrite rather than append, so a scratch file left behind by an
    # interrupted run cannot leak into this upload.
    df.to_csv(file_path, mode='w', index=False)
    _upload_then_delete(blob_service, labelleddata_container, blob_stem + ".csv", file_path)

    # Build the text lines on a separate Series so the caller's frame does not
    # gain a helper column.
    lines = df["Word"] + " " + df["Tag"]
    lines[df["POS"] == "."] = ". O\n"
    # The source JSON is read as UTF-8; write the same encoding instead of the
    # platform default.
    with open(file_path, "w", encoding="utf8") as f:
        f.write("\n".join(lines))
    _upload_then_delete(blob_service, labelleddata_container, blob_stem + ".txt", file_path)


In [None]:
# ---- Run configuration -------------------------------------------------
file_name = "ner_dataset"                   # base name of the generated blobs
file_path = "TempData"                      # local scratch file used for uploads
labelleddata_container = 'labeled-data-df'  # destination container
ks_container = 'labeled-data'               # knowledge-store container to read
local_json = 'sample.json'                  # local scratch copy of the current blob

blob_service = BlockBlobService(azure_storage_account_name, azure_storage_account_key)
container_status = blob_service.create_container(labelleddata_container)
blobs = blob_service.list_blobs(ks_container)

sentences = 0        # sentences collected over the whole run
docs = 0             # blobs visited
fileNum = 1          # number of the next output file
linesNum = 0         # sentences waiting in pending_frames
pending_frames = []  # per-document frames not yet uploaded
aggDF = pd.DataFrame()
sentenceID = 0       # last run-wide sentence id handed out


def _renumber_sentences(frame, last_id):
    """Replace per-document sentence keys with consecutive run-wide integers.

    Every run of consecutive rows sharing one key receives the next id after
    ``last_id``. The frame is updated in place; the last id used is returned.
    """
    ids = []
    previous = None
    for key in frame['sentence']:
        if key != previous:
            last_id += 1
            previous = key
        ids.append(last_id)
    frame['sentence'] = ids
    return last_id


for blob in blobs:
    docs += 1
    # Read blob to a temp file and use pandas to convert to shape needed to train the model
    blob_service.get_blob_to_path(ks_container, blob.name, local_json)
    (df, lines) = transform_to_csv(local_json)
    if lines == 0:
        continue  # nothing labeled in this document

    sentenceID = _renumber_sentences(df, sentenceID)
    pending_frames.append(df)
    linesNum += lines
    sentences += lines  # counted before any progress message so it is accurate

    if linesNum > sentences_per_file:
        # Concatenate once per output file instead of once per document.
        aggDF = pd.concat(pending_frames, ignore_index=True)
        save_file(aggDF, file_path, file_name, fileNum, blob_service, labelleddata_container)
        print(f'Succesfully generated labeled data with {sentences} sentences from {docs} documents')
        fileNum += 1
        pending_frames = []
        linesNum = 0

    if sentences > total_sentences:  # 100000 samples should suffice to train the model
        # Leave the loop instead of calling sys.exit so the remainder below is
        # still uploaded and the final summary is printed.
        print("quitting as total sentences collected  so far exceeds the limit")
        break

# Upload whatever is still pending. Checking the buffer itself (rather than
# comparing counters) never drops data and never uploads an empty frame.
if pending_frames:
    aggDF = pd.concat(pending_frames, ignore_index=True)
    save_file(aggDF, file_path, file_name, fileNum, blob_service, labelleddata_container)

# Do not leave the last downloaded document behind on local disk.
if os.path.exists(local_json):
    os.remove(local_json)

print(f'Succesfully generated labeled data with {sentences} sentences from {docs} documents')

### You now have a labeled dataset!

#### Validate Results




Check your storage account: you should now have a new container ```labeled-data-df``` with files named ```ner_dataset1.csv``` and ```ner_dataset1.txt```. You might have more than these if you decided to split the files. These files contain all the labeled sentences from all the documents processed. If you configured the script to generate multiple txt files, download all of them and aggregate them into one.
The CSV file has POS, Tag, Word and sentence columns. The text file has Word and Tag columns.

For the next step, download the ```ner_dataset1.txt``` file. 