In [7]:
import json
from collections import defaultdict

def _ordered_entity_types(item_entities):
    """Return the entity types of one receipt item, ordered by start index."""
    # sorted() is stable, so the RECEIPT_ITEM itself (always first in the
    # list) stays ahead of a child entity that begins at the same index.
    ordered = sorted(item_entities, key=lambda span: span[0])
    return tuple(label for _, _, label in ordered)


def extract_entity_order(receipt):
    """
    Extract the order of entity types under each 'RECEIPT_ITEM' of a receipt.

    Entities are read in list order. Every 'RECEIPT_ITEM' opens a new group;
    each following entity is attached to that group only if its span lies
    inside the item's span (item_start <= start and end <= item_end).
    Entities that appear before the first 'RECEIPT_ITEM', or that fall
    outside the current item's span, are ignored.

    Args:
        receipt: Mapping with an "entities" key holding an iterable of
            (start, end, entity_type) triples.

    Returns:
        A list with one tuple per 'RECEIPT_ITEM', in order of appearance.
        Each tuple holds the entity types of the item and its children,
        sorted by start index.
    """
    orders = []
    current_item = []  # Entities collected for the RECEIPT_ITEM in progress
    item_start = item_end = None

    for start, end, entity_type in receipt["entities"]:
        if entity_type == "RECEIPT_ITEM":
            # A new RECEIPT_ITEM closes the previous one.
            if current_item:
                orders.append(_ordered_entity_types(current_item))

            current_item = [(start, end, entity_type)]
            item_start, item_end = start, end
        elif item_start is not None and item_start <= start and end <= item_end:
            # Check both bounds: testing only the start would also attach
            # unrelated entities located after the item's end.
            current_item.append((start, end, entity_type))

    # Flush the last RECEIPT_ITEM, if any.
    if current_item:
        orders.append(_ordered_entity_types(current_item))

    return orders

def analyze_receipts(json_data):
    """
    Tally how often each entity sequence occurs across all receipts.

    Every receipt in json_data is passed to extract_entity_order, and each
    sequence it returns increments that sequence's counter. The result is a
    defaultdict(int) keyed by the sequence tuples.
    """
    tally = defaultdict(int)

    # Flatten the per-receipt sequences into a single lazy stream.
    sequences = (
        sequence
        for record in json_data
        for sequence in extract_entity_order(record)
    )
    for sequence in sequences:
        tally[sequence] += 1

    return tally

def main():
    """
    Load the receipts dataset, count entity sequences, and print the counts.

    Reads '../datasets/ner-swiss-receipts.json' (relative to the current
    working directory). If the file is missing or does not contain valid
    JSON, an explanatory message is printed and the function returns early.
    """
    # Read the JSON data from a file
    try:
        # Decode explicitly as UTF-8 rather than relying on the platform
        # default encoding, which is not UTF-8 on every system.
        with open('../datasets/ner-swiss-receipts.json', 'r', encoding='utf-8') as file:
            json_data = json.load(file)
    except FileNotFoundError:
        print("The file was not found. Please check the file path and name.")
        return
    except json.JSONDecodeError:
        print("The file contains invalid JSON. Please check the file content.")
        return

    # Analyze the receipts
    entity_order_count = analyze_receipts(json_data)

    # Print the results
    print("Entity order occurrences:")
    for order, count in entity_order_count.items():
        print(f"{order}: {count}")

# Run the analysis only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Entity order occurrences:
('RECEIPT_ITEM', 'RECEIPT_ITEM_NAME', 'RECEIPT_ITEM_PRICE'): 116
('RECEIPT_ITEM', 'RECEIPT_ITEM_QUANTITY', 'RECEIPT_ITEM_NAME', 'RECEIPT_ITEM_PRICE'): 268
('RECEIPT_ITEM', 'RECEIPT_ITEM_NAME', 'RECEIPT_ITEM_QUANTITY', 'RECEIPT_ITEM_PRICE'): 12
('RECEIPT_ITEM', 'RECEIPT_ITEM_NAME'): 4
('RECEIPT_ITEM', 'RECEIPT_ITEM_QUANTITY', 'RECEIPT_ITEM_PRICE', 'RECEIPT_ITEM_NAME'): 6
('RECEIPT_ITEM', 'RECEIPT_ITEM_PRICE', 'RECEIPT_ITEM_NAME'): 8
('RECEIPT_ITEM', 'RECEIPT_ITEM_NAME', 'RECEIPT_ITEM_PRICE', 'RECEIPT_ITEM_QUANTITY'): 8
('RECEIPT_ITEM', 'RECEIPT_ITEM_QUANTITY', 'RECEIPT_ITEM_NAME'): 1
('RECEIPT_ITEM', 'RECEIPT_ITEM_QUANTITY', 'RECEIPT_ITEM_NAME', 'RECEIPT_ITEM_PRICE', 'RECEIPT_ITEM_PRICE'): 11
('RECEIPT_ITEM', 'RECEIPT_ITEM_NAME', 'RECEIPT_ITEM_PRICE', 'RECEIPT_ITEM_PRICE'): 1
