In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, array, size, expr, lit, trim, explode, array_intersect, length, row_number, current_timestamp, expr
from pyspark.sql.window import Window
from sentinel_lake.providers import MicrosoftSentinelProvider
spark = SparkSession.builder.getOrCreate()
data_provider = MicrosoftSentinelProvider(spark)


StatementMeta(MSGSmall, 6, 7, Finished, Available, Finished)

In [7]:
# Reading from PurviewDataSensitityLogs from data lake. 
df = data_provider.read_table("PurviewDataSensitivityLogs","BOND-2-Log-Analytics")
df.show(5, False)

StatementMeta(MSGSmall, 6, 8, Finished, Available, Finished)

{"level": "INFO", "run_id": "8f5c14f2-c77b-456f-aee5-af7ffb314b96", "message": "Loading table: PurviewDataSensitivityLogs"}
{"level": "INFO", "run_id": "8f5c14f2-c77b-456f-aee5-af7ffb314b96", "message": "Successfully loaded table PurviewDataSensitivityLogs"}
+------------------------------------+--------------------------+------------------------------------+------------------+-------------+--------------------+----------------+---------------------------------------------------------------------------------------------------------------+------------------------------------+------------+--------------------+------------+---------+----------------------------------------------------------------------------+---------+-------------------+-------------------+--------------------------+-------------+--------+--------------+---------------------+-----------------------------+----------------------------------------------------------------------------------------------------------------------

In [8]:
# get the latest scan results for each asset path
window_spec = Window.partitionBy("AssetPath").orderBy(col("TimeGenerated").desc())
df_latest = df.withColumn("row_num", row_number().over(window_spec)) \
              .filter(col("row_num") == 1) \
              .select("TimeGenerated", "AssetPath", "Classification")
df_latest.show(20, False)

StatementMeta(MSGSmall, 1, 10, Finished, Available, Finished)

+--------------------------+--------------------------------------------------------------------------------+--------------+
|TimeGenerated [UTC]       |AssetPath                                                                       |Classification|
+--------------------------+--------------------------------------------------------------------------------+--------------+
|2025-07-30 20:58:19.054748|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights             |[]            |
|2025-07-30 20:58:20.523668|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#ADDRESS     |[]            |
|2025-07-30 20:58:20.529545|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#CITY        |[]            |
|2025-07-30 20:58:20.505511|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#EMAIL       |[]            |
|2025-07-30 20:58:20.440493|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#FIRSTNAME   |[]            |


StatementMeta(MSGSmall, 2, 5, Finished, Available, Finished)

+--------------------------+--------------------------------------------------------------------------------+--------------+
|TimeGenerated [UTC]       |AssetPath                                                                       |Classification|
+--------------------------+--------------------------------------------------------------------------------+--------------+
|2025-07-30 20:58:19.054748|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights             |[]            |
|2025-07-30 20:58:20.523668|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#ADDRESS     |[]            |
|2025-07-30 20:58:20.529545|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#CITY        |[]            |
|2025-07-30 20:58:20.505511|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#EMAIL       |[]            |
|2025-07-30 20:58:20.440493|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#FIRSTNAME   |[]            |


StatementMeta(MSGSmall, 6, 9, Finished, Available, Finished)

+--------------------------+--------------------------------------------------------------------------------+--------------+
|TimeGenerated [UTC]       |AssetPath                                                                       |Classification|
+--------------------------+--------------------------------------------------------------------------------+--------------+
|2025-07-30 20:58:19.054748|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights             |[]            |
|2025-07-30 20:58:20.523668|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#ADDRESS     |[]            |
|2025-07-30 20:58:20.529545|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#CITY        |[]            |
|2025-07-30 20:58:20.505511|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#EMAIL       |[]            |
|2025-07-30 20:58:20.440493|mssql://bondsql.database.windows.net/CustomerDB/dbo/CustomerFlights#FIRSTNAME   |[]            |


In [9]:
# Filters out rows without classifications, Gets the latest scan results for each asset path. Returns columns: TimeGenerated, AssetPath, Classification
df_latest = df.filter(
                (col("Classification").isNotNull()) & 
                (length(col("Classification")) > 2) &
                (col("TimeGenerated") >= expr("current_timestamp() - interval 1440 hours")) # updated this to desired period, e.g., 24 hours
              ) \
              .withColumn("row_num", row_number().over(window_spec)) \
              .filter(col("row_num") == 1) \
              .select("AssetPath", "Classification")


df_latest.show(20, False)

StatementMeta(MSGSmall, 1, 11, Finished, Available, Finished)

+--------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|AssetPath                                                                       |Classification                                                                                                                                                                                                                                                                           |
+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

StatementMeta(MSGSmall, 2, 6, Finished, Available, Finished)

+--------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|AssetPath                                                                       |Classification                                                                                                                                                                                                                                                                           |
+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

StatementMeta(MSGSmall, 6, 10, Finished, Available, Finished)

+--------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|AssetPath                                                                       |Classification                                                                                                                                                                                                                                                                           |
+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
def table_exists(data_provider, table_name, workspace=None):
    """
    Check if a table exists by attempting to read it and catching exceptions.
    """
    try:
        # Try to read just one row to minimize overhead
        if workspace:
            df_test = data_provider.read_table(table_name, workspace)
        else:
            df_test = data_provider.read_table(table_name)
        
        # Try to get the schema or count to ensure the table is accessible
        df_test.limit(1).collect()
        return True
    except Exception as e:
        # Log the error if needed
        print(f"Table '{table_name}' does not exist or is not accessible: {str(e)}")
        return False


    

StatementMeta(MSGSmall, 1, 12, Finished, Available, Finished)

StatementMeta(MSGSmall, 2, 7, Finished, Available, Finished)

StatementMeta(MSGSmall, 6, 11, Finished, Available, Finished)

In [20]:
# Check if baseline table exists
baseline_table_name = "PIIAssetPathTable_SPRK"
new_assets_table_name = "NewPIIAssets_SPRK"
workspace = "default"

if table_exists(data_provider, baseline_table_name, workspace):
    print(f"Baseline table '{baseline_table_name}' exists - checking for new assets")
    
    # Read existing baseline
    existing_baseline = data_provider.read_table(baseline_table_name, workspace)
    
    # Find new assets that don't exist in baseline (anti-join ensures uniqueness)
    new_assets = df_latest.join(existing_baseline, "AssetPath", "left_anti")
    
    new_asset_count = new_assets.count()
    
    if new_asset_count > 0:
        print(f"Found {new_asset_count} new assets")
        
        # Save new assets to NewPIIAssets table (overwrite to replace previous run's results)
        data_provider.save_as_table(
            new_assets,
            new_assets_table_name,
            write_options={'mode': 'overwrite'}
        )
        print(f"Saved new assets to '{new_assets_table_name}' table")
        
        # Append new assets to baseline table (much more efficient!)
        data_provider.save_as_table(
            new_assets,
            baseline_table_name,
            write_options={'mode': 'append'}
        )
        print(f"Appended {new_asset_count} new assets to baseline table '{baseline_table_name}'")
        
        # Show the new assets
        print("\nNew assets found:")
        new_assets.show(20, False)
        
    else:
        print("No new assets found - baseline is up to date")
        
        # Create/update NewPIIAssets table with empty DataFrame
        empty_new_assets = spark.createDataFrame([], df_latest.schema)
        data_provider.save_as_table(
            empty_new_assets,
            new_assets_table_name,
            write_options={'mode': 'overwrite'}
        )
        print(f"Updated '{new_assets_table_name}' table (no new assets)")

else:
    print(f"Baseline table '{baseline_table_name}' does not exist - creating new baseline")
    
    # Create new baseline table with all current assets
    data_provider.save_as_table(        
        df_latest,
        baseline_table_name,
        write_options={'mode': 'overwrite'}
    )
    print(f"Created baseline table '{baseline_table_name}' with {df_latest.count()} assets")
    
    # All assets are "new" since there was no baseline
    data_provider.save_as_table(
        df_latest,
        new_assets_table_name,
        write_options={'mode': 'overwrite'}
    )
    print(f"Created '{new_assets_table_name}' table with all {df_latest.count()} assets (first run)")

# Verify final state
print(f"\nFinal baseline table '{baseline_table_name}':")
final_baseline = data_provider.read_table(baseline_table_name, workspace)
print(f"Total assets in baseline: {final_baseline.count()}")

print(f"\nNew assets table '{new_assets_table_name}':")
final_new_assets = data_provider.read_table(new_assets_table_name, workspace)
print(f"New assets this run: {final_new_assets.count()}")
final_new_assets.show(10, False)

StatementMeta(MSGSmall, 2, 8, Finished, Available, Finished)

{"level": "INFO", "run_id": "6a249570-f8ac-4488-bc9e-686aa695f636", "message": "Loading table: PIIAssetPathTable_SPRK"}
{"level": "ERROR", "run_id": "6a249570-f8ac-4488-bc9e-686aa695f636", "message": "Error loading table PIIAssetPathTable_SPRK: Received Unauthorized status code with error {\"error\":{\"code\":\"Table_InsufficientPermissions\",\"message\":\"User does not have sufficient permissions to access table \\u0027PIIAssetPathTable_SPRK\\u0027 (permission: \\u0027r\\u0027).\",\"details\":[]}}"}
Table 'PIIAssetPathTable_SPRK' does not exist or is not accessible: Error loading table PIIAssetPathTable_SPRK: Received Unauthorized status code with error {"error":{"code":"Table_InsufficientPermissions","message":"User does not have sufficient permissions to access table \u0027PIIAssetPathTable_SPRK\u0027 (permission: \u0027r\u0027).","details":[]}}
Baseline table 'PIIAssetPathTable_SPRK' does not exist - creating new baseline
{"level": "INFO", "run_id": "6a249570-f8ac-4488-bc9e-686aa69

TableOperationException: Received Unauthorized status code with error {"error":{"code":"Table_InsufficientPermissions","message":"User does not have sufficient permissions to access table \u0027PIIAssetPathTable_SPRK\u0027 (permission: \u0027r\u0027).","details":[]}}