In [6]:
from sentinel_lake.providers import MicrosoftSentinelProvider
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, LongType

# Provider handle used for the table listing, reads and writes in this notebook.
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably it is the
# session object injected by the notebook runtime — TODO confirm.
data_provider = MicrosoftSentinelProvider(spark)

def find_purview_logs_database(data_provider, target_table="PurviewDataSensitivityLogs"):
    """
    Locate the database that holds ``target_table``.

    Databases are examined in the order returned by ``data_provider.list_databases()``;
    the search stops at the first database whose table listing contains a table
    whose ``name`` equals ``target_table``.

    Args:
        data_provider: Object exposing ``list_databases()`` and ``list_tables(database)``.
            Every listed table must provide a ``name`` attribute.
        target_table (str): Table name to search for. Defaults to 'PurviewDataSensitivityLogs'.

    Returns:
        str | None: The matching entry of ``list_databases()``, or None when no
        database contains the table.
    """
    def _holds_target(candidate):
        # Gather all table names of this database first, then test membership.
        return target_table in {entry.name for entry in data_provider.list_tables(candidate)}

    return next(
        (candidate for candidate in data_provider.list_databases() if _holds_target(candidate)),
        None,
    )

# Name of the Purview log table, and the database (workspace) found to contain it.
table_name = 'PurviewDataSensitivityLogs'
workspace_name = find_purview_logs_database(data_provider, target_table=table_name)

# Report the outcome of the lookup; a falsy result means no database listed the table.
if not workspace_name:
    print("Table not found in any database.")
else:
    print(f"Found '{workspace_name}' containing the table.")


StatementMeta(MSGSmall, 21, 7, Finished, Available, Finished)

Found 'BOND-2-Log-Analytics' containing the table.


In [11]:
# Source rows: the Purview log table read from the workspace located by the lookup cell.
data = data_provider.read_table(table_name, workspace_name)

# Target table
unique_purview_table = "UniquePurviewTable_SPRK"

# Delete existing table if needed (uncomment to drop the target before re-populating it)
# try:
#     data_provider.delete_table(unique_purview_table, "default")
# except Exception as e:
#     print(f"Error deleting existing table: {e}")

# Read existing table if present.
# Best effort: a failed read is treated as "no existing table", which makes the later
# dedup step a no-op. The error is printed instead of being swallowed silently, so a
# transient read failure (which would lead to duplicate rows being appended) is visible.
try:
    existing_table = data_provider.read_table(unique_purview_table, "default")
except Exception as read_error:
    print(
        f"Could not read existing table '{unique_purview_table}' "
        f"(treating it as absent): {read_error}"
    )
    existing_table = None

# Window for row_number per AssetPath (latest per asset):
# rows of one AssetPath are ranked newest-first by TimeGenerated.
window_spec = Window.partitionBy("AssetPath").orderBy(F.col("TimeGenerated").desc())

# UDF helper: stringify and sort the elements of a (possibly missing) iterable
def sort_array_safe(arr):
    """Return the elements of ``arr`` converted to ``str`` and sorted ascending.

    ``None`` produces an empty list. Any other value is iterated element by
    element, so a plain string input is sorted character by character.
    """
    if arr is not None:
        as_text = list(map(str, arr))
        as_text.sort()
        return as_text
    return []

# NOTE: Classification is stored as a string in the table, so this UDF effectively sorts
# the characters of that string rather than the elements of a real array.
sort_array_udf = F.udf(sort_array_safe, ArrayType(StringType()))

# --- Row-level validity predicates ---------------------------------------------------
has_classification = (
    (F.col("Classification").isNotNull())
    & (~F.col("Classification").isin("", "[]"))   # empty arrays are stored as strings
)
has_details = (
    (F.col("ClassificationDetails").isNotNull())
    & (F.length(F.col("ClassificationDetails")) > 2)
)
# 1440 hours = 60 days look-back from the moment the query runs.
is_recent = F.col("TimeGenerated") >= F.expr("current_timestamp() - interval 1440 hours")
is_classification_event = F.col("ActivityType") == "Classification"
is_supported_asset = F.col("AssetType").isin("File", "Table")

# --- Keep only the newest valid row per AssetPath, skipping paths that contain '#' ----
latest_per_asset = (
    data.filter(
        has_classification
        & has_details
        & is_recent
        & is_classification_event
        & is_supported_asset
    )
    .withColumn("row_num", F.row_number().over(window_spec))
    .filter(F.col("row_num") == 1)
    .filter(~F.col("AssetPath").contains("#"))
)

# --- Deterministic row id: SHA-256 over "<AssetPath>|<sorted classification, comma-joined>"
fingerprint_input = F.concat_ws(
    "|",
    F.col("AssetPath"),
    F.concat_ws(",", F.col("Classification_sorted")),
)

# Use the real ItOwner column when the source has one, otherwise a placeholder literal.
if "ItOwner" in data.columns:
    it_owner_column = F.col("ItOwner")
else:
    # TODO: Set to null if not present
    it_owner_column = F.lit("delete@placeholder.com").alias("ItOwner")

distinct_records = (
    latest_per_asset
    .withColumn("Classification_sorted", sort_array_udf(F.col("Classification")))
    .withColumn("id", F.sha2(fingerprint_input, 256))
    .select(
        "AssetPath",
        "Classification",
        "ClassificationDetails",
        "TimeGenerated",
        "id",
        F.col("AssetPath").alias("ExternalID"),
        "SourceName",
        "SourceType",
        "SourceRegion",
        "AssetName",
        "AssetType",
        it_owner_column,
    )
)

# Remove duplicates already in the target table.
# Compare against None explicitly: "was the table read?" is the question here, and the
# truth value of a DataFrame object does not answer it.
if existing_table is not None:
    distinct_records = distinct_records.join(
        existing_table.select("id").alias("existing"),
        on="id",
        how="left_anti"
    )

# Order by ascending TimeGenerated, then cache: show(), count() and the save below are
# three separate actions, and without caching each one would re-run the whole plan
# (window, Python UDF, current_timestamp() cutoff, anti-join), so the printed count
# could differ from the rows actually written.
distinct_records = distinct_records.orderBy(F.col("TimeGenerated").asc()).cache()

try:
    # Show sample
    distinct_records.show(10, False)

    # Save records
    record_count = distinct_records.count()
    print("new_record_count:", record_count)

    write_options = {'mode': 'append'}
    if record_count > 0:
        data_provider.save_as_table(distinct_records, unique_purview_table, 'default', write_options)
        print(f"Appended {record_count} new records to {unique_purview_table}.")
    else:
        print("No new records to append. Table is already up-to-date.")
finally:
    # Release the cached rows whether or not the write succeeded.
    distinct_records.unpersist()


StatementMeta(MSGSmall, 21, 12, Finished, Available, Finished)

{"level": "INFO", "run_id": "635a30be-123e-456d-bee8-3a5d0a321536", "message": "Loading table: PurviewDataSensitivityLogs"}
{"level": "INFO", "run_id": "635a30be-123e-456d-bee8-3a5d0a321536", "message": "Successfully loaded table PurviewDataSensitivityLogs"}
{"level": "INFO", "run_id": "635a30be-123e-456d-bee8-3a5d0a321536", "message": "Loading table: UniquePurviewTable_SPRK"}
{"level": "INFO", "run_id": "635a30be-123e-456d-bee8-3a5d0a321536", "message": "Successfully loaded table UniquePurviewTable_SPRK"}
+---+---------+--------------+---------------------+-------------------+----------+----------+----------+------------+---------+---------+-------+
|id |AssetPath|Classification|ClassificationDetails|TimeGenerated [UTC]|ExternalID|SourceName|SourceType|SourceRegion|AssetName|AssetType|ItOwner|
+---+---------+--------------+---------------------+-------------------+----------+----------+----------+------------+---------+---------+-------+
+---+---------+--------------+-----------------

In [None]:
# Reload the target table and print its first 20 rows in ascending TimeGenerated order,
# without truncating column values.
df = data_provider.read_table(unique_purview_table).orderBy(F.col("TimeGenerated").asc())
df.show(n=20, truncate=False)

StatementMeta(MSGSmall, 21, 13, Finished, Available, Finished)

{"level": "INFO", "run_id": "635a30be-123e-456d-bee8-3a5d0a321536", "message": "Loading table: UniquePurviewTable_SPRK"}
{"level": "INFO", "run_id": "635a30be-123e-456d-bee8-3a5d0a321536", "message": "Successfully loaded table UniquePurviewTable_SPRK"}
+----------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------