In [None]:
from sentinel_lake.providers import MicrosoftSentinelProvider
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, LongType

# Source: Purview sensitivity logs read from the named Log Analytics workspace.
data_provider = MicrosoftSentinelProvider(spark)
table_name = 'PurviewDataSensitivityLogs'
workspace_name = 'BOND-2-Log-Analytics'
data = data_provider.read_table(table_name, workspace_name)

# Target table
unique_purview_table = "UniquePurviewTable_SPRK"

# Delete existing table if needed
# try:
#     data_provider.delete_table(unique_purview_table, "default")
# except Exception as e:
#     print(f"Error deleting existing table: {e}")

# Read existing table if present.
# This stays best-effort (a failed read is treated as "no existing table"), but the
# error is now reported instead of being swallowed: when existing_table is None the
# code further down skips the de-duplication join and starts RowID from 0, so a
# read that failed for any reason other than "table missing" must be visible.
try:
    existing_table = data_provider.read_table(unique_purview_table, "default")
except Exception as e:
    print(f"Could not read existing table {unique_purview_table}; treating it as absent: {e}")
    existing_table = None

# Window for row_number per AssetPath (latest per asset): rows are numbered within
# each AssetPath from newest to oldest TimeGenerated.
window_spec = Window.partitionBy("AssetPath").orderBy(F.col("TimeGenerated").desc())

# UDF to sort array safely
def sort_array_safe(arr):
    """Return the elements of ``arr`` as strings, sorted ascending.

    Args:
        arr: Any iterable, or ``None``.

    Returns:
        A new list of ``str`` values in sorted order; an empty list when
        ``arr`` is ``None``.

    NOTE(review): a ``str`` input is iterated character by character, so the
    result is its sorted characters. Confirm the column passed to this UDF
    is a real array and not a JSON-encoded string.
    """
    if arr is None:
        return []
    as_text = list(map(str, arr))
    as_text.sort()
    return as_text

# Expose the Python sorter to Spark as a UDF returning array<string>.
sort_array_udf = F.udf(sort_array_safe, ArrayType(StringType()))

# Row-level validity predicate; every condition must hold.
valid_record = (
    F.col("Classification").isNotNull()
    & ~F.col("Classification").isin("", "[]")           # remove empty arrays stored as strings
    & F.col("ClassificationDetails").isNotNull()
    & (F.length(F.col("ClassificationDetails")) > 2)
    # 1440 hours = 60 days look-back from the time the query runs.
    & (F.col("TimeGenerated") >= F.expr("current_timestamp() - interval 1440 hours"))
    & (F.col("ActivityType") == "Classification")
    & F.col("AssetType").isin("File", "Table")
)

# Use the source ItOwner column when it exists; otherwise fill a placeholder literal.
if "ItOwner" in data.columns:
    it_owner_col = F.col("ItOwner")
else:
    it_owner_col = F.lit("none@na.com").alias("ItOwner")

# Keep only the newest row per AssetPath, then drop asset paths containing '#'.
latest_per_asset = (
    data.filter(valid_record)
    .withColumn("row_num", F.row_number().over(window_spec))
    .filter(F.col("row_num") == 1)
    .filter(~F.col("AssetPath").contains("#"))
)

# id = SHA-256 over "<AssetPath>|<comma-joined sorted classification>".
# NOTE(review): Classification is compared against string literals above yet is
# handed to a UDF that iterates it like an array — confirm the column's type.
classification_key = F.concat_ws(",", F.col("Classification_sorted"))
record_id = F.sha2(F.concat_ws("|", F.col("AssetPath"), classification_key), 256)

distinct_records = (
    latest_per_asset
    .withColumn("Classification_sorted", sort_array_udf(F.col("Classification")))
    .withColumn("id", record_id)
    .select(
        "AssetPath",
        "Classification",
        "ClassificationDetails",
        "TimeGenerated",
        "id",
        F.col("AssetPath").alias("ExternalID"),
        "SourceName",
        "SourceType",
        "SourceRegion",
        "AssetName",
        "AssetType",
        it_owner_col,
    )
)

# Remove duplicates already in the target table.
# The check is an explicit `is not None` (matching the RowID check below) rather
# than relying on the truthiness of a DataFrame object.
if existing_table is not None:
    distinct_records = distinct_records.join(
        existing_table.select("id").alias("existing"),
        on="id",
        how="left_anti"
    )

# Determine last RowID (0 when there is no table, no RowID column, or no rows).
if existing_table is not None and "RowID" in existing_table.columns:
    last_rowid = existing_table.agg(F.max("RowID").alias("max_rowid")).collect()[0]["max_rowid"] or 0
else:
    last_rowid = 0

# Assign sequential RowID that continues from last saved record.
# "id" is added as a secondary sort key so rows sharing a TimeGenerated value get
# the same RowID every time this DataFrame is evaluated; ordering by TimeGenerated
# alone leaves the numbering of ties arbitrary.
# NOTE: a window without partitionBy numbers all rows in a single partition.
rowid_window = Window.orderBy(F.col("TimeGenerated").asc(), F.col("id").asc())
distinct_records = distinct_records.withColumn(
    "RowID",
    (F.row_number().over(rowid_window) + F.lit(last_rowid)).cast(LongType())
)

# The sample, the count and the write are three separate Spark actions. Without
# caching, each one re-runs the whole pipeline (including the current_timestamp()
# cut-off), so the printed count could differ from what is actually written.
# Cache once and release the cache when done.
distinct_records.cache()
try:
    # Count first: this fully evaluates the DataFrame and populates the cache
    # that the sample and the write below reuse.
    record_count = distinct_records.count()

    # Show sample
    distinct_records.show(10, False)

    # Save records
    print("new_record_count:", record_count)

    write_options = {'mode': 'append'}
    if record_count > 0:
        data_provider.save_as_table(distinct_records, unique_purview_table, 'default', write_options)
        print(f"Appended {record_count} new records to {unique_purview_table}.")
    else:
        print("No new records to append. Table is already up-to-date.")
finally:
    distinct_records.unpersist()


StatementMeta(MSGSmall, 12, 32, Finished, Available, Finished)

{"level": "INFO", "run_id": "ee2d2a1d-f9ee-4d4d-9ab4-cc15b15df632", "message": "Loading table: PurviewDataSensitivityLogs"}
{"level": "INFO", "run_id": "ee2d2a1d-f9ee-4d4d-9ab4-cc15b15df632", "message": "Successfully loaded table PurviewDataSensitivityLogs"}
{"level": "INFO", "run_id": "ee2d2a1d-f9ee-4d4d-9ab4-cc15b15df632", "message": "Loading table: UniquePurviewTable_SPRK"}
{"level": "INFO", "run_id": "ee2d2a1d-f9ee-4d4d-9ab4-cc15b15df632", "message": "Successfully loaded table UniquePurviewTable_SPRK"}
+---+---------+--------------+---------------------+-------------------+----------+----------+----------+------------+---------+---------+-------+-----+
|id |AssetPath|Classification|ClassificationDetails|TimeGenerated [UTC]|ExternalID|SourceName|SourceType|SourceRegion|AssetName|AssetType|ItOwner|RowID|
+---+---------+--------------+---------------------+-------------------+----------+----------+----------+------------+---------+---------+-------+-----+
+---+---------+--------------

In [None]:
# Inspect the contents of the target table.
# Relies on data_provider and unique_purview_table defined in the previous cell.
# NOTE(review): workspace_name is assigned here but is not passed to read_table
# below — confirm which workspace this read is meant to target (the previous cell
# reads the same table with "default" as the second argument).
workspace_name = 'BOND-2-Log-Analytics' # to read from a specific workspace
df = data_provider.read_table(unique_purview_table)
# Print up to 100 rows with column values left untruncated.
df.show(100, False)

StatementMeta(MSGSmall, 12, 31, Finished, Available, Finished)

{"level": "INFO", "run_id": "ee2d2a1d-f9ee-4d4d-9ab4-cc15b15df632", "message": "Loading table: UniquePurviewTable_SPRK"}
{"level": "INFO", "run_id": "ee2d2a1d-f9ee-4d4d-9ab4-cc15b15df632", "message": "Successfully loaded table UniquePurviewTable_SPRK"}
+----------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------