In [0]:
%python
# ====================
# 1. Set up credentials
# ====================
# The storage account key is fetched from a Databricks secret scope at run
# time instead of being embedded in the notebook, so it does not end up in
# source control, notebook exports, or shared revisions.
# TODO(owner): the scope and key names below are placeholders -- point them at
# the secret that holds the mmixstorage account key. The key that used to be
# hard-coded here must be treated as exposed and rotated.
storage_account_key = dbutils.secrets.get(
    scope="mmix-storage",
    key="mmixstorage-account-key",
)
spark.conf.set(
    "fs.azure.account.key.mmixstorage.blob.core.windows.net",
    storage_account_key,
)

# Set legacy time parser policy to handle 'MM/dd/yyyy' format
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# ====================
# 2. Get notebook parameters
# ====================
# Declare the three text widgets (each defaulting to an empty string) first,
# then read back whatever value the caller or job supplied for each of them.
for _param in ("input_path", "output_path", "date_col"):
    dbutils.widgets.text(_param, "")

input_path, output_path, date_col = (
    dbutils.widgets.get(_param)
    for _param in ("input_path", "output_path", "date_col")
)

base_blob_url = "wasbs://pre-processing@mmixstorage.blob.core.windows.net"


def convert_to_wasbs(url):
    """Convert an HTTPS blob URL for the pre-processing container to a WASBS path.

    Only a leading ``https://mmixstorage.blob.core.windows.net/pre-processing/``
    is rewritten. The same text appearing later in the string (for example
    inside a blob name) is left alone.

    Args:
        url: HTTPS blob URL, or any other path string.

    Returns:
        The equivalent ``wasbs://`` path when ``url`` starts with the HTTPS
        prefix; otherwise ``url`` unchanged (including the empty string).
    """
    https_prefix = "https://mmixstorage.blob.core.windows.net/pre-processing/"
    if not url.startswith(https_prefix):
        return url
    # Keep everything after the prefix verbatim and re-root it under WASBS.
    return f"{base_blob_url}/{url[len(https_prefix):]}"

# Normalize both locations to WASBS paths before handing them to Spark.
input_path, output_path = (
    convert_to_wasbs(input_path),
    convert_to_wasbs(output_path),
)

# ====================
# 3. Read input CSV
# ====================
# The first line is treated as the header and column types are inferred
# from the data.
df = spark.read.csv(input_path, header=True, inferSchema=True)

# Preview the first 20 rows in the notebook output.
df.show(n=20)




# ====================
# 4. Define granularity detection function
# ====================
from pyspark.sql.functions import col, to_date, datediff, lag
from pyspark.sql.window import Window

def detect_date_granularity_spark(df_spark, date_column):
    """Return the most common gap, in days, between consecutive distinct dates.

    The column is first normalized to calendar days: date and timestamp
    columns go through ``to_date``; string columns are tried as ISO
    ``yyyy-MM-dd`` text and then as ``MM/dd/yyyy``. Nulls and values that
    cannot be parsed are dropped, duplicates are removed, and the day
    difference between each date and its predecessor is computed.

    Args:
        df_spark: Spark DataFrame containing ``date_column``.
        date_column: Name of the column holding the dates.

    Returns:
        The modal day difference as an int, or None when there are fewer than
        two distinct usable dates. When several gaps are equally frequent the
        smallest one is returned, so the result is deterministic.
    """
    # Function-scope import: the file-level import list does not include it.
    from pyspark.sql.functions import coalesce

    raw = col(date_column)
    if dict(df_spark.dtypes).get(date_column) == "string":
        # A CSV column of 'MM/dd/yyyy' text stays a string; the implicit
        # string-to-date cast used by datediff only understands ISO-style
        # text, so every gap would come out NULL without explicit parsing.
        # NOTE(review): with spark.sql.ansi.enabled the ISO attempt may raise
        # on non-ISO text instead of yielding NULL -- confirm cluster setting.
        day = coalesce(to_date(raw), to_date(raw, "MM/dd/yyyy"))
    else:
        # Timestamps are truncated to their day so that several readings on
        # the same date do not show up as 0-day gaps.
        day = to_date(raw)

    distinct_days = (
        df_spark.select(day.alias("_day"))
        .dropna(subset=["_day"])
        .dropDuplicates(["_day"])
    )

    # Diagnostic preview of the earliest normalized dates.
    distinct_days.orderBy("_day").show(10)

    # No partitionBy: the whole column is one ordered sequence. Spark pulls
    # the rows into a single partition for this, which is acceptable because
    # only the distinct dates remain at this point.
    by_day = Window.orderBy("_day")
    gaps = distinct_days.select(
        datediff(col("_day"), lag("_day").over(by_day)).alias("date_diff")
    ).filter(col("date_diff").isNotNull())

    # Secondary sort on date_diff makes the choice stable when counts tie.
    mode_rows = (
        gaps.groupBy("date_diff")
        .count()
        .orderBy(col("count").desc(), col("date_diff").asc())
        .limit(1)
        .collect()
    )

    if not mode_rows:
        return None  # Fewer than two distinct dates: no gap to measure

    top = mode_rows[0]
    # Report from the collected row rather than re-running the aggregation.
    print(f"Most common date_diff: {top['date_diff']} (count={top['count']})")
    return top["date_diff"]
    


# ====================
# 5. Detect granularity
# ====================
most_common_diff = detect_date_granularity_spark(df, date_col)

# ====================
# 6. Determine granularity
# ====================
# Translate the modal gap (in days) into a label. The checks run top to
# bottom; a missing gap is handled first so the numeric comparisons never
# see None.
granularity = (
    "Insufficient Data" if most_common_diff is None
    else "Daily" if most_common_diff == 1
    else "Weekly" if most_common_diff == 7
    else "Monthly" if most_common_diff in (28, 29, 30, 31)
    else "Yearly" if most_common_diff >= 365
    else "Irregular"
)

# ====================
# 7. Write result to output path as a single-row CSV
# ====================
# One row, one column named "granularity". Any existing output at the
# destination is overwritten and a header line is emitted.
result_df = spark.createDataFrame(data=[(granularity,)], schema=["granularity"])
result_df.write.csv(output_path, mode="overwrite", header=True)