In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.window import Window
from pyspark.sql.functions import col, dense_rank, lag, round, regexp_replace

# Explicit schema for the CSV input. Both columns are declared as nullable
# strings, so no numeric parsing happens at read time.
schema = (
    StructType()
    .add("Year", StringType(), True)
    .add("Total_Demand", StringType(), True)
    # Add more fields as per your CSV structure
)

# Read the CSV data (first line treated as a header) from the storage path,
# applying the schema above instead of inferring one.
df = spark.read.csv(
    "abfss://gold@rmpyru.dfs.core.windows.net",
    schema=schema,
    header=True,
)

# Drop exact duplicate rows and sort the frame chronologically.
df_cleaned = df.dropDuplicates().orderBy(col("Year").asc())

# Window spanning the whole dataset, ordered by Year, so that lag() reads the
# row of the preceding year.
# The window must NOT be partitioned by Year: every partition would then
# contain only rows of one year, lag() could never see a different year's row,
# and Previous_Demand would not hold the previous year's demand.
# NOTE: an unpartitioned window pulls all rows into a single partition; that is
# acceptable for a small per-year series but will not scale to large inputs.
# NOTE(review): Year is a string column, so the ordering is lexicographic --
# this assumes uniformly formatted (e.g. 4-digit) years; confirm.
window_spec = Window.orderBy(col("Year").asc())

# Previous year's Total_Demand; null for the first row of the ordered window.
df_with_lag = df_cleaned.withColumn(
    "Previous_Demand",
    lag("Total_Demand").over(window_spec),
)

# Convert the demand columns from text to DoubleType so they can be used in
# arithmetic: commas are stripped first, then the value is cast. The frame is
# displayed after each column is converted.
for demand_column in ("Total_Demand", "Previous_Demand"):
    without_commas = regexp_replace(col(demand_column), ",", "")
    df_with_lag = df_with_lag.withColumn(demand_column, without_commas.cast(DoubleType()))
    df_with_lag.show()
# Year-over-year change in percent, rounded to two decimals:
#   ((Total_Demand - Previous_Demand) / Previous_Demand) * 100
# `round` here is the Spark column function imported from
# pyspark.sql.functions, not the Python builtin.
current_demand = col("Total_Demand")
previous_demand = col("Previous_Demand")
yoy_percent = ((current_demand - previous_demand) / previous_demand) * 100

df_result = df_with_lag.withColumn("Change_YoY_Percent", round(yoy_percent, 2))

# Persist the result as a table, overwriting any existing contents.
df_result.write.saveAsTable("default.demand_forecast", mode="overwrite")

In [0]:
%sql
-- Return every row and column of the default.demand_forecast table.
SELECT *
FROM default.demand_forecast