In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window

### Data Reading

In [0]:
# Read the raw orders dataset (Parquet) from the bronze container.
# NOTE(review): the path has a double slash before 'orders' — confirm it is intentional.
df = spark.read.load(
    'abfss://bronze@databricksstorageete.dfs.core.windows.net//orders',
    format='parquet',
)

In [0]:
# Render the raw orders DataFrame in the notebook output.
df.display()

Databricks data profile. Run in Databricks to view.

In [0]:
# Print the column names and types of the raw orders DataFrame.
df.printSchema()

In [0]:
# Derive calendar columns and per-year sales rankings from the raw orders.
# Both ranking columns share one window spec: rows are partitioned by order
# year and sorted from the highest total_amount to the lowest.
yearly_sales_window = Window.partitionBy('year').orderBy(F.desc('total_amount'))

transformed_df = (
    df
    .withColumn('order_date', F.to_timestamp('order_date'))
    .withColumn('year', F.year('order_date'))
    # Fix: 'month' was previously computed with F.year, which duplicated 'year'.
    .withColumn('month', F.month('order_date'))
    # dense_rank leaves no gaps after ties; rank skips positions after ties.
    .withColumn('sales_dense_rank_by_year', F.dense_rank().over(yearly_sales_window))
    .withColumn('sales_rank_by_year', F.rank().over(yearly_sales_window))
)
transformed_df.display()

In [0]:
class MultiWindowColCreator:
    """Adds per-year sales ranking columns to a DataFrame.

    Each method partitions rows by the ``year`` column, orders them by
    ``total_amount`` descending, and returns the result of ``df.withColumn``.
    """

    def dense_rank(self, df):
        """Return ``df`` with ``sales_dense_rank_by_year`` computed by ``F.dense_rank``."""
        top_sales_first_per_year = Window.partitionBy('year').orderBy(F.desc('total_amount'))
        ranking = F.dense_rank().over(top_sales_first_per_year)
        return df.withColumn('sales_dense_rank_by_year', ranking)

    def rank(self, df):
        """Return ``df`` with ``sales_rank_by_year`` computed by ``F.rank``."""
        top_sales_first_per_year = Window.partitionBy('year').orderBy(F.desc('total_amount'))
        ranking = F.rank().over(top_sales_first_per_year)
        return df.withColumn('sales_rank_by_year', ranking)


In [0]:
# Apply each ranking helper to the transformed orders and render the result.
# transformed_df already has columns with these names, so withColumn replaces
# them. After the loop, df_new holds the rank() result, as before.
for df_new in (
    MultiWindowColCreator().dense_rank(transformed_df),
    MultiWindowColCreator().rank(transformed_df),
):
    df_new.display()

### Data Writing

In [0]:
# Write the transformed orders in Delta format to the silver container,
# overwriting any data already stored at that path.
transformed_df.write.save(
    'abfss://silver@databricksstorageete.dfs.core.windows.net/orders',
    format='delta',
    mode='overwrite',
)

In [0]:
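# Disabled cleanup helper: uncommenting the line below would recursively delete the silver orders path.
# dbutils.fs.rm("abfss://silver@databricksstorageete.dfs.core.windows.net/orders", recurse=True)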

In [0]:
%sql
-- Register a Delta table in the catalog over the silver orders location.
-- IF NOT EXISTS leaves an already-registered table untouched.
CREATE TABLE IF NOT EXISTS databricks_cata.silver.orders
USING DELTA
LOCATION 'abfss://silver@databricksstorageete.dfs.core.windows.net/orders';

In [0]:
%sql
-- Query the registered silver orders table to check its contents.
SELECT * FROM databricks_cata.silver.orders;