### Create dim_date data

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, lit, when

In [20]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("DimDateTable") \
    .getOrCreate()

25/03/25 16:42:57 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [21]:
# Create a DataFrame with a range of dates
start_date = '2000-01-01'
end_date = '2050-01-01'

In [22]:
date_range = spark.sql(f"""
    SELECT sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day) as date_range
""").selectExpr("explode(date_range) as date")

In [23]:
from pyspark.sql.functions import col, date_format, floor

# Extract date attributes with proper casting and use floor for integer division
dim_date_df = date_range \
    .withColumn('date_id', date_format(col('date'), 'yyyyMMdd')) \
    .withColumn('year', date_format(col('date'), 'yyyy')) \
    .withColumn('month', date_format(col('date'), 'MM').cast('int')) \
    .withColumn('day', date_format(col('date'), 'dd')) \
    .withColumn('quarter', (floor((col('month') - 1) / 3) + 1)) \
    .withColumn('weekday', date_format(col('date'), 'EEEE'))

# Show the DataFrame
dim_date_df.show()

+----------+--------+----+-----+---+-------+---------+
|      date| date_id|year|month|day|quarter|  weekday|
+----------+--------+----+-----+---+-------+---------+
|2000-01-01|20000101|2000|    1| 01|      1| Saturday|
|2000-01-02|20000102|2000|    1| 02|      1|   Sunday|
|2000-01-03|20000103|2000|    1| 03|      1|   Monday|
|2000-01-04|20000104|2000|    1| 04|      1|  Tuesday|
|2000-01-05|20000105|2000|    1| 05|      1|Wednesday|
|2000-01-06|20000106|2000|    1| 06|      1| Thursday|
|2000-01-07|20000107|2000|    1| 07|      1|   Friday|
|2000-01-08|20000108|2000|    1| 08|      1| Saturday|
|2000-01-09|20000109|2000|    1| 09|      1|   Sunday|
|2000-01-10|20000110|2000|    1| 10|      1|   Monday|
|2000-01-11|20000111|2000|    1| 11|      1|  Tuesday|
|2000-01-12|20000112|2000|    1| 12|      1|Wednesday|
|2000-01-13|20000113|2000|    1| 13|      1| Thursday|
|2000-01-14|20000114|2000|    1| 14|      1|   Friday|
|2000-01-15|20000115|2000|    1| 15|      1| Saturday|
|2000-01-1

### Save dim_date data in CSV File

In [24]:
# Save the DataFrame to a CSV file
dim_date_df.toPandas().to_csv("../../data/dim_date.csv", index=False)