## Part 2: Advanced Date Functions in PySpark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, current_timestamp, date_add, date_sub, datediff, months_between

# Start a Spark session for this notebook (getOrCreate reuses an active one)
spark = (
    SparkSession.builder
    .appName("PySparkDateFunctions")
    .getOrCreate()
)

# Sample rows: (ID, date written as a yyyy-MM-dd string)
data = [
    (1, "2024-01-01"),
    (2, "2023-06-15"),
    (3, "2022-12-31"),
]
columns = ["ID", "Date"]

# Build the DataFrame, then cast the string "Date" column to a date type
# so the date functions used in later cells operate on real dates.
df = (
    spark.createDataFrame(data, columns)
    .withColumn("Date", col("Date").cast("date"))
)

# Display the result
df.show()

StatementMeta(, e65d5013-c34b-4f87-ac91-deb7889b84cb, 4, Finished, Available, Finished)

+---+----------+
| ID|      Date|
+---+----------+
|  1|2024-01-01|
|  2|2023-06-15|
|  3|2022-12-31|
+---+----------+



In [3]:
from pyspark.sql.functions import year, month, dayofmonth, dayofweek, weekofyear, date_format

# Extract calendar components from "Date".
# Each entry pairs an output column name with the function that derives it.
# Note: dayofweek numbers days starting from Sunday = 1 (so Monday is 2).
date_part_extractors = [
    ("Year", year),
    ("Month", month),
    ("DayOfMonth", dayofmonth),
    ("DayOfWeek", dayofweek),
    ("WeekOfYear", weekofyear),
]

df_extracted = df.select(
    col("Date"),
    *[extract(col("Date")).alias(label) for label, extract in date_part_extractors]
)
df_extracted.show()

StatementMeta(, e65d5013-c34b-4f87-ac91-deb7889b84cb, 5, Finished, Available, Finished)

+----------+----+-----+----------+---------+----------+
|      Date|Year|Month|DayOfMonth|DayOfWeek|WeekOfYear|
+----------+----+-----+----------+---------+----------+
|2024-01-01|2024|    1|         1|        2|         1|
|2023-06-15|2023|    6|        15|        5|        24|
|2022-12-31|2022|   12|        31|        7|        52|
+----------+----+-----+----------+---------+----------+



### Formatting Dates
- **`date_format(date, format)`**: Converts a date into a string, formatted according to the given pattern.

Example formats:
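- `yyyy-MM-dd` (year-month-day, e.g. `2024-01-01`)
- `MMMM dd, yyyy` (full month name, day, year, e.g. `January 01, 2024`)
- `E, MMM dd yyyy` (abbreviated weekday, abbreviated month, day, year, e.g. `Mon, Jan 01 2024`)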


In [4]:
# Output column name -> date_format pattern applied to "Date"
format_patterns = {
    "Formatted_YYYY_MM_DD": "yyyy-MM-dd",
    "Formatted_Long": "MMMM dd, yyyy",
    "Formatted_Short": "E, MMM dd yyyy",
}

# Keep the original date alongside one formatted string column per pattern
df_formatted = df.select(
    col("Date"),
    *[
        date_format(col("Date"), pattern).alias(alias)
        for alias, pattern in format_patterns.items()
    ]
)

# truncate=False so the formatted strings are shown in full
df_formatted.show(truncate=False)

StatementMeta(, e65d5013-c34b-4f87-ac91-deb7889b84cb, 6, Finished, Available, Finished)

+----------+--------------------+-----------------+----------------+
|Date      |Formatted_YYYY_MM_DD|Formatted_Long   |Formatted_Short |
+----------+--------------------+-----------------+----------------+
|2024-01-01|2024-01-01          |January 01, 2024 |Mon, Jan 01 2024|
|2023-06-15|2023-06-15          |June 15, 2023    |Thu, Jun 15 2023|
|2022-12-31|2022-12-31          |December 31, 2022|Sat, Dec 31 2022|
+----------+--------------------+-----------------+----------------+

