In [1]:
# Bootstrap step executed before the pyspark imports in the next cell.
# NOTE(review): per findspark's documentation, init() locates the local Spark
# installation and makes `pyspark` importable — confirm it is still needed in
# environments where pyspark is pip-installed.
import findspark
findspark.init()

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create (or reuse) the notebook's Spark session: local master, fixed app name.
spark = (
    SparkSession.builder
    .master("local")
    .appName("DateManipulation")
    .getOrCreate()
)

# Handle to the underlying SparkContext of that session.
sc = spark.sparkContext

In [12]:
# Sample input: timestamps kept as plain strings in "yyyy/MM/dd hh:mm a" form.
# Each string is wrapped in its own list so it becomes a one-column row.
date_list = [
    [raw_value]
    for raw_value in (
        "2024/01/01 01:00 AM",
        "2024/02/03 03:00 AM",
        "2023/04/06 09:00 PM",
        "2022/08/02 06:00 PM",
        "2024/02/01 10:00 AM",
    )
]

# Single string column holding the unparsed value.
date_schema = ["input_date"]

df = spark.createDataFrame(data=date_list, schema=date_schema)

# Show the inferred schema first, then the rows themselves.
df.printSchema()

df.show()

root
 |-- input_date: string (nullable = true)

+-------------------+
|         input_date|
+-------------------+
|2024/01/01 01:00 AM|
|2024/02/03 03:00 AM|
|2023/04/06 09:00 PM|
|2022/08/02 06:00 PM|
|2024/02/01 10:00 AM|
+-------------------+



In [14]:
# Switch Spark's datetime parsing/formatting to the legacy (pre-3.0) policy for
# this session.
# NOTE(review): the legacy parser tolerates a pattern that matches only a prefix
# of the input string — confirm whether any cell below still relies on that.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [19]:
from pyspark.sql.functions import to_date,to_timestamp,date_format

# The three functions used here for parsing/formatting: to_date, to_timestamp, date_format

# Parse the raw string once, using a pattern that covers the WHOLE value
# (date, 12-hour time and AM/PM marker).
#
# The previous version called to_date() with the partial pattern "yyyy/MM/dd"
# on strings that also contain " hh:mm a". That only succeeds under the LEGACY
# time parser policy, which ignores trailing text. Deriving the date from the
# already-parsed timestamp yields the same values and schema, and no longer
# depends on which parser policy the session uses.
parsed_time = to_timestamp(col("input_date"), "yyyy/MM/dd hh:mm a")

df2 = (
    df.withColumn("date", to_date(parsed_time))   # timestamp -> calendar date
      .withColumn("time", parsed_time)            # full timestamp
)

df2.printSchema()

df2.show()

root
 |-- input_date: string (nullable = true)
 |-- date: date (nullable = true)
 |-- time: timestamp (nullable = true)

+-------------------+----------+-------------------+
|         input_date|      date|               time|
+-------------------+----------+-------------------+
|2024/01/01 01:00 AM|2024-01-01|2024-01-01 01:00:00|
|2024/02/03 03:00 AM|2024-02-03|2024-02-03 03:00:00|
|2023/04/06 09:00 PM|2023-04-06|2023-04-06 21:00:00|
|2022/08/02 06:00 PM|2022-08-02|2022-08-02 18:00:00|
|2024/02/01 10:00 AM|2024-02-01|2024-02-01 10:00:00|
+-------------------+----------+-------------------+



In [23]:
from pyspark.sql.functions import year,month,dayofmonth,hour,minute,second,dayofweek,quarter,weekofyear

# Break the parsed timestamp into its calendar/clock components.
# All existing columns are kept ("*") and the derived ones are appended in the
# order listed, as a single projection.
df3 = df2.select(
    "*",
    year("time").alias("year"),
    month("time").alias("month"),
    dayofmonth("time").alias("day"),
    hour("time").alias("hour"),
    minute("time").alias("minute"),
    dayofweek("time").alias("dayofweek"),
    quarter("time").alias("quarter"),
    weekofyear("time").alias("weekofyear"),
)

df3.show()

+-------------------+----------+-------------------+----+-----+---+----+------+---------+-------+----------+
|         input_date|      date|               time|year|month|day|hour|minute|dayofweek|quarter|weekofyear|
+-------------------+----------+-------------------+----+-----+---+----+------+---------+-------+----------+
|2024/01/01 01:00 AM|2024-01-01|2024-01-01 01:00:00|2024|    1|  1|   1|     0|        2|      1|         1|
|2024/02/03 03:00 AM|2024-02-03|2024-02-03 03:00:00|2024|    2|  3|   3|     0|        7|      1|         5|
|2023/04/06 09:00 PM|2023-04-06|2023-04-06 21:00:00|2023|    4|  6|  21|     0|        5|      2|        14|
|2022/08/02 06:00 PM|2022-08-02|2022-08-02 18:00:00|2022|    8|  2|  18|     0|        3|      3|        31|
|2024/02/01 10:00 AM|2024-02-01|2024-02-01 10:00:00|2024|    2|  1|  10|     0|        5|      1|         5|
+-------------------+----------+-------------------+----+-----+---+----+------+---------+-------+----------+



In [25]:
# Render the weekday and month as text via date_format patterns.
# Pattern length controls the form: three letters (EEE / LLL) give the
# abbreviated name, four letters (EEEE / LLLL) give the full name.
(
    df2
    .withColumn("dayofweek", dayofweek("time"))
    .withColumn("dayinwords", date_format("time", "EEEE"))
    .withColumn("monthinwords", date_format("time", "LLLL"))
    .show()
)

+-------------------+----------+-------------------+---------+----------+------------+
|         input_date|      date|               time|dayofweek|dayinwords|monthinwords|
+-------------------+----------+-------------------+---------+----------+------------+
|2024/01/01 01:00 AM|2024-01-01|2024-01-01 01:00:00|        2|    Monday|     January|
|2024/02/03 03:00 AM|2024-02-03|2024-02-03 03:00:00|        7|  Saturday|    February|
|2023/04/06 09:00 PM|2023-04-06|2023-04-06 21:00:00|        5|  Thursday|       April|
|2022/08/02 06:00 PM|2022-08-02|2022-08-02 18:00:00|        3|   Tuesday|      August|
|2024/02/01 10:00 AM|2024-02-01|2024-02-01 10:00:00|        5|  Thursday|    February|
+-------------------+----------+-------------------+---------+----------+------------+



In [28]:
from pyspark.sql.functions import current_date,date_add,date_sub,date_trunc,datediff

# Date arithmetic relative to the parsed timestamp, shown as one projection:
#   currentdate - today's date
#   dateadd     - "time" plus 5 days
#   datesub     - "time" minus 5 days
#   datediff    - days from "time" to today (negative when "time" is in the future)
#   date_trunc  - "time" truncated with unit "mm" (the sample output shows the
#                 first day of the month at 00:00:00, i.e. month, not minute)
df2.select(
    "*",
    current_date().alias("currentdate"),
    date_add("time", 5).alias("dateadd"),
    date_sub("time", 5).alias("datesub"),
    datediff(current_date(), "time").alias("datediff"),
    date_trunc("mm", "time").alias("date_trunc"),
).show()


+-------------------+----------+-------------------+-----------+----------+----------+--------+-------------------+
|         input_date|      date|               time|currentdate|   dateadd|   datesub|datediff|         date_trunc|
+-------------------+----------+-------------------+-----------+----------+----------+--------+-------------------+
|2024/01/01 01:00 AM|2024-01-01|2024-01-01 01:00:00| 2024-01-01|2024-01-06|2023-12-27|       0|2024-01-01 00:00:00|
|2024/02/03 03:00 AM|2024-02-03|2024-02-03 03:00:00| 2024-01-01|2024-02-08|2024-01-29|     -33|2024-02-01 00:00:00|
|2023/04/06 09:00 PM|2023-04-06|2023-04-06 21:00:00| 2024-01-01|2023-04-11|2023-04-01|     270|2023-04-01 00:00:00|
|2022/08/02 06:00 PM|2022-08-02|2022-08-02 18:00:00| 2024-01-01|2022-08-07|2022-07-28|     517|2022-08-01 00:00:00|
|2024/02/01 10:00 AM|2024-02-01|2024-02-01 10:00:00| 2024-01-01|2024-02-06|2024-01-27|     -31|2024-02-01 00:00:00|
+-------------------+----------+-------------------+-----------+--------

In [29]:
# Interactive lookup: print the docstring of date_add (signature, parameters,
# examples). Exploration aid only — nothing below depends on its output.
help(date_add)

Help on function date_add in module pyspark.sql.functions:

date_add(start: 'ColumnOrName', days: Union[ForwardRef('ColumnOrName'), int]) -> pyspark.sql.column.Column
    Returns the date that is `days` days after `start`. If `days` is a negative value
    then these amount of days will be deducted from `start`.
    
    .. versionadded:: 1.5.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    start : :class:`~pyspark.sql.Column` or str
        date column to work on.
    days : :class:`~pyspark.sql.Column` or str or int
        how many days after the given date to calculate.
        Accepts negative value as well to calculate backwards in time.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        a date after/before given number of days.
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08', 2,)], ['dt', 'add'])
    >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()
    [Ro