In [0]:
import pyspark.sql.functions as f

## Create dataframe

In [0]:
df = spark.read.format("csv").option("header", True).load("/FileStore/tables/store_sales/test.csv")

## View Schema

In [0]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- store_nbr: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



## View Data

In [0]:
df.show(5)

+-------+----------+---------+----------+-----------+
|     id|      date|store_nbr|    family|onpromotion|
+-------+----------+---------+----------+-----------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|
|3000889|2017-08-16|        1| BABY CARE|          0|
|3000890|2017-08-16|        1|    BEAUTY|          2|
|3000891|2017-08-16|        1| BEVERAGES|         20|
|3000892|2017-08-16|        1|     BOOKS|          0|
+-------+----------+---------+----------+-----------+
only showing top 5 rows



## Converting string to date

In [0]:
df = df.withColumn("date", f.to_date(f.col("date"), "yyyy-MM-dd"))

In [0]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- store_nbr: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



## Adding a timestamp column

In [0]:
df = df.withColumn("Last_Updated", f.lit(f.current_timestamp()))

In [0]:
df.dtypes

Out[136]: [('id', 'string'),
 ('date', 'date'),
 ('store_nbr', 'string'),
 ('family', 'string'),
 ('onpromotion', 'string'),
 ('Last_Updated', 'timestamp')]

### date_add and date_sub
add or subtract number of days from a column. This will create a column with ```date``` type.

In [0]:
help(f.date_add)

Help on function date_add in module pyspark.sql.functions:

date_add(start: 'ColumnOrName', days: Union[ForwardRef('ColumnOrName'), int]) -> pyspark.sql.column.Column
    Returns the date that is `days` days after `start`
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08', 2,)], ['dt', 'add'])
    >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()
    [Row(next_date=datetime.date(2015, 4, 9))]
    >>> df.select(date_add(df.dt, df.add.cast('integer')).alias('next_date')).collect()
    [Row(next_date=datetime.date(2015, 4, 10))]



In [0]:
df.columns

Out[138]: ['id', 'date', 'store_nbr', 'family', 'onpromotion', 'Last_Updated']

In [0]:
df.withColumn("next_date", f.date_add(f.col("date"), 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+----------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated| next_date|
+-------+----------+---------+-------------------+-----------+--------------------+----------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-08-17|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|2017-08-17|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|2017-08-17|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|2017-08-17|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|2017-08-17|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|2017-08-17|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|2017-08-17|
|3000895|2017-08-16|        1|           CLEANING|

In [0]:
# We can also use negative value with date_add
df.withColumn("previous_date", f.date_add(f.col("date"), -1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+-------------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|previous_date|
+-------+----------+---------+-------------------+-----------+--------------------+-------------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|   2017-08-15|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|   2017-08-15|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|   2017-08-15|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|   2017-08-15|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|   2017-08-15|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|   2017-08-15|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|   2017-08-15|
|3000895|2017-08-16|

In [0]:
# Variation of column object being passed
df.withColumn("next_date", f.date_add(df.date, 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+----------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated| next_date|
+-------+----------+---------+-------------------+-----------+--------------------+----------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-08-17|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|2017-08-17|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|2017-08-17|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|2017-08-17|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|2017-08-17|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|2017-08-17|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|2017-08-17|
|3000895|2017-08-16|        1|           CLEANING|

In [0]:
# Applying function on timestamp column
df.withColumn("next_date_timestamp", f.date_add(df.Last_Updated, 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+-------------------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|next_date_timestamp|
+-------+----------+---------+-------------------+-----------+--------------------+-------------------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|         2023-01-09|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|         2023-01-09|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|         2023-01-09|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|         2023-01-09|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|         2023-01-09|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|         2023-01-09|
|3000894|2017-08-16|        1|        CELEBRATION|          0|20

In [0]:
df.withColumn("previous_date", f.date_sub(f.col("date"), 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+-------------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|previous_date|
+-------+----------+---------+-------------------+-----------+--------------------+-------------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|   2017-08-15|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|   2017-08-15|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|   2017-08-15|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|   2017-08-15|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|   2017-08-15|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|   2017-08-15|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|   2017-08-15|
|3000895|2017-08-16|

In [0]:
df.withColumn("previous_date", f.date_sub("date", 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+-------------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|previous_date|
+-------+----------+---------+-------------------+-----------+--------------------+-------------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|   2017-08-15|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|   2017-08-15|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|   2017-08-15|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|   2017-08-15|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|   2017-08-15|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|   2017-08-15|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|   2017-08-15|
|3000895|2017-08-16|

In [0]:
df.withColumn("previous_date", f.date_sub(df["Last_Updated"], 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+-------------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|previous_date|
+-------+----------+---------+-------------------+-----------+--------------------+-------------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|   2023-01-07|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|   2023-01-07|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|   2023-01-07|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|   2023-01-07|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|   2023-01-07|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|   2023-01-07|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|   2023-01-07|
|3000895|2017-08-16|

## datediff
Important think to remember is the seq of arguments. First one is ```end``` while the 2nd one is ```start``` as stated in the documentation.

In [0]:
help(f.datediff)

Help on function datediff in module pyspark.sql.functions:

datediff(end: 'ColumnOrName', start: 'ColumnOrName') -> pyspark.sql.column.Column
    Returns the number of days from `start` to `end`.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2'])
    >>> df.select(datediff(df.d2, df.d1).alias('diff')).collect()
    [Row(diff=32)]



In [0]:
df.withColumn("days_ago", f.datediff(df.Last_Updated, df.date)).show(5)

+-------+----------+---------+----------+-----------+--------------------+--------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|days_ago|
+-------+----------+---------+----------+-----------+--------------------+--------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|    1971|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|    1971|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|    1971|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|    1971|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|    1971|
+-------+----------+---------+----------+-----------+--------------------+--------+
only showing top 5 rows



In [0]:
df.withColumn("days_ago", f.datediff(df.date, df.Last_Updated)).show(5)

+-------+----------+---------+----------+-----------+--------------------+--------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|days_ago|
+-------+----------+---------+----------+-----------+--------------------+--------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|   -1971|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|   -1971|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|   -1971|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|   -1971|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|   -1971|
+-------+----------+---------+----------+-----------+--------------------+--------+
only showing top 5 rows



In [0]:
# Using kwargs to make sure function is well read and performs as expected
df.withColumn("days_ago", f.datediff(start=df.date, end=df.Last_Updated)).show(5)

+-------+----------+---------+----------+-----------+--------------------+--------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|days_ago|
+-------+----------+---------+----------+-----------+--------------------+--------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|    1971|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|    1971|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|    1971|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|    1971|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|    1971|
+-------+----------+---------+----------+-----------+--------------------+--------+
only showing top 5 rows



In [0]:
df.withColumn("days_ago", f.datediff(start=df["date"], end=df["Last_Updated"])).show(5)

+-------+----------+---------+----------+-----------+--------------------+--------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|days_ago|
+-------+----------+---------+----------+-----------+--------------------+--------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|    1971|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|    1971|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|    1971|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|    1971|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|    1971|
+-------+----------+---------+----------+-----------+--------------------+--------+
only showing top 5 rows



## add_months

In [0]:
help(f.add_months)

Help on function add_months in module pyspark.sql.functions:

add_months(start: 'ColumnOrName', months: Union[ForwardRef('ColumnOrName'), int]) -> pyspark.sql.column.Column
    Returns the date that is `months` months after `start`
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08', 2)], ['dt', 'add'])
    >>> df.select(add_months(df.dt, 1).alias('next_month')).collect()
    [Row(next_month=datetime.date(2015, 5, 8))]
    >>> df.select(add_months(df.dt, df.add.cast('integer')).alias('next_month')).collect()
    [Row(next_month=datetime.date(2015, 6, 8))]



In [0]:
df.withColumn("next_month", f.add_months(f.col("date"), 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+----------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|next_month|
+-------+----------+---------+-------------------+-----------+--------------------+----------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-09-16|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|2017-09-16|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|2017-09-16|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|2017-09-16|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|2017-09-16|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|2017-09-16|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|2017-09-16|
|3000895|2017-08-16|        1|           CLEANING|

In [0]:
df.withColumn("next_month", f.add_months("date", 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+----------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|next_month|
+-------+----------+---------+-------------------+-----------+--------------------+----------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-09-16|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|2017-09-16|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|2017-09-16|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|2017-09-16|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|2017-09-16|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|2017-09-16|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|2017-09-16|
|3000895|2017-08-16|        1|           CLEANING|

In [0]:
df.withColumn("next_month", f.add_months(df.date, 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+----------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|next_month|
+-------+----------+---------+-------------------+-----------+--------------------+----------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-09-16|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|2017-09-16|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|2017-09-16|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|2017-09-16|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|2017-09-16|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|2017-09-16|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|2017-09-16|
|3000895|2017-08-16|        1|           CLEANING|

In [0]:
df.withColumn("next_month", f.add_months(df["date"], 1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+----------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|next_month|
+-------+----------+---------+-------------------+-----------+--------------------+----------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-09-16|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|2017-09-16|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|2017-09-16|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|2017-09-16|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|2017-09-16|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|2017-09-16|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|2017-09-16|
|3000895|2017-08-16|        1|           CLEANING|

In [0]:
df.withColumn("next_month", f.add_months(f.col("date"), -1)).show()

+-------+----------+---------+-------------------+-----------+--------------------+----------+
|     id|      date|store_nbr|             family|onpromotion|        Last_Updated|next_month|
+-------+----------+---------+-------------------+-----------+--------------------+----------+
|3000888|2017-08-16|        1|         AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-07-16|
|3000889|2017-08-16|        1|          BABY CARE|          0|2023-01-08 17:37:...|2017-07-16|
|3000890|2017-08-16|        1|             BEAUTY|          2|2023-01-08 17:37:...|2017-07-16|
|3000891|2017-08-16|        1|          BEVERAGES|         20|2023-01-08 17:37:...|2017-07-16|
|3000892|2017-08-16|        1|              BOOKS|          0|2023-01-08 17:37:...|2017-07-16|
|3000893|2017-08-16|        1|       BREAD/BAKERY|         12|2023-01-08 17:37:...|2017-07-16|
|3000894|2017-08-16|        1|        CELEBRATION|          0|2023-01-08 17:37:...|2017-07-16|
|3000895|2017-08-16|        1|           CLEANING|

## months_between
Returns a ```float``` value. We are using ```round``` function to round the value provided by ```months_between```.

In [0]:
help(f.months_between)

Help on function months_between in module pyspark.sql.functions:

months_between(date1: 'ColumnOrName', date2: 'ColumnOrName', roundOff: bool = True) -> pyspark.sql.column.Column
    Returns number of months between dates date1 and date2.
    If date1 is later than date2, then the result is positive.
    A whole number is returned if both inputs have the same day of month or both are the last day
    of their respective months. Otherwise, the difference is calculated assuming 31 days per month.
    The result is rounded off to 8 digits unless `roundOff` is set to `False`.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2'])
    >>> df.select(months_between(df.date1, df.date2).alias('months')).collect()
    [Row(months=3.94959677)]
    >>> df.select(months_between(df.date1, df.date2, False).alias('months')).collect()
    [Row(months=3.9495967741935485)]



In [0]:
help(f.round)

Help on function round in module pyspark.sql.functions:

round(col: 'ColumnOrName', scale: int = 0) -> pyspark.sql.column.Column
    Round the given value to `scale` decimal places using HALF_UP rounding mode if `scale` >= 0
    or at integral part when `scale` < 0.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> spark.createDataFrame([(2.5,)], ['a']).select(round('a', 0).alias('r')).collect()
    [Row(r=3.0)]



In [0]:
df.withColumn("months_between", f.months_between(df.Last_Updated, df.date)).show(5)

+-------+----------+---------+----------+-----------+--------------------+--------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|months_between|
+-------+----------+---------+----------+-----------+--------------------+--------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|   64.76562052|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|   64.76562052|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|   64.76562052|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|   64.76562052|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|   64.76562052|
+-------+----------+---------+----------+-----------+--------------------+--------------+
only showing top 5 rows



In [0]:
df.withColumn("months_between", f.round(f.months_between(df.Last_Updated, df.date))).show(5)

+-------+----------+---------+----------+-----------+--------------------+--------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|months_between|
+-------+----------+---------+----------+-----------+--------------------+--------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|          65.0|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|          65.0|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|          65.0|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|          65.0|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|          65.0|
+-------+----------+---------+----------+-----------+--------------------+--------------+
only showing top 5 rows



In [0]:
df.withColumn("months_between", f.round(f.months_between(df.Last_Updated, df.date), 2)).show(5)

+-------+----------+---------+----------+-----------+--------------------+--------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|months_between|
+-------+----------+---------+----------+-----------+--------------------+--------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|         64.77|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|         64.77|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|         64.77|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|         64.77|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|         64.77|
+-------+----------+---------+----------+-----------+--------------------+--------------+
only showing top 5 rows



## next_day

In [0]:
help(f.next_day)

Help on function next_day in module pyspark.sql.functions:

next_day(date: 'ColumnOrName', dayOfWeek: str) -> pyspark.sql.column.Column
    Returns the first date which is later than the value of the date column.
    
    Day of the week parameter is case insensitive, and accepts:
        "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun".
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-07-27',)], ['d'])
    >>> df.select(next_day(df.d, 'Sun').alias('date')).collect()
    [Row(date=datetime.date(2015, 8, 2))]



In [0]:
df.withColumn("next_day", f.next_day(df.date, "Thu")).show(5)

+-------+----------+---------+----------+-----------+--------------------+----------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|  next_day|
+-------+----------+---------+----------+-----------+--------------------+----------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-08-17|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|2017-08-17|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|2017-08-17|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|2017-08-17|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|2017-08-17|
+-------+----------+---------+----------+-----------+--------------------+----------+
only showing top 5 rows



In [0]:
df.withColumn("next_day", f.next_day(df["date"], "Sun")).show(5)

+-------+----------+---------+----------+-----------+--------------------+----------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|  next_day|
+-------+----------+---------+----------+-----------+--------------------+----------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-08-20|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|2017-08-20|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|2017-08-20|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|2017-08-20|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|2017-08-20|
+-------+----------+---------+----------+-----------+--------------------+----------+
only showing top 5 rows



## trunc and date_trunc funtions
These functions are perticulely useful when we want to get beginning of the year, week, month etc. These functions are perticulerly important for reporting.

In [0]:
help(f.trunc)

Help on function trunc in module pyspark.sql.functions:

trunc(date: 'ColumnOrName', format: str) -> pyspark.sql.column.Column
    Returns date truncated to the unit specified by the format.
    
    .. versionadded:: 1.5.0
    
    Parameters
    ----------
    date : :class:`~pyspark.sql.Column` or str
    format : str
        'year', 'yyyy', 'yy' to truncate by year,
        or 'month', 'mon', 'mm' to truncate by month
        Other options are: 'week', 'quarter'
    
    Examples
    --------
    >>> df = spark.createDataFrame([('1997-02-28',)], ['d'])
    >>> df.select(trunc(df.d, 'year').alias('year')).collect()
    [Row(year=datetime.date(1997, 1, 1))]
    >>> df.select(trunc(df.d, 'mon').alias('month')).collect()
    [Row(month=datetime.date(1997, 2, 1))]



In [0]:
help(f.date_trunc)

Help on function date_trunc in module pyspark.sql.functions:

date_trunc(format: str, timestamp: 'ColumnOrName') -> pyspark.sql.column.Column
    Returns timestamp truncated to the unit specified by the format.
    
    .. versionadded:: 2.3.0
    
    Parameters
    ----------
    format : str
        'year', 'yyyy', 'yy' to truncate by year,
        'month', 'mon', 'mm' to truncate by month,
        'day', 'dd' to truncate by day,
        Other options are:
        'microsecond', 'millisecond', 'second', 'minute', 'hour', 'week', 'quarter'
    timestamp : :class:`~pyspark.sql.Column` or str
    
    Examples
    --------
    >>> df = spark.createDataFrame([('1997-02-28 05:02:11',)], ['t'])
    >>> df.select(date_trunc('year', df.t).alias('year')).collect()
    [Row(year=datetime.datetime(1997, 1, 1, 0, 0))]
    >>> df.select(date_trunc('mon', df.t).alias('month')).collect()
    [Row(month=datetime.datetime(1997, 2, 1, 0, 0))]



In [0]:
df.withColumn("first_day_month", f.trunc(df["date"], "MM")).show(5)

+-------+----------+---------+----------+-----------+--------------------+---------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|first_day_month|
+-------+----------+---------+----------+-----------+--------------------+---------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|     2017-08-01|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|     2017-08-01|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|     2017-08-01|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|     2017-08-01|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|     2017-08-01|
+-------+----------+---------+----------+-----------+--------------------+---------------+
only showing top 5 rows



In [0]:
df.withColumn("first_day_week", f.trunc(df["date"], "week")).show(5)

+-------+----------+---------+----------+-----------+--------------------+--------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|first_day_week|
+-------+----------+---------+----------+-----------+--------------------+--------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|    2017-08-14|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|    2017-08-14|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|    2017-08-14|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|    2017-08-14|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|    2017-08-14|
+-------+----------+---------+----------+-----------+--------------------+--------------+
only showing top 5 rows



In [0]:
df.withColumn("first_day_quarter", f.trunc(df["date"], "quarter")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-----------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|first_day_quarter|
+-------+----------+---------+----------+-----------+--------------------+-----------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|       2017-07-01|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|       2017-07-01|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|       2017-07-01|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|       2017-07-01|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|       2017-07-01|
+-------+----------+---------+----------+-----------+--------------------+-----------------+
only showing top 5 rows



In [0]:
df.withColumn("first_day_month", f.trunc(df["Last_Updated"], "MM")).show(5)

+-------+----------+---------+----------+-----------+--------------------+---------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|first_day_month|
+-------+----------+---------+----------+-----------+--------------------+---------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|     2023-01-01|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|     2023-01-01|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|     2023-01-01|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|     2023-01-01|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|     2023-01-01|
+-------+----------+---------+----------+-----------+--------------------+---------------+
only showing top 5 rows



In [0]:
df.withColumn("first_day_year", f.trunc(df["Last_Updated"], "yy")).show(5)

+-------+----------+---------+----------+-----------+--------------------+--------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|first_day_year|
+-------+----------+---------+----------+-----------+--------------------+--------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|    2023-01-01|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|    2023-01-01|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|    2023-01-01|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|    2023-01-01|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|    2023-01-01|
+-------+----------+---------+----------+-----------+--------------------+--------------+
only showing top 5 rows



In [0]:
df.withColumn("first_day_quarter", f.trunc(df["Last_Updated"], "quarter")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-----------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|first_day_quarter|
+-------+----------+---------+----------+-----------+--------------------+-----------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|       2023-01-01|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|       2023-01-01|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|       2023-01-01|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|       2023-01-01|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|       2023-01-01|
+-------+----------+---------+----------+-----------+--------------------+-----------------+
only showing top 5 rows



In [0]:
df.withColumn("first_day_month", f.date_trunc("month", "date")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|    first_day_month|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-08-01 00:00:00|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|2017-08-01 00:00:00|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|2017-08-01 00:00:00|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|2017-08-01 00:00:00|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|2017-08-01 00:00:00|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
only showing top 5 rows



In [0]:
df.withColumn("first_day_month", f.date_trunc("mm", "date")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|    first_day_month|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-08-01 00:00:00|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|2017-08-01 00:00:00|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|2017-08-01 00:00:00|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|2017-08-01 00:00:00|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|2017-08-01 00:00:00|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
only showing top 5 rows



In [0]:
df.withColumn("first_day_week", f.date_trunc("week", "date")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|     first_day_week|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|2017-08-14 00:00:00|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|2017-08-14 00:00:00|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|2017-08-14 00:00:00|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|2017-08-14 00:00:00|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|2017-08-14 00:00:00|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
only showing top 5 rows



In [0]:
df.withColumn("hour", f.date_trunc("hour", "Last_Updated")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|               hour|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|2023-01-08 17:00:00|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|2023-01-08 17:00:00|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|2023-01-08 17:00:00|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|2023-01-08 17:00:00|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|2023-01-08 17:00:00|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
only showing top 5 rows



In [0]:
df.withColumn("min", f.date_trunc("minute", "Last_Updated")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|                min|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|2023-01-08 17:37:00|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|2023-01-08 17:37:00|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|2023-01-08 17:37:00|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|2023-01-08 17:37:00|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|2023-01-08 17:37:00|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
only showing top 5 rows



In [0]:
df.withColumn("min", f.date_trunc("second", "Last_Updated")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|                min|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|2023-01-08 17:37:25|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|2023-01-08 17:37:25|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|2023-01-08 17:37:25|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|2023-01-08 17:37:25|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|2023-01-08 17:37:25|
+-------+----------+---------+----------+-----------+--------------------+-------------------+
only showing top 5 rows



## Extract various data and time info

In [0]:
df.withColumn("year", f.year(df["Last_Updated"]))\
.withColumn("month", f.month(df["Last_Updated"]))\
.withColumn("day", f.dayofmonth(df["Last_Updated"]))\
.withColumn("day of week", f.dayofweek(df["Last_Updated"]))\
.withColumn("day of year", f.dayofyear(df["Last_Updated"]))\
.withColumn("week of year", f.weekofyear(df["Last_Updated"]))\
.withColumn("hour", f.hour(df["Last_Updated"]))\
.withColumn("min", f.minute(df["Last_Updated"]))\
.withColumn("sec", f.second(df["Last_Updated"]))\
.show(5)

+-------+----------+---------+----------+-----------+--------------------+----+-----+---+-----------+-----------+------------+----+---+---+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|year|month|day|day of week|day of year|week of year|hour|min|sec|
+-------+----------+---------+----------+-----------+--------------------+----+-----+---+-----------+-----------+------------+----+---+---+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|2023|    1|  8|          1|          8|           1|  17| 37| 25|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|2023|    1|  8|          1|          8|           1|  17| 37| 25|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|2023|    1|  8|          1|          8|           1|  17| 37| 25|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|2023|    1|  8|          1|          8|           1|  17| 37| 25|
|3000892|2017-08-16|

## to_date and to_timestamp
Ref: https://www.digitalocean.com/community/tutorials/java-simpledateformat-java-date-format

In [0]:
help(f.to_timestamp)

Help on function to_timestamp in module pyspark.sql.functions:

to_timestamp(col: 'ColumnOrName', format: Optional[str] = None) -> pyspark.sql.column.Column
    Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.TimestampType`
    using the optionally specified format. Specify formats according to `datetime pattern`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format
    is omitted. Equivalent to ``col.cast("timestamp")``.
    
    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
    
    .. versionadded:: 2.2.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t).alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
    
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).collect()
 

In [0]:
help(f.to_date)

Help on function to_date in module pyspark.sql.functions:

to_date(col: 'ColumnOrName', format: Optional[str] = None) -> pyspark.sql.column.Column
    Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.DateType`
    using the optionally specified format. Specify formats according to `datetime pattern`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
    is omitted. Equivalent to ``col.cast("date")``.
    
    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
    
    .. versionadded:: 2.2.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t).alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]
    
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]

##### Adding a column for demo

In [0]:
df = df.withColumn("constant_date", f.lit("2022-05-22"))

In [0]:
df.dtypes

Out[183]: [('id', 'string'),
 ('date', 'date'),
 ('store_nbr', 'string'),
 ('family', 'string'),
 ('onpromotion', 'string'),
 ('Last_Updated', 'timestamp'),
 ('constant_date', 'string')]

In [0]:
df.withColumn("constant_date_converted", f.to_date("constant_date","yyyy-mm-dd")).dtypes

Out[184]: [('id', 'string'),
 ('date', 'date'),
 ('store_nbr', 'string'),
 ('family', 'string'),
 ('onpromotion', 'string'),
 ('Last_Updated', 'timestamp'),
 ('constant_date', 'string'),
 ('constant_date_converted', 'date')]

In [0]:
df.withColumn("constant_date_converted", f.to_date("constant_date","yyyy-mm-dd")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------+-----------------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|constant_date|constant_date_converted|
+-------+----------+---------+----------+-----------+--------------------+-------------+-----------------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|   2022-05-22|             2022-01-22|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|   2022-05-22|             2022-01-22|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|   2022-05-22|             2022-01-22|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|   2022-05-22|             2022-01-22|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|   2022-05-22|             2022-01-22|
+-------+----------+---------+----------+-----------+--------------------+-------------+--------

In [0]:
df.withColumn("constant_date_converted", f.to_timestamp("constant_date","yyyy-mm-dd")).dtypes

Out[186]: [('id', 'string'),
 ('date', 'date'),
 ('store_nbr', 'string'),
 ('family', 'string'),
 ('onpromotion', 'string'),
 ('Last_Updated', 'timestamp'),
 ('constant_date', 'string'),
 ('constant_date_converted', 'timestamp')]

In [0]:
df.withColumn("constant_date_converted", f.to_timestamp("constant_date","yyyy-mm-dd")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------+-----------------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|constant_date|constant_date_converted|
+-------+----------+---------+----------+-----------+--------------------+-------------+-----------------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|   2022-05-22|    2022-01-22 00:05:00|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|   2022-05-22|    2022-01-22 00:05:00|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|   2022-05-22|    2022-01-22 00:05:00|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|   2022-05-22|    2022-01-22 00:05:00|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|   2022-05-22|    2022-01-22 00:05:00|
+-------+----------+---------+----------+-----------+--------------------+-------------+--------

## date_format
Converting date/timestamp to a desired date format

In [0]:
help(f.date_format)

Help on function date_format in module pyspark.sql.functions:

date_format(date: 'ColumnOrName', format: str) -> pyspark.sql.column.Column
    Converts a date/timestamp/string to a value of string in the format specified by the date
    format given by the second argument.
    
    A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
    pattern letters of `datetime pattern`_. can be used.
    
    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
    
    .. versionadded:: 1.5.0
    
    Notes
    -----
    Whenever possible, use specialized functions like `year`.
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()
    [Row(date='04/08/2015')]



In [0]:
df.withColumn("date_formatted", f.date_format("Last_Updated","yyyy")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------+--------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|constant_date|date_formatted|
+-------+----------+---------+----------+-----------+--------------------+-------------+--------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|   2022-05-22|          2023|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|   2022-05-22|          2023|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|   2022-05-22|          2023|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|   2022-05-22|          2023|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|   2022-05-22|          2023|
+-------+----------+---------+----------+-----------+--------------------+-------------+--------------+
only showing top 5 rows



In [0]:
df.withColumn("date_formatted", f.date_format("Last_Updated","yyyyMMdd")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------+--------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|constant_date|date_formatted|
+-------+----------+---------+----------+-----------+--------------------+-------------+--------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|   2022-05-22|      20230108|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|   2022-05-22|      20230108|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|   2022-05-22|      20230108|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|   2022-05-22|      20230108|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|   2022-05-22|      20230108|
+-------+----------+---------+----------+-----------+--------------------+-------------+--------------+
only showing top 5 rows



In [0]:
df.withColumn("date_formatted", f.date_format("Last_Updated","HH")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------+--------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|constant_date|date_formatted|
+-------+----------+---------+----------+-----------+--------------------+-------------+--------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|   2022-05-22|            17|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|   2022-05-22|            17|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|   2022-05-22|            17|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|   2022-05-22|            17|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|   2022-05-22|            17|
+-------+----------+---------+----------+-----------+--------------------+-------------+--------------+
only showing top 5 rows



## unix_timestamp and from_unixtime

In [0]:
help(f.unix_timestamp)

Help on function unix_timestamp in module pyspark.sql.functions:

unix_timestamp(timestamp: Optional[ForwardRef('ColumnOrName')] = None, format: str = 'yyyy-MM-dd HH:mm:ss') -> pyspark.sql.column.Column
    Convert time string with given pattern ('yyyy-MM-dd HH:mm:ss', by default)
    to Unix time stamp (in seconds), using the default timezone and the default
    locale, return null if fail.
    
    if `timestamp` is None, then it returns current timestamp.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
    >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect()
    [Row(unix_time=1428476400)]
    >>> spark.conf.unset("spark.sql.session.timeZone")



In [0]:
help(f.from_unixtime)

Help on function from_unixtime in module pyspark.sql.functions:

from_unixtime(timestamp: 'ColumnOrName', format: str = 'yyyy-MM-dd HH:mm:ss') -> pyspark.sql.column.Column
    Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string
    representing the timestamp of that moment in the current system time zone in the given
    format.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
    >>> time_df = spark.createDataFrame([(1428476400,)], ['unix_time'])
    >>> time_df.select(from_unixtime('unix_time').alias('ts')).collect()
    [Row(ts='2015-04-08 00:00:00')]
    >>> spark.conf.unset("spark.sql.session.timeZone")



In [0]:
df.withColumn("unix", f.unix_timestamp("Last_Updated")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------+----------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|constant_date|      unix|
+-------+----------+---------+----------+-----------+--------------------+-------------+----------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|   2022-05-22|1673199448|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|   2022-05-22|1673199448|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|   2022-05-22|1673199448|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|   2022-05-22|1673199448|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|   2022-05-22|1673199448|
+-------+----------+---------+----------+-----------+--------------------+-------------+----------+
only showing top 5 rows



In [0]:
df.withColumn("Unix", f.lit(1673199219)).withColumn("unix_date", f.from_unixtime("Unix")).show(5)

+-------+----------+---------+----------+-----------+--------------------+-------------+----------+-------------------+
|     id|      date|store_nbr|    family|onpromotion|        Last_Updated|constant_date|      Unix|          unix_date|
+-------+----------+---------+----------+-----------+--------------------+-------------+----------+-------------------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|2023-01-08 17:37:...|   2022-05-22|1673199219|2023-01-08 17:33:39|
|3000889|2017-08-16|        1| BABY CARE|          0|2023-01-08 17:37:...|   2022-05-22|1673199219|2023-01-08 17:33:39|
|3000890|2017-08-16|        1|    BEAUTY|          2|2023-01-08 17:37:...|   2022-05-22|1673199219|2023-01-08 17:33:39|
|3000891|2017-08-16|        1| BEVERAGES|         20|2023-01-08 17:37:...|   2022-05-22|1673199219|2023-01-08 17:33:39|
|3000892|2017-08-16|        1|     BOOKS|          0|2023-01-08 17:37:...|   2022-05-22|1673199219|2023-01-08 17:33:39|
+-------+----------+---------+----------