## Date and timestamp functions in PySpark

In [1]:
# findspark.init() makes the local Spark installation importable as a regular
# library, so it has to run before the pyspark imports in the next cell.
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Reuse the active session if one exists; otherwise start one under this app name.
spark = (
    SparkSession.builder
    .appName("PySpark_Practice_02")
    .getOrCreate()
)

In [4]:
from pyspark.sql.functions import current_timestamp, to_timestamp
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, DateType, BooleanType

In [5]:
# Two sample rows; both columns are plain strings at this point.
df = spark.createDataFrame(
    data=[
        ["1", "2019-07-01 12:01:19.000"],
        ["2", "2019-06-24 12:01:19.000"],
    ],
    schema=["id", "input_timestamp"],
)

In [11]:
# truncate=False prints full cell values instead of shortening long strings.
df.show(truncate=False)

+---+-----------------------+
|id |input_timestamp        |
+---+-----------------------+
|1  |2019-07-01 12:01:19.000|
|2  |2019-06-24 12:01:19.000|
+---+-----------------------+



In [12]:
# Inspect the column types of the freshly created DataFrame.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- input_timestamp: string (nullable = true)



#### Converting string datatype to timestamp

In [15]:
# Parse the string column into a timestamp column, keeping the original alongside it.
df1 = df.withColumn(
    "timestamptype",
    to_timestamp(col("input_timestamp")),
)

df1.show(truncate=False)

+---+-----------------------+-------------------+
|id |input_timestamp        |timestamptype      |
+---+-----------------------+-------------------+
|1  |2019-07-01 12:01:19.000|2019-07-01 12:01:19|
|2  |2019-06-24 12:01:19.000|2019-06-24 12:01:19|
+---+-----------------------+-------------------+



In [16]:
# Keep only the id and the parsed timestamp, exposing the latter under the
# original column name "input_timestamp".
df2 = df1.select(
    col("id"),
    col("timestamptype").alias("input_timestamp"),
)
df2.show(truncate=False)

+---+-------------------+
|id |input_timestamp    |
+---+-------------------+
|1  |2019-07-01 12:01:19|
|2  |2019-06-24 12:01:19|
+---+-------------------+



In [18]:
# Use cast to convert the timestamp column back to a string (not to DateType);
# the printed schema below confirms the resulting column type.
df3 = df2.select(col("id"), col("input_timestamp").cast('string'))
df3.show(truncate=False)
df3.printSchema()

+---+-------------------+
|id |input_timestamp    |
+---+-------------------+
|1  |2019-07-01 12:01:19|
|2  |2019-06-24 12:01:19|
+---+-------------------+

root
 |-- id: string (nullable = true)
 |-- input_timestamp: string (nullable = true)



In [19]:
# Convert the timestamp column to a date with to_date, dropping the time-of-day part.
df4 = df2.select("id", to_date("input_timestamp"))
df4.show(truncate=False)

+---+------------------------+
|id |to_date(input_timestamp)|
+---+------------------------+
|1  |2019-07-01              |
|2  |2019-06-24              |
+---+------------------------+

