In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session (named 'Practice_1') that every later cell reads through.
spark = (
    SparkSession.builder
    .appName('Practice_1')
    .getOrCreate()
)

Method 1: Define an explicit schema that lists each column's name and data type


In [4]:
# DDL-style schema string: one "<name> <type>" entry per column, in file order.
# The adjacent literals are joined by Python into a single comma-separated string.
order_schema = (
    "order_id long, "
    "order_date date, "
    "cust_id long, "
    "status string"
)

In [7]:
# Load the orders CSV with the explicit schema, so order_date is parsed as a
# date at read time instead of being left as text.
order_df = (
    spark.read
    .format('csv')
    .schema(order_schema)
    .load('data/orders_sh.csv')
)

In [8]:
# Preview the first five rows to check the parsed values.
order_df.show(n=5)

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|         status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
+--------+----------+-------+---------------+
only showing top 5 rows



In [9]:
# Confirm the declared schema was applied: order_date should be reported as date.
order_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- status: string (nullable = true)



Method 2: Read the column as a string, then convert it with functions.to_date() or functions.date_format()
- to_date('col_name', 'pattern') returns a date column
- ==> Note: the correct pattern is 'yyyy-MM-dd'; alternatively, skip this argument
- date_format('col_name', 'pattern') returns a string column formatted with the given pattern

In [32]:
from pyspark.sql.functions import to_date, date_format

In [12]:
# Same columns as order_schema, except order_date is read as a raw string so it
# can be converted explicitly in later cells.
order_schema2 = (
    "order_id long, "
    "order_date string, "
    "cust_id long, "
    "status string"
)

In [43]:
# Re-read the same CSV, this time keeping order_date as an unparsed string.
order_df2 = (
    spark.read
    .format('csv')
    .schema(order_schema2)
    .load('data/orders_sh.csv')
)

In [44]:
# With order_schema2 the order_date column is expected to be reported as string.
order_df2.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- status: string (nullable = true)



In [45]:
# Preview five rows; the raw order_date text is wider than the default column
# width, so it is displayed truncated.
order_df2.show(n=5)

+--------+--------------------+-------+---------------+
|order_id|          order_date|cust_id|         status|
+--------+--------------------+-------+---------------+
|       1|2013-07-25 00:00:...|  11599|         CLOSED|
|       2|2013-07-25 00:00:...|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|  12111|       COMPLETE|
|       4|2013-07-25 00:00:...|   8827|         CLOSED|
|       5|2013-07-25 00:00:...|  11318|       COMPLETE|
+--------+--------------------+-------+---------------+
only showing top 5 rows



In [46]:
# Render order_date as month-day-year text. date_format() always yields a
# string column, so this is for presentation rather than date arithmetic.
# The year uses the four-letter 'yyyy' pattern so it is always padded to four
# digits, matching the 'yyyy-MM-dd' convention used elsewhere in this notebook.
format_df = order_df2.withColumn(
    'order_date',
    date_format('order_date', 'MM-dd-yyyy'),
)

In [52]:
# Replace the string order_date with a real date column. No pattern is passed
# to to_date(), so the default conversion is used.
format_df2 = order_df2.withColumn(
    'order_date',
    to_date('order_date'),
)

In [48]:
# Compare the resulting column types of the two conversion functions.
format_df.printSchema() # date_format() result: order_date is a string column
format_df2.printSchema() # to_date() result: order_date is a date column

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- status: string (nullable = true)

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- status: string (nullable = true)



In [49]:
# Preview the re-formatted (string) order_date values.
format_df.show(n=5)

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|         status|
+--------+----------+-------+---------------+
|       1|07-25-2013|  11599|         CLOSED|
|       2|07-25-2013|    256|PENDING_PAYMENT|
|       3|07-25-2013|  12111|       COMPLETE|
|       4|07-25-2013|   8827|         CLOSED|
|       5|07-25-2013|  11318|       COMPLETE|
+--------+----------+-------+---------------+
only showing top 5 rows



In [53]:
# Preview the converted date values without truncating any cell.
format_df2.show(n=5, truncate=False)

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|status         |
+--------+----------+-------+---------------+
|1       |2013-07-25|11599  |CLOSED         |
|2       |2013-07-25|256    |PENDING_PAYMENT|
|3       |2013-07-25|12111  |COMPLETE       |
|4       |2013-07-25|8827   |CLOSED         |
|5       |2013-07-25|11318  |COMPLETE       |
+--------+----------+-------+---------------+
only showing top 5 rows

