### Reading CSV files

In [0]:
%fs ls '/exam_prep/retail_db'

path,name,size,modificationTime
dbfs:/exam_prep/retail_db/README.md,README.md,806,1720271879000
dbfs:/exam_prep/retail_db/categories/,categories/,0,1720271884000
dbfs:/exam_prep/retail_db/create_db.sql,create_db.sql,10303297,1720271883000
dbfs:/exam_prep/retail_db/create_db_tables_pg.sql,create_db_tables_pg.sql,1748,1720271879000
dbfs:/exam_prep/retail_db/customers/,customers/,0,1720271878000
dbfs:/exam_prep/retail_db/departments/,departments/,0,1720271879000
dbfs:/exam_prep/retail_db/load_db_tables_pg.sql,load_db_tables_pg.sql,10297372,1720271886000
dbfs:/exam_prep/retail_db/order_items/,order_items/,0,1720271880000
dbfs:/exam_prep/retail_db/orders/,orders/,0,1720271883000
dbfs:/exam_prep/retail_db/products/,products/,0,1720271879000


In [0]:
%fs ls '/exam_prep/retail_db/orders'

path,name,size,modificationTime
dbfs:/exam_prep/retail_db/orders/part-00000,part-00000,2999944,1720271883000


In [0]:
# Explicit schema for the orders data, written as a DDL-style string of
# "column_name TYPE" pairs. It is passed to the Spark readers in the cells
# below so the resulting columns get these names and types.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

In [0]:
# Load the orders CSV directory with the declared schema set on the reader
# first, so the columns are named and typed by `schema`.
orders = (
    spark.read
    .schema(schema)
    .csv('/exam_prep/retail_db/orders')
)

In [0]:
# Preview the first five rows of the CSV-backed DataFrame.
orders.show(n=5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [0]:
# Print column names and types to confirm the declared schema was applied.
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### Reading JSON files

In [0]:
%fs ls '/exam_prep/retail_db_json/orders'

path,name,size,modificationTime
dbfs:/exam_prep/retail_db_json/orders/part-r-00000-990f5773-9005-49ba-b670-631286032674,part-r-00000-990f5773-9005-49ba-b670-631286032674,7477339,1720271868000


In [0]:
# Read the JSON orders by pointing Spark at the dataset directory rather than
# at one specific part file. Part-file names contain an id generated by the
# job that wrote them, so a hard-coded file name breaks as soon as the data is
# rewritten; the directory path stays stable and matches how the CSV orders
# are read earlier in this notebook. The directory listing shows a single part
# file, so the rows returned are the same.
orders = spark.read.json('dbfs:/exam_prep/retail_db_json/orders', schema=schema)
orders.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [0]:
# `orders` now refers to the JSON-backed DataFrame; print its column names and
# types to confirm the declared schema was applied to the JSON read as well.
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### Files can also be read with the generic `format(...)` followed by `load(...)` syntax

In [0]:
# Same JSON read through the generic reader API: choose the source with
# `format`, then `load` the dataset directory. The directory is used instead
# of a specific part file because part-file names contain a generated id and
# are not stable across rewrites of the data.
orders_json = (
    spark.read
    .format("json")
    .load("dbfs:/exam_prep/retail_db_json/orders", schema=schema)
)

orders_json.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [0]:
# CSV variant of the format/load syntax: the schema is supplied as a keyword
# argument to `load` instead of being set on the reader beforehand.
orders_csv = (
    spark.read
    .format('csv')
    .load(path='/exam_prep/retail_db/orders', schema=schema)
)
orders_csv.show(n=5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [0]:
# Alternative ordering of the same read: set the schema on the reader itself,
# then choose the format and load the directory.
orders_csv_v2 = (
    spark.read
    .schema(schema)
    .format('csv')
    .load('/exam_prep/retail_db/orders')
)

orders_csv_v2.show(n=5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [0]:
# Print column names and types to confirm this variant also yields the
# declared schema.
orders_csv_v2.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)

