In [0]:
# Start from a clean slate: delete the target directory and everything under it.
dbutils.fs.rm("/exam_prep/courses_db", recurse=True)

True

In [0]:
from pyspark.sql import Row

import datetime


# Column names, in the order the values appear in each record below.
course_fields = (
    'course_id',
    'course_title',
    'course_published_dt',
    'is_active',
    'last_updated_ts',
)

# One tuple per course, positionally aligned with course_fields.
course_records = [
    (1, 'Mastering Python', datetime.date(2021, 1, 14), True,
     datetime.datetime(2021, 2, 18, 16, 57, 25)),
    (2, 'Data Engineering Essentials', datetime.date(2021, 2, 10), True,
     datetime.datetime(2021, 3, 5, 12, 7, 33)),
    (3, 'Mastering Pyspark', datetime.date(2021, 1, 7), True,
     datetime.datetime(2021, 4, 6, 10, 5, 42)),
    (4, 'AWS Essentials', datetime.date(2021, 3, 19), False,
     datetime.datetime(2021, 4, 10, 2, 25, 36)),
    (5, 'Docker 101', datetime.date(2021, 2, 28), True,
     datetime.datetime(2021, 3, 21, 7, 18, 52)),
]

# Pair every record with the field names to get one dict per course.
courses = [dict(zip(course_fields, record)) for record in course_records]

# Turn each dict into a Row and let Spark infer the column types from the values.
course_rows = [Row(**course) for course in courses]
courses_df = spark.createDataFrame(course_rows)

In [0]:
# Preview the first five rows of the in-memory DataFrame.
courses_df.show(n=5)

+---------+--------------------+-------------------+---------+-------------------+
|course_id|        course_title|course_published_dt|is_active|    last_updated_ts|
+---------+--------------------+-------------------+---------+-------------------+
|        1|    Mastering Python|         2021-01-14|     true|2021-02-18 16:57:25|
|        2|Data Engineering ...|         2021-02-10|     true|2021-03-05 12:07:33|
|        3|   Mastering Pyspark|         2021-01-07|     true|2021-04-06 10:05:42|
|        4|      AWS Essentials|         2021-03-19|    false|2021-04-10 02:25:36|
|        5|          Docker 101|         2021-02-28|     true|2021-03-21 07:18:52|
+---------+--------------------+-------------------+---------+-------------------+



### Write the file and specify the compression type

In [0]:
# Persist the courses as gzip-compressed CSV with a header row,
# replacing anything already stored at the target path.
(
    courses_df.write
    .mode("overwrite")
    .option("header", True)
    .option("compression", "gzip")
    .csv("/exam_prep/courses_db")
)

### Spark writes multiple compressed CSV part files (typically one per partition)

In [0]:
%fs ls '/exam_prep/courses_db'

path,name,size,modificationTime
dbfs:/exam_prep/courses_db/_SUCCESS,_SUCCESS,0,1720504977000
dbfs:/exam_prep/courses_db/_committed_5620968563558300741,_committed_5620968563558300741,390,1720504977000
dbfs:/exam_prep/courses_db/_started_5620968563558300741,_started_5620968563558300741,0,1720504977000
dbfs:/exam_prep/courses_db/part-00000-tid-5620968563558300741-30569954-ecc3-49ea-917c-aeef21916a53-98-1-c000.csv.gz,part-00000-tid-5620968563558300741-30569954-ecc3-49ea-917c-aeef21916a53-98-1-c000.csv.gz,127,1720504977000
dbfs:/exam_prep/courses_db/part-00001-tid-5620968563558300741-30569954-ecc3-49ea-917c-aeef21916a53-99-1-c000.csv.gz,part-00001-tid-5620968563558300741-30569954-ecc3-49ea-917c-aeef21916a53-99-1-c000.csv.gz,132,1720504977000
dbfs:/exam_prep/courses_db/part-00002-tid-5620968563558300741-30569954-ecc3-49ea-917c-aeef21916a53-100-1-c000.csv.gz,part-00002-tid-5620968563558300741-30569954-ecc3-49ea-917c-aeef21916a53-100-1-c000.csv.gz,130,1720504977000
dbfs:/exam_prep/courses_db/part-00003-tid-5620968563558300741-30569954-ecc3-49ea-917c-aeef21916a53-101-1-c000.csv.gz,part-00003-tid-5620968563558300741-30569954-ecc3-49ea-917c-aeef21916a53-101-1-c000.csv.gz,156,1720504977000


### When reading, Spark automatically decompresses the data

(You don't need to decompress the files yourself.)

In [0]:
# Point the reader at the output directory; the .csv.gz part files are read
# directly, with the first line of each file treated as the header.
(
    spark.read
    .format("csv")
    .option("header", True)
    .load("/exam_prep/courses_db")
    .show()
)

+---------+--------------------+-------------------+---------+--------------------+
|course_id|        course_title|course_published_dt|is_active|     last_updated_ts|
+---------+--------------------+-------------------+---------+--------------------+
|        4|      AWS Essentials|         2021-03-19|    false|2021-04-10T02:25:...|
|        5|          Docker 101|         2021-02-28|     true|2021-03-21T07:18:...|
|        2|Data Engineering ...|         2021-02-10|     true|2021-03-05T12:07:...|
|        3|   Mastering Pyspark|         2021-01-07|     true|2021-04-06T10:05:...|
|        1|    Mastering Python|         2021-01-14|     true|2021-02-18T16:57:...|
+---------+--------------------+-------------------+---------+--------------------+

