In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

username

'nghiaht7'

In [4]:
spark = SparkSession. \
    builder. \
    config('spark.ui.port', '4444'). \
    config("spark.sql.warehouse.dir", "hdfs://0.0.0.0:9000/user/hive/warehouse/"). \
    enableHiveSupport(). \
    appName(f'{username} | Python - Data Processing - Overview'). \
    master('yarn'). \
    getOrCreate()

In [5]:
spark

In [6]:
spark.read.csv?

In [8]:
# help(spark.read.csv)

In [15]:
orders = spark. \
    read. \
    csv('/home/nghiaht7/data-engineer/data-engineering-essentials/retail_db/orders',
        schema='''
            order_id INT, 
            order_date STRING, 
            order_customer_id INT, 
            order_status STRING
        '''
       )

In [16]:
orders.show()

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|             1837|         CLOSED|
|      13|

In [14]:
!hdfs dfs -ls -h /user/hive/warehouse/retail_db.db/orders/part-00000

-rwxr-xrwx   1 hive hadoop      2.9 M 2021-08-24 17:16 /user/hive/warehouse/retail_db.db/orders/part-00000


In [18]:
type(orders)

pyspark.sql.dataframe.DataFrame

In [31]:
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [33]:
orders_infer = spark. \
    read. \
    csv('/home/nghiaht7/data-engineer/data-engineering-essentials/retail_db/orders',
        header=True,
        inferSchema=True
       )
orders.printSchema()

[Stage 7:>                                                          (0 + 1) / 1]

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



                                                                                

In [35]:
orders.count()

68883

In [36]:
orders.distinct().count()

                                                                                

68883

In [37]:
employees = [(1, "Scott", "Tiger", 1000.0, "united states"),
             (2, "Henry", "Ford", 1250.0, "India"),
             (3, "Nick", "Junior", 750.0, "united KINGDOM"),
             (4, "Bill", "Gomes", 1500.0, "AUSTRALIA")
            ]

In [38]:
employeesDF = spark. \
    createDataFrame(employees,
                    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, nationality STRING"""
                   )

In [39]:
employeesDF.show()

[Stage 14:>                                                         (0 + 1) / 1]                                                                                

+-----------+----------+---------+------+--------------+
|employee_id|first_name|last_name|salary|   nationality|
+-----------+----------+---------+------+--------------+
|          1|     Scott|    Tiger|1000.0| united states|
|          2|     Henry|     Ford|1250.0|         India|
|          3|      Nick|   Junior| 750.0|united KINGDOM|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|
+-----------+----------+---------+------+--------------+



In [41]:
employeesDF.select("first_name", "last_name")

first_name,last_name
Scott,Tiger
Henry,Ford
Nick,Junior
Bill,Gomes


In [45]:
# immutable

employeesDF. \
    drop("nationality"). \
    show()

+-----------+----------+---------+------+
|employee_id|first_name|last_name|salary|
+-----------+----------+---------+------+
|          1|     Scott|    Tiger|1000.0|
|          2|     Henry|     Ford|1250.0|
|          3|      Nick|   Junior| 750.0|
|          4|      Bill|    Gomes|1500.0|
+-----------+----------+---------+------+



In [46]:
employeesDF.show()

+-----------+----------+---------+------+--------------+
|employee_id|first_name|last_name|salary|   nationality|
+-----------+----------+---------+------+--------------+
|          1|     Scott|    Tiger|1000.0| united states|
|          2|     Henry|     Ford|1250.0|         India|
|          3|      Nick|   Junior| 750.0|united KINGDOM|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|
+-----------+----------+---------+------+--------------+



In [47]:
from pyspark.sql.functions import *

employeesDF. \
    withColumn('full_name', concat('first_name', lit(' '), 'last_name')). \
    show()

+-----------+----------+---------+------+--------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|  full_name|
+-----------+----------+---------+------+--------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states|Scott Tiger|
|          2|     Henry|     Ford|1250.0|         India| Henry Ford|
|          3|      Nick|   Junior| 750.0|united KINGDOM|Nick Junior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA| Bill Gomes|
+-----------+----------+---------+------+--------------+-----------+



In [48]:
employeesDF.selectExpr('*', 'concat(first_name, " ", last_name) AS full_name').show()

+-----------+----------+---------+------+--------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|  full_name|
+-----------+----------+---------+------+--------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states|Scott Tiger|
|          2|     Henry|     Ford|1250.0|         India| Henry Ford|
|          3|      Nick|   Junior| 750.0|united KINGDOM|Nick Junior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA| Bill Gomes|
+-----------+----------+---------+------+--------------+-----------+



In [51]:
# employeesDF. \
#     withColumn("full_name", concat("first_name", ", ", "last_name")). \
#     drop("first_name", "last_name"). \
#     show()

In [50]:
employeesDF. \
    withColumn("full_name", concat("first_name", lit(", "), "last_name")). \
    drop("first_name", "last_name"). \
    show()

+-----------+------+--------------+------------+
|employee_id|salary|   nationality|   full_name|
+-----------+------+--------------+------------+
|          1|1000.0| united states|Scott, Tiger|
|          2|1250.0|         India| Henry, Ford|
|          3| 750.0|united KINGDOM|Nick, Junior|
|          4|1500.0|     AUSTRALIA| Bill, Gomes|
+-----------+------+--------------+------------+



In [52]:
employeesDF. \
    select("employee_id",
           concat("first_name", lit(", "), "last_name"),
           "salary",
           "nationality"
          ). \
    show()

+-----------+---------------------------------+------+--------------+
|employee_id|concat(first_name, , , last_name)|salary|   nationality|
+-----------+---------------------------------+------+--------------+
|          1|                     Scott, Tiger|1000.0| united states|
|          2|                      Henry, Ford|1250.0|         India|
|          3|                     Nick, Junior| 750.0|united KINGDOM|
|          4|                      Bill, Gomes|1500.0|     AUSTRALIA|
+-----------+---------------------------------+------+--------------+



In [54]:
employeesDF. \
    select("employee_id",
           concat("first_name", lit(", "), "last_name").alias("full_name"),
           "salary",
           "nationality"
          ). \
    show()

+-----------+------------+------+--------------+
|employee_id|   full_name|salary|   nationality|
+-----------+------------+------+--------------+
|          1|Scott, Tiger|1000.0| united states|
|          2| Henry, Ford|1250.0|         India|
|          3|Nick, Junior| 750.0|united KINGDOM|
|          4| Bill, Gomes|1500.0|     AUSTRALIA|
+-----------+------------+------+--------------+



In [56]:
spark.sql('Show databases').show()

+---------+
|namespace|
+---------+
|  default|
|retail_db|
+---------+



In [58]:
orders. \
    write. \
    parquet('/home/nghiaht7/data-engineer/data-engineering-essentials/data/orders_write', 
            mode='overwrite', 
            compression='none'
           )

21/08/25 16:57:38 WARN DataStreamer: Exception for BP-1328728289-192.168.1.11-1629794774508:blk_1073742166_1343
java.net.SocketTimeoutException: 65000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/192.168.1.11:49120 remote=/192.168.1.11:9866]
	at org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)
	at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)
	at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:131)
	at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:118)
	at java.io.FilterInputStream.read(FilterInputStream.java:83)
	at java.io.FilterInputStream.read(FilterInputStream.java:83)
	at org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)
	at org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck.readFields(PipelineAck.java:213)
	at org.apache.hadoop.hdfs.DataStreamer$ResponseProcessor.run(Dat

In [60]:
orders. \
    write. \
    mode('overwrite'). \
    option('compression', 'none'). \
    format('parquet'). \
    save('/home/nghiaht7/data-engineer/data-engineering-essentials/data/orders_write')

21/08/25 17:01:19 ERROR AsyncEventQueue: Listener EventLoggingListener threw an exception
java.io.IOException: All datanodes [DatanodeInfoWithStorage[192.168.1.11:9866,DS-0afc7d69-3357-47bc-87d1-75a440270c7c,DISK]] are bad. Aborting...
	at org.apache.hadoop.hdfs.DataStreamer.handleBadDatanode(DataStreamer.java:1561)
	at org.apache.hadoop.hdfs.DataStreamer.setupPipelineInternal(DataStreamer.java:1495)
	at org.apache.hadoop.hdfs.DataStreamer.setupPipelineForAppendOrRecovery(DataStreamer.java:1481)
	at org.apache.hadoop.hdfs.DataStreamer.processDatanodeOrExternalError(DataStreamer.java:1256)
	at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:667)
21/08/25 17:01:19 ERROR AsyncEventQueue: Listener EventLoggingListener threw an exception
java.io.IOException: All datanodes [DatanodeInfoWithStorage[192.168.1.11:9866,DS-0afc7d69-3357-47bc-87d1-75a440270c7c,DISK]] are bad. Aborting...
	at org.apache.hadoop.hdfs.DataStreamer.handleBadDatanode(DataStreamer.java:1561)
	at org.apache.hado

In [63]:
orders. \
    coalesce(1). \
    write. \
    csv('/home/nghiaht7/data-engineer/data-engineering-essentials/data/orders_csv_gzip',
        sep='|',
        mode='overwrite',
        compression='gzip'
       )

21/08/25 17:04:04 ERROR AsyncEventQueue: Listener EventLoggingListener threw an exception
java.io.IOException: All datanodes [DatanodeInfoWithStorage[192.168.1.11:9866,DS-0afc7d69-3357-47bc-87d1-75a440270c7c,DISK]] are bad. Aborting...
	at org.apache.hadoop.hdfs.DataStreamer.handleBadDatanode(DataStreamer.java:1561)
	at org.apache.hadoop.hdfs.DataStreamer.setupPipelineInternal(DataStreamer.java:1495)
	at org.apache.hadoop.hdfs.DataStreamer.setupPipelineForAppendOrRecovery(DataStreamer.java:1481)
	at org.apache.hadoop.hdfs.DataStreamer.processDatanodeOrExternalError(DataStreamer.java:1256)
	at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:667)
21/08/25 17:04:04 ERROR AsyncEventQueue: Listener EventLoggingListener threw an exception
java.io.IOException: All datanodes [DatanodeInfoWithStorage[192.168.1.11:9866,DS-0afc7d69-3357-47bc-87d1-75a440270c7c,DISK]] are bad. Aborting...
	at org.apache.hadoop.hdfs.DataStreamer.handleBadDatanode(DataStreamer.java:1561)
	at org.apache.hado