In [1]:
%load_ext nb_mypy

Version 1.0.2


In [2]:
%nb_mypy On

In [3]:
from pyspark.sql import SparkSession, DataFrame, Window
import pyspark.sql.functions as F
from datetime import datetime
import pathlib
from __future__ import annotations
from typing import Union, List, Tuple, Optional, TypeVar, Any

In [4]:
import os.path
from os import path


def read_hard_drive_data(spark: SparkSession, data_location: str, schema: str) -> DataFrame:
    """Read CSV data through ``spark.read.csv`` and return the resulting DataFrame.

    Args:
        spark: Session whose ``read`` interface performs the load.
        data_location: Location handed to the CSV reader as its first argument.
        schema: Schema handed to the CSV reader (callers in this notebook
            pass a DDL-style string such as ``'order_id INT, ...'``).

    Returns:
        Whatever ``spark.read.csv`` returns for the given location and schema.
    """
    # header is passed as the string 'false', exactly as the reader option is spelled.
    reader_options = {'header': 'false', 'schema': schema}
    return spark.read.csv(data_location, **reader_options)


In [5]:
def create_data_frame(spark: SparkSession, data: List[Tuple[Any, ...]], schema: str) -> DataFrame:
    """Build a DataFrame from in-memory rows via ``spark.createDataFrame``.

    Args:
        spark: Session used to create the DataFrame.
        data: Rows, one tuple per row.
        schema: Schema forwarded as the second positional argument
            (callers in this notebook pass a DDL-style string).

    Returns:
        The object returned by ``spark.createDataFrame(data, schema)``.
    """
    return spark.createDataFrame(data, schema)

In [6]:
from pyspark.sql.functions import date_format

def dateformat(df: DataFrame) -> DataFrame:
    """Return ``df`` with an extra ``order_month`` column.

    The new column is ``order_date`` formatted with the pattern ``'yyyyMM'``.
    All existing columns are kept (``'*'``) and the new column is appended
    after them.

    Args:
        df: DataFrame that has an ``order_date`` column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    order_month = date_format('order_date', 'yyyyMM').alias('order_month')
    return df.select('*', order_month)
    

In [7]:
import getpass

# The OS account name is embedded in the Spark application name below.
username = getpass.getuser()

# Build (or reuse) the Hive-enabled session. The UI port, warehouse
# directory and master are fixed values for this environment.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '4000')
    .config("spark.sql.warehouse.dir", "hdfs://0.0.0.0:9000/user/hive/warehouse/")
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
# Evaluate the session object so the notebook renders it as this cell's output.
spark

In [9]:
# Directory holding the retail_db orders files read by read_hard_drive_data.
orders_path = '/home/nghiaht7/data-engineer/data-engineering-essentials/retail_db/orders'

# Column definitions for the orders data, joined into one DDL-style string.
orders_schema = ', '.join([
    'order_id INT',
    'order_date STRING',
    'order_customer_id INT',
    'order_status STRING',
])


In [10]:
# Load the orders CSV data using the path and schema defined above.
orders = read_hard_drive_data(
    spark=spark,
    data_location=orders_path,
    schema=orders_schema,
)

In [11]:
# Print a tabular preview of the orders DataFrame.
orders.show()

[Stage 0:>                                                          (0 + 1) / 1]

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|             1837|         CLOSED|
|      13|

                                                                                

In [12]:
# Count orders per month, where the month is order_date formatted as yyyyMM.
(
    orders
    .groupBy(date_format('order_date', 'yyyyMM').alias('order_month'))
    .count()
    .show()
)



+-----------+-----+
|order_month|count|
+-----------+-----+
|     201401| 5908|
|     201405| 5467|
|     201312| 5892|
|     201310| 5335|
|     201311| 6381|
|     201307| 1533|
|     201407| 4468|
|     201403| 5778|
|     201404| 5657|
|     201402| 5635|
|     201309| 5841|
|     201406| 5308|
|     201308| 5680|
+-----------+-----+



                                                                                

In [13]:
# dateformat() appends an order_month column to whatever it is given, so
# applying it again to its own result would append a second column with the
# same name. Guarding on the column list keeps this cell safe to re-run.
if 'order_month' not in orders.columns:
    orders = dateformat(orders)
orders.show()

+--------+--------------------+-----------------+---------------+-----------+
|order_id|          order_date|order_customer_id|   order_status|order_month|
+--------+--------------------+-----------------+---------------+-----------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|     201307|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|     201307|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|     201307|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|     201307|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|     201307|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|     201307|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|     201307|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|     201307|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|     201307|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT

In [14]:
# Sample employee rows. Field order per tuple:
# (employee_id, first_name, last_name, salary, nationality, phone_number, ssn),
# matching the column order of employees_schema below.
employees: List[Tuple[Any, ...]] = [
    (1, "Scott", "Tiger", 1000.0, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

# DDL-style schema string for the rows above.
employees_schema: str = """employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, nationality STRING,
                    phone_number STRING, ssn STRING"""

In [15]:
# Turn the sample rows into a DataFrame, then print its column names and types.
employeesDF = create_data_frame(
    spark=spark,
    data=employees,
    schema=employees_schema,
)
employeesDF.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- nationality: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- ssn: string (nullable = true)



In [16]:
# Project only the two name columns.
(
    employeesDF
    .select("first_name", "last_name")
    .show()
)

[Stage 12:>                                                         (0 + 1) / 1]                                                                                

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Scott|    Tiger|
|     Henry|     Ford|
|      Nick|   Junior|
|      Bill|    Gomes|
+----------+---------+



[Stage 13:>                                                         (0 + 1) / 1]                                                                                

In [18]:
from pyspark.sql.functions import col, asc, desc, upper

# Upper-case first_name and list employees by employee_id, highest first.
(
    employeesDF
    .select(
        col("employee_id"),
        upper(col("first_name")).alias("first_name_upper"),
        col("last_name"),
    )
    .orderBy(col("employee_id").desc())
    .show()
)

+-----------+----------------+---------+
|employee_id|first_name_upper|last_name|
+-----------+----------------+---------+
|          4|            BILL|    Gomes|
|          3|            NICK|   Junior|
|          2|           HENRY|     Ford|
|          1|           SCOTT|    Tiger|
+-----------+----------------+---------+



In [19]:
from pyspark.sql.functions import concat, col, lit

# Join first_name and last_name with ", " into a single full_name column.
(
    employeesDF
    .select(
        concat(col("first_name"), lit(", "), col("last_name")).alias("full_name")
    )
    .show(truncate=False)
)

+------------+
|full_name   |
+------------+
|Scott, Tiger|
|Henry, Ford |
|Nick, Junior|
|Bill, Gomes |
+------------+



In [20]:
from pyspark.sql.functions import substring, col

# Extract the trailing four digits of phone_number and ssn as integers.
# phone_number uses a negative start (-4); ssn uses the fixed position 8.
(
    employeesDF
    .select("employee_id", "phone_number", "ssn")
    .withColumn("phone_last4", substring(col("phone_number"), -4, 4).cast("int"))
    .withColumn("ssn_last4", substring(col("ssn"), 8, 4).cast("int"))
    .show()
)


+-----------+----------------+-----------+-----------+---------+
|employee_id|    phone_number|        ssn|phone_last4|ssn_last4|
+-----------+----------------+-----------+-----------+---------+
|          1| +1 123 456 7890|123 45 6789|       7890|     6789|
|          2|+91 234 567 8901|456 78 9123|       8901|     9123|
|          3|+44 111 111 1111|222 33 4444|       1111|     4444|
|          4|+61 987 654 3210|789 12 6118|       3210|     6118|
+-----------+----------------+-----------+-----------+---------+



In [21]:
# Print up to 5 rows of the employees DataFrame.
employeesDF.show(5)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [24]:
# Build one fixed-width text record per employee by padding each field.

from pyspark.sql.functions import lpad, rpad, concat

# employee_id and salary are left-padded with "0"; the text fields are
# right-padded with "-"; ssn is appended without padding.
employee_record = concat(
    lpad("employee_id", 5, "0"),
    rpad("first_name", 10, "-"),
    rpad("last_name", 10, "-"),
    lpad("salary", 10, "0"),
    rpad("nationality", 15, "-"),
    rpad("phone_number", 17, "-"),
    "ssn",
).alias("employee")

empFixedDF = employeesDF.select(employee_record)
empFixedDF.show(truncate=False)

+------------------------------------------------------------------------------+
|employee                                                                      |
+------------------------------------------------------------------------------+
|00001Scott-----Tiger-----00001000.0united states--+1 123 456 7890--123 45 6789|
|00002Henry-----Ford------00001250.0India----------+91 234 567 8901-456 78 9123|
|00003Nick------Junior----00000750.0united KINGDOM-+44 111 111 1111-222 33 4444|
|00004Bill------Gomes-----00001500.0AUSTRALIA------+61 987 654 3210-789 12 6118|
+------------------------------------------------------------------------------+



In [25]:
# Sample (date, time) string pairs used by the date/time function examples.
datetimes: List[Tuple[Any, ...]] = [
    ("2014-02-28", "2014-02-28 10:00:00.123"),
    ("2016-02-29", "2016-02-29 08:08:08.999"),
    ("2017-10-31", "2017-12-31 11:59:59.123"),
    ("2019-11-30", "2019-08-31 00:00:00.000"),
]

In [26]:
# Both columns are kept as strings; the date functions below are applied to them directly.
datetimesDF = create_data_frame(
    spark=spark,
    data=datetimes,
    schema="date STRING, time STRING",
)

datetimesDF.show(truncate=False)

+----------+-----------------------+
|date      |time                   |
+----------+-----------------------+
|2014-02-28|2014-02-28 10:00:00.123|
|2016-02-29|2016-02-29 08:08:08.999|
|2017-10-31|2017-12-31 11:59:59.123|
|2019-11-30|2019-08-31 00:00:00.000|
+----------+-----------------------+



In [34]:
import pyspark.sql.functions as F

# Shift both the date and the time column by 10 days: forwards with
# date_add and backwards with date_sub.
(
    datetimesDF
    .withColumn("date_add_date", F.date_add("date", 10))
    .withColumn("date_add_time", F.date_add("time", 10))
    .withColumn("date_sub_date", F.date_sub("date", 10))
    .withColumn("date_sub_time", F.date_sub("time", 10))
    .show()
)

+----------+--------------------+-------------+-------------+-------------+-------------+
|      date|                time|date_add_date|date_add_time|date_sub_date|date_sub_time|
+----------+--------------------+-------------+-------------+-------------+-------------+
|2014-02-28|2014-02-28 10:00:...|   2014-03-10|   2014-03-10|   2014-02-18|   2014-02-18|
|2016-02-29|2016-02-29 08:08:...|   2016-03-10|   2016-03-10|   2016-02-19|   2016-02-19|
|2017-10-31|2017-12-31 11:59:...|   2017-11-10|   2018-01-10|   2017-10-21|   2017-12-21|
|2019-11-30|2019-08-31 00:00:...|   2019-12-10|   2019-09-10|   2019-11-20|   2019-08-21|
+----------+--------------------+-------------+-------------+-------------+-------------+



In [36]:
# Truncate date with the "MM" unit and time with the "yy" unit.
(
    datetimesDF
    .withColumn("date_trunc", F.trunc("date", "MM"))
    .withColumn("time_trunc", F.trunc("time", "yy"))
    .show(truncate=False)
)

+----------+-----------------------+----------+----------+
|date      |time                   |date_trunc|time_trunc|
+----------+-----------------------+----------+----------+
|2014-02-28|2014-02-28 10:00:00.123|2014-02-01|2014-01-01|
|2016-02-29|2016-02-29 08:08:08.999|2016-02-01|2016-01-01|
|2017-10-31|2017-12-31 11:59:59.123|2017-10-01|2017-01-01|
|2019-11-30|2019-08-31 00:00:00.000|2019-11-01|2019-01-01|
+----------+-----------------------+----------+----------+



In [39]:
from pyspark.sql.functions import year, month, weekofyear, dayofmonth, \
    dayofyear, dayofweek, current_date

# One-row frame with a single "dummy" column, used only as something to select from.
df = spark.createDataFrame([("X", )]).toDF("dummy")

# Reuse one current_date() expression for every derived part. (yyyy-MM-dd)
today = current_date()
df.select(
    today.alias('current_date'),
    year(today).alias('year'),
    month(today).alias('month'),
    weekofyear(today).alias('weekofyear'),
    dayofyear(today).alias('dayofyear'),
    dayofmonth(today).alias('dayofmonth'),
    dayofweek(today).alias('dayofweek'),
).show()

+------------+----+-----+----------+---------+----------+---------+
|current_date|year|month|weekofyear|dayofyear|dayofmonth|dayofweek|
+------------+----+-----+----------+---------+----------+---------+
|  2021-08-25|2021|    8|        34|      237|        25|        4|
+------------+----+-----+----------+---------+----------+---------+



In [41]:
from pyspark.sql.functions import current_timestamp, hour, minute, second

# Reuse one current_timestamp() expression for every derived part. (yyyy-MM-dd HH:mm:ss.SSS)
now = current_timestamp()
df.select(
    now.alias('current_timestamp'),
    year(now).alias('year'),
    month(now).alias('month'),
    dayofmonth(now).alias('dayofmonth'),
    hour(now).alias('hour'),
    minute(now).alias('minute'),
    second(now).alias('second'),
).show(truncate=False)

+----------------------+----+-----+----------+----+------+------+
|current_timestamp     |year|month|dayofmonth|hour|minute|second|
+----------------------+----+-----+----------+----+------+------+
|2021-08-25 21:04:16.46|2021|8    |25        |21  |4     |16    |
+----------------------+----+-----+----------+----+------+------+



In [46]:
# Employee rows extended with a bonus field in fifth position. The bonus is
# deliberately heterogeneous: an integer, None, and an empty string.
employees = [
    (1, "Scott", "Tiger", 1000.0, 10, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, None, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, '', "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, 10, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [47]:



# Schema for the bonus-extended rows; bonus is declared as STRING.
employees_bonus_schema = """employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                    phone_number STRING, ssn STRING"""

# NOTE(review): this rebinds `employees` from the list of tuples to the
# DataFrame built from it, so the cell cannot be re-run without first
# re-running the cell that defines the list.
employees = create_data_frame(spark, employees, schema=employees_bonus_schema)

employees.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [49]:
from pyspark.sql.functions import lit, coalesce

# Cast bonus to int and fall back to 0 wherever the cast yields null.
(
    employees
    .withColumn('bonus_filled_0', coalesce(col('bonus').cast('int'), lit(0)))
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+--------------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus_filled_0|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+--------------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|            10|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|             0|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|             0|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|            10|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+--------------+



In [52]:


# Replace a NULL or empty-string bonus with 0 using a SQL CASE expression.
# bonus is a STRING column, so the ELSE branch is cast to INT to give both
# branches the same integer type, matching the bonus_filled_0 column that is
# computed with coalesce(col('bonus').cast('int'), lit(0)).
(
    employees
    .withColumn(
        'bonus_case_when',
        F.expr("""
            CASE WHEN bonus IS NULL OR bonus = '' THEN 0
            ELSE CAST(bonus AS INT)
            END
            """)
    )
    .show()
)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+---------------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus_case_when|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+---------------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|             10|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|              0|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|              0|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|             10|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+---------------+

