In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession.\
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

spark

In [2]:
orders_df = spark.read.format("csv")\
.option("header","true")\
.option("inferSchema","true")\
.load("/public/trendytech/orders_wh/*")

In [3]:
orders_df.show(truncate=0)

+--------+---------------------+-----------+---------------+
|order_id|order_date           |customer_id|order_status   |
+--------+---------------------+-----------+---------------+
|1       |2013-07-25 00:00:00.0|11599      |CLOSED         |
|2       |2013-07-25 00:00:00.0|256        |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111      |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827       |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318      |COMPLETE       |
|6       |2013-07-25 00:00:00.0|7130       |COMPLETE       |
|7       |2013-07-25 00:00:00.0|4530       |COMPLETE       |
|8       |2013-07-25 00:00:00.0|2911       |PROCESSING     |
|9       |2013-07-25 00:00:00.0|5657       |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00.0|5648       |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00.0|918        |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00.0|1837       |CLOSED         |
|13      |2013-07-25 00:00:00.0|9149       |PENDING_PAYMENT|
|14      |2013-07-25 00:

In [4]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [5]:
transformed_df1 = orders_df.withColumnRenamed("order_status","status")

In [6]:
from pyspark.sql.functions import *
transformed_df2 = transformed_df1.withColumn("order_date_new",to_timestamp("order_date"))

In [7]:
transformed_df2.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- order_date_new: timestamp (nullable = true)



In [8]:
fin_df = transformed_df2.select("order_id","customer_id","status","order_date_new")\
.withColumnRenamed("order_date_new","order_date ")

In [9]:
fin_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- order_date : timestamp (nullable = true)



In [10]:
orders_df=spark.read.\
format("csv").\
option("header","true").\
option("inferSchema","true").\
load("/public/trendytech/orders_wh/*")

In [11]:
orders_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [12]:
orders_df2=spark.read\
.csv("/public/trendytech/orders_wh/*",header=True,inferSchema=True)

In [13]:
orders_df2.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [14]:
orders_df_json=spark.read\
.json("/public/trendytech/datasets/orders.json")
orders_df_json.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [15]:
orders_df_parquet = spark.read\
.parquet("/public/trendytech/datasets/ordersparquet/")
orders_df_parquet.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [16]:
orders_df_orc = spark.read\
.orc("/public/trendytech/datasets/ordersorc/")
orders_df_orc.show()

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
|       7130|2013-07-25 00:00:...|       6|       COMPLETE|
|       4530|2013-07-25 00:00:...|       7|       COMPLETE|
|       2911|2013-07-25 00:00:...|       8|     PROCESSING|
|       5657|2013-07-25 00:00:...|       9|PENDING_PAYMENT|
|       5648|2013-07-25 00:00:...|      10|PENDING_PAYMENT|
|        918|2013-07-25 00:00:...|      11| PAYMENT_REVIEW|
|       1837|2013-07-25 00:00:...|      12|         CLOSED|
|       9149|2013-07-25 00:00:...|      13|PENDING_PAYMENT|
|       9842|2013-07-25 00:00:...|      

In [17]:
filtered_df=orders_df_orc.where("customer_id=11599")
#or
#orders_df_orc.filter("customer_id=11599")
filtered_df.show(truncate=0)

+-----------+---------------------+--------+------------+
|customer_id|order_date           |order_id|order_status|
+-----------+---------------------+--------+------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED      |
|11599      |2013-10-03 00:00:00.0|11397   |COMPLETE    |
|11599      |2013-12-20 00:00:00.0|23908   |COMPLETE    |
|11599      |2014-06-27 00:00:00.0|53545   |PENDING     |
|11599      |2013-10-17 00:00:00.0|59911   |PROCESSING  |
+-----------+---------------------+--------+------------+



In [18]:
orders_df_orc.createOrReplaceTempView("orders") #df to a table/view conversion

In [19]:
filtered_df=spark.sql("""select * from orders where order_status='CLOSED'""")
filtered_df.show(truncate=0)

+-----------+---------------------+--------+------------+
|customer_id|order_date           |order_id|order_status|
+-----------+---------------------+--------+------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED      |
|8827       |2013-07-25 00:00:00.0|4       |CLOSED      |
|1837       |2013-07-25 00:00:00.0|12      |CLOSED      |
|1205       |2013-07-25 00:00:00.0|18      |CLOSED      |
|11441      |2013-07-25 00:00:00.0|24      |CLOSED      |
|9503       |2013-07-25 00:00:00.0|25      |CLOSED      |
|5863       |2013-07-25 00:00:00.0|37      |CLOSED      |
|12271      |2013-07-25 00:00:00.0|51      |CLOSED      |
|7073       |2013-07-25 00:00:00.0|57      |CLOSED      |
|4791       |2013-07-25 00:00:00.0|61      |CLOSED      |
|9111       |2013-07-25 00:00:00.0|62      |CLOSED      |
|3065       |2013-07-25 00:00:00.0|87      |CLOSED      |
|9131       |2013-07-25 00:00:00.0|90      |CLOSED      |
|5116       |2013-07-25 00:00:00.0|101     |CLOSED      |
|8763       |2

In [20]:
#from spark table to df

spark_tb_df = spark.read.table("orders")
spark_tb_df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [21]:
spark.sql("create database if not exists 009490_retail").show()

++
||
++
++



In [22]:
spark.sql("show databases ").filter("namespace like '009490%'").show()

+-------------+
|    namespace|
+-------------+
|009490_retail|
+-------------+



In [23]:
spark.sql("use 009490_retail ").show()

++
||
++
++



In [24]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   orders|       true|
+--------+---------+-----------+



In [25]:
spark.sql("""CREATE table if not exists 009490_retail.orders (order_id INT, order_date STRING, customer_id INT, order_status STRING)
""").show()

++
||
++
++



In [34]:
spark.sql("show tables").show()

+-------------+----------+-----------+
|     database| tableName|isTemporary|
+-------------+----------+-----------+
|009490_retail|orders_ext|      false|
|             |    orders|       true|
+-------------+----------+-----------+



In [27]:
spark.sql("""insert into 009490_retail.orders select * from orders""")

In [28]:
spark.sql("""select * from 009490_retail.orders limit 5""").show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|    8702|2014-02-23 00:00:...|      34565|       COMPLETE|
|    3066|2014-02-23 00:00:...|      34566|PENDING_PAYMENT|
|    7314|2014-02-23 00:00:...|      34567|SUSPECTED_FRAUD|
|    1271|2014-02-23 00:00:...|      34568|       COMPLETE|
|   11083|2014-02-23 00:00:...|      34569|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [29]:
spark.sql("show create table 009490_retail.orders").show(truncate=0)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|createtab_stmt                                                                                                                                                                                      |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CREATE TABLE `009490_retail`.`orders` (
  `order_id` INT,
  `order_date` STRING,
  `customer_id` INT,
  `order_status` STRING)
USING text
TBLPROPERTIES (
  'transient_lastDdlTime' = '1698521421')
|
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [30]:
spark.sql("desc extended  009490_retail.orders").show(truncate=0)

+----------------------------+------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                     |comment|
+----------------------------+------------------------------------------------------------------------------+-------+
|order_id                    |int                                                                           |null   |
|order_date                  |string                                                                        |null   |
|customer_id                 |int                                                                           |null   |
|order_status                |string                                                                        |null   |
|                            |                                                                              |       |
|# Detailed Table Information|                          

In [31]:
spark.sql("drop table 009490_retail.orders ")

In [33]:
spark.sql("""CREATE table if not exists 009490_retail.orders_ext (order_id INT, order_date STRING, customer_id INT, order_status STRING)
using csv location '/public/trendytech/retail_db/orders'""").show()

++
||
++
++



In [35]:
spark.sql("""desc formatted 009490_retail.orders_ext""").show(truncate=0)

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|order_id                    |int                                                             |null   |
|order_date                  |string                                                          |null   |
|customer_id                 |int                                                             |null   |
|order_status                |string                                                          |null   |
|                            |                                                                |       |
|# Detailed Table Information|                                                                |       |
|Database                    |009490_retail                     

In [36]:
spark.sql("""truncate table 009490_retail.orders_ext""").show(truncate=0)

AnalysisException: Operation not allowed: TRUNCATE TABLE on external tables: `009490_retail`.`orders_ext`