In [1]:
from pyspark.sql import SparkSession
import getpass
# Login name of whoever is running the notebook; used below to build a
# per-user warehouse directory path.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled SparkSession on YARN.
# NOTE(review): 'spark.ui.port' = '0' presumably lets the UI bind to any
# free port on a shared gateway -- confirm against the cluster setup.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# DDL-style schema string for the orders CSV: one "name type" entry per
# column, joined with ", " into a single string.
orders_schema = ', '.join([
    'order_id long',
    'order_date date',
    'cust_id long',
    'order_status string',
])

In [3]:
# Read the orders CSV into a DataFrame, applying the explicit orders_schema.
# No other reader options (header, delimiter, ...) are set here.
orders_df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [4]:
# Preview the loaded frame. n=20 and truncate=True are show()'s defaults,
# written out so the row limit is visible to the reader.
orders_df.show(n=20, truncate=True)

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       7|2013-07-25|   4530|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|       9|2013-07-25|   5657|PENDING_PAYMENT|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
|      11|2013-07-25|    918| PAYMENT_REVIEW|
|      12|2013-07-25|   1837|         CLOSED|
|      13|2013-07-25|   9149|PENDING_PAYMENT|
|      14|2013-07-25|   9842|     PROCESSING|
|      15|2013-07-25|   2568|       COMPLETE|
|      16|2013-07-25|   7276|PENDING_PAYMENT|
|      17|2013-07-25|   2667|       COMPLETE|
|      18|2013-07-25|   1205|         CLOSED|
|      19|2013-07-25|   9488|PENDI

In [5]:
# select("*") projects every column, so the preview printed here has the
# same four columns as a plain orders_df.show().
(
    orders_df
    .select("*")
    .show()
)

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       7|2013-07-25|   4530|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|       9|2013-07-25|   5657|PENDING_PAYMENT|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
|      11|2013-07-25|    918| PAYMENT_REVIEW|
|      12|2013-07-25|   1837|         CLOSED|
|      13|2013-07-25|   9149|PENDING_PAYMENT|
|      14|2013-07-25|   9842|     PROCESSING|
|      15|2013-07-25|   2568|       COMPLETE|
|      16|2013-07-25|   7276|PENDING_PAYMENT|
|      17|2013-07-25|   2667|       COMPLETE|
|      18|2013-07-25|   1205|         CLOSED|
|      19|2013-07-25|   9488|PENDI

In [6]:
# Project a subset of columns by passing their names as plain strings.
(
    orders_df
    .select("order_id", "order_date")
    .show()
)

+--------+----------+
|order_id|order_date|
+--------+----------+
|       1|2013-07-25|
|       2|2013-07-25|
|       3|2013-07-25|
|       4|2013-07-25|
|       5|2013-07-25|
|       6|2013-07-25|
|       7|2013-07-25|
|       8|2013-07-25|
|       9|2013-07-25|
|      10|2013-07-25|
|      11|2013-07-25|
|      12|2013-07-25|
|      13|2013-07-25|
|      14|2013-07-25|
|      15|2013-07-25|
|      16|2013-07-25|
|      17|2013-07-25|
|      18|2013-07-25|
|      19|2013-07-25|
|      20|2013-07-25|
+--------+----------+
only showing top 20 rows



In [7]:
# NOTE(review): this wildcard import brings every public name from
# pyspark.sql.functions into the notebook namespace, including ones that
# share a name with Python builtins (e.g. sum, min, max). The cells shown
# here use only col, column and expr; consider importing just the names
# that are needed and moving the import to the top import cell -- confirm
# no other cell relies on additional names first.
from pyspark.sql.functions import *

In [8]:
# Demonstrates the different ways a column can be referenced inside select().
# Some columns are deliberately selected twice, so the result contains
# duplicate column names (order_date and cust_id each appear twice).
(
    orders_df
    .select(
        "order_id",               # plain string name
        orders_df.order_date,     # attribute access on the DataFrame
        orders_df['order_date'],  # item access on the DataFrame
        column('cust_id'),        # column() function
        col('cust_id'),           # col() function
        expr("order_status"),     # SQL expression string
    )
    .show()
)

+--------+----------+----------+-------+-------+---------------+
|order_id|order_date|order_date|cust_id|cust_id|   order_status|
+--------+----------+----------+-------+-------+---------------+
|       1|2013-07-25|2013-07-25|  11599|  11599|         CLOSED|
|       2|2013-07-25|2013-07-25|    256|    256|PENDING_PAYMENT|
|       3|2013-07-25|2013-07-25|  12111|  12111|       COMPLETE|
|       4|2013-07-25|2013-07-25|   8827|   8827|         CLOSED|
|       5|2013-07-25|2013-07-25|  11318|  11318|       COMPLETE|
|       6|2013-07-25|2013-07-25|   7130|   7130|       COMPLETE|
|       7|2013-07-25|2013-07-25|   4530|   4530|       COMPLETE|
|       8|2013-07-25|2013-07-25|   2911|   2911|     PROCESSING|
|       9|2013-07-25|2013-07-25|   5657|   5657|PENDING_PAYMENT|
|      10|2013-07-25|2013-07-25|   5648|   5648|PENDING_PAYMENT|
|      11|2013-07-25|2013-07-25|    918|    918| PAYMENT_REVIEW|
|      12|2013-07-25|2013-07-25|   1837|   1837|         CLOSED|
|      13|2013-07-25|2013

In [10]:
# expr() takes a SQL expression as text, so arithmetic can be written inline.
# No alias is given, so the derived column is named "(cust_id + 1)".
(
    orders_df
    .select("order_id", "cust_id", expr("cust_id + 1"))
    .show()
)

+--------+-------+-------------+
|order_id|cust_id|(cust_id + 1)|
+--------+-------+-------------+
|       1|  11599|        11600|
|       2|    256|          257|
|       3|  12111|        12112|
|       4|   8827|         8828|
|       5|  11318|        11319|
|       6|   7130|         7131|
|       7|   4530|         4531|
|       8|   2911|         2912|
|       9|   5657|         5658|
|      10|   5648|         5649|
|      11|    918|          919|
|      12|   1837|         1838|
|      13|   9149|         9150|
|      14|   9842|         9843|
|      15|   2568|         2569|
|      16|   7276|         7277|
|      17|   2667|         2668|
|      18|   1205|         1206|
|      19|   9488|         9489|
|      20|   9198|         9199|
+--------+-------+-------------+
only showing top 20 rows



In [12]:
# Filter with the Column API: like('PENDING%') keeps every status that
# starts with "PENDING", i.e. both PENDING and PENDING_PAYMENT rows.
(
    orders_df
    .select(
        "order_id",
        orders_df.order_date,
        orders_df['order_date'],
        column('cust_id'),
        col('cust_id'),
        expr("order_status"),
    )
    .where(col('order_status').like('PENDING%'))
    .show()
)

+--------+----------+----------+-------+-------+---------------+
|order_id|order_date|order_date|cust_id|cust_id|   order_status|
+--------+----------+----------+-------+-------+---------------+
|       2|2013-07-25|2013-07-25|    256|    256|PENDING_PAYMENT|
|       9|2013-07-25|2013-07-25|   5657|   5657|PENDING_PAYMENT|
|      10|2013-07-25|2013-07-25|   5648|   5648|PENDING_PAYMENT|
|      13|2013-07-25|2013-07-25|   9149|   9149|PENDING_PAYMENT|
|      16|2013-07-25|2013-07-25|   7276|   7276|PENDING_PAYMENT|
|      19|2013-07-25|2013-07-25|   9488|   9488|PENDING_PAYMENT|
|      21|2013-07-25|2013-07-25|   2711|   2711|        PENDING|
|      23|2013-07-25|2013-07-25|   4367|   4367|PENDING_PAYMENT|
|      27|2013-07-25|2013-07-25|   3241|   3241|PENDING_PAYMENT|
|      30|2013-07-25|2013-07-25|  10039|  10039|PENDING_PAYMENT|
|      33|2013-07-25|2013-07-25|   5793|   5793|PENDING_PAYMENT|
|      36|2013-07-25|2013-07-25|   5649|   5649|        PENDING|
|      39|2013-07-25|2013

In [13]:
# Filter with a SQL predicate string instead of the Column API.
# The pattern 'PENDING' has no wildcard, so LIKE behaves as an exact match:
# only rows whose status is exactly "PENDING" are returned.
# NOTE(review): if this was meant to mirror the Column-API filter
# like('PENDING%'), the pattern needs a trailing % -- confirm the intent.
(
    orders_df
    .select(
        "order_id",
        orders_df.order_date,
        orders_df['order_date'],
        column('cust_id'),
        col('cust_id'),
        expr("order_status"),
    )
    .where("order_status like 'PENDING'")
    .show()
)

+--------+----------+----------+-------+-------+------------+
|order_id|order_date|order_date|cust_id|cust_id|order_status|
+--------+----------+----------+-------+-------+------------+
|      21|2013-07-25|2013-07-25|   2711|   2711|     PENDING|
|      36|2013-07-25|2013-07-25|   5649|   5649|     PENDING|
|      39|2013-07-25|2013-07-25|   8214|   8214|     PENDING|
|      42|2013-07-25|2013-07-25|   9776|   9776|     PENDING|
|      44|2013-07-25|2013-07-25|  10500|  10500|     PENDING|
|      49|2013-07-25|2013-07-25|   1871|   1871|     PENDING|
|      55|2013-07-25|2013-07-25|   2052|   2052|     PENDING|
|      68|2013-07-25|2013-07-25|   4320|   4320|     PENDING|
|      85|2013-07-25|2013-07-25|   1485|   1485|     PENDING|
|      96|2013-07-25|2013-07-25|   8683|   8683|     PENDING|
|      97|2013-07-25|2013-07-25|  10784|  10784|     PENDING|
|     121|2013-07-26|2013-07-26|   2074|   2074|     PENDING|
|     132|2013-07-26|2013-07-26|    289|    289|     PENDING|
|     15