In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

# Build (or reuse, via getOrCreate) a Hive-enabled SparkSession that targets the YARN master.
# NOTE(review): the warehouse directory points at one specific user's HDFS home --
# confirm it is the intended location before sharing or re-running as another user.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("plans")
    .enableHiveSupport()
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Bare expression as the last statement so the notebook displays the session object.
spark

In [2]:
# Column names and types for the orders file, declared as a DDL-formatted string
# and handed to the reader as an explicit schema.
order_schema = 'order_id  long, order_date string, customer_id long, order_status string '

# Define the orders DataFrame over the CSV file using the schema above.
orders_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [3]:
# Register the orders DataFrame as the temp view "orders" so Spark SQL queries can reference it by name.
orders_df.createOrReplaceTempView(name="orders")

In [4]:
# Preview the "orders" view: print the first 20 rows, truncating long cell values
# (these are show()'s defaults, spelled out here for clarity).
spark.sql("select * from orders").show(n=20, truncate=True)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [8]:
# Two stacked filters on order_id: the inner subquery keeps order_id < 500 and
# the outer query narrows that result to order_id < 200.
nested_filter_sql = """
select order_id,order_status from (select order_id,customer_id,order_status from orders where order_id<500)
where order_id<200
"""

nested_filter_df = spark.sql(nested_filter_sql)
nested_filter_df.show()

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
|       3|       COMPLETE|
|       4|         CLOSED|
|       5|       COMPLETE|
|       6|       COMPLETE|
|       7|       COMPLETE|
|       8|     PROCESSING|
|       9|PENDING_PAYMENT|
|      10|PENDING_PAYMENT|
|      11| PAYMENT_REVIEW|
|      12|         CLOSED|
|      13|PENDING_PAYMENT|
|      14|     PROCESSING|
|      15|       COMPLETE|
|      16|PENDING_PAYMENT|
|      17|       COMPLETE|
|      18|         CLOSED|
|      19|PENDING_PAYMENT|
|      20|     PROCESSING|
+--------+---------------+
only showing top 20 rows



In [9]:
# Nested-filter query (inner: order_id < 500, outer: order_id < 200), inspected with an
# extended explain so the parsed, analyzed, optimized and physical plans are all printed.
nested_filter_plan_sql = """
select order_id,order_status from (select order_id,customer_id,order_status from orders where order_id<500)
where order_id<200
"""

nested_filter_plan_df = spark.sql(nested_filter_plan_sql)
nested_filter_plan_df.explain(extended=True)

== Parsed Logical Plan ==
'Project ['order_id, 'order_status]
+- 'Filter ('order_id < 200)
   +- 'SubqueryAlias __auto_generated_subquery_name
      +- 'Project ['order_id, 'customer_id, 'order_status]
         +- 'Filter ('order_id < 500)
            +- 'UnresolvedRelation [orders], [], false

== Analyzed Logical Plan ==
order_id: bigint, order_status: string
Project [order_id#0L, order_status#3]
+- Filter (order_id#0L < cast(200 as bigint))
   +- SubqueryAlias __auto_generated_subquery_name
      +- Project [order_id#0L, customer_id#2L, order_status#3]
         +- Filter (order_id#0L < cast(500 as bigint))
            +- SubqueryAlias orders
               +- Relation[order_id#0L,order_date#1,customer_id#2L,order_status#3] csv

== Optimized Logical Plan ==
Project [order_id#0L, order_status#3]
+- Filter ((isnotnull(order_id#0L) AND (order_id#0L < 500)) AND (order_id#0L < 200))
   +- Relation[order_id#0L,order_date#1,customer_id#2L,order_status#3] csv

== Physical Plan ==
*(1) Filter 

In [10]:
# Explicit schema for the customers data, written as a DDL-formatted string.
# NOTE(review): pincode is declared as long; if any postal codes carry leading zeros
# they would be lost when parsed as a number -- confirm against the data.
customers_schema = """customerid long, customer_fname string, customer_lname string, username string, password string,
address string, city string, state string, pincode long"""

# Define the customers DataFrame over the CSV data using the schema above.
customers_df = (
    spark.read
    .format("csv")
    .schema(customers_schema)
    .load("/public/trendytech/retail_db/customers")
)

In [11]:
# Register the customers DataFrame as the temp view "customers" for use in Spark SQL.
customers_df.createOrReplaceTempView(name="customers")

In [14]:
# Inner join of orders to customers on the customer key, with the order_status filter
# written after the join; the extended explain prints every plan stage for it.
closed_orders_join_sql = """select * from orders 
inner join customers
on orders.customer_id == customers.customerid
where order_status='CLOSED'
"""

closed_orders_join_df = spark.sql(closed_orders_join_sql)
closed_orders_join_df.explain(extended=True)

== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('order_status = CLOSED)
   +- 'Join Inner, ('orders.customer_id = 'customers.customerid)
      :- 'UnresolvedRelation [orders], [], false
      +- 'UnresolvedRelation [customers], [], false

== Analyzed Logical Plan ==
order_id: bigint, order_date: string, customer_id: bigint, order_status: string, customerid: bigint, customer_fname: string, customer_lname: string, username: string, password: string, address: string, city: string, state: string, pincode: bigint
Project [order_id#0L, order_date#1, customer_id#2L, order_status#3, customerid#48L, customer_fname#49, customer_lname#50, username#51, password#52, address#53, city#54, state#55, pincode#56L]
+- Filter (order_status#3 = CLOSED)
   +- Join Inner, (customer_id#2L = customerid#48L)
      :- SubqueryAlias orders
      :  +- Relation[order_id#0L,order_date#1,customer_id#2L,order_status#3] csv
      +- SubqueryAlias customers
         +- Relation[customerid#48L,customer_fname#49,custo

In [15]:
# Per-customer row count with two overlapping IN filters: the subquery keeps customers
# 1-5 and the outer query keeps only 1-3. The extended explain prints every plan stage.
customer_counts_sql = """select customer_id,count(1) from (select * from orders where customer_id in (1,2,3,4,5))
where customer_id in (1,2,3)
group by customer_id
"""

customer_counts_df = spark.sql(customer_counts_sql)
customer_counts_df.explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['customer_id], ['customer_id, unresolvedalias('count(1), None)]
+- 'Filter 'customer_id IN (1,2,3)
   +- 'SubqueryAlias __auto_generated_subquery_name
      +- 'Project [*]
         +- 'Filter 'customer_id IN (1,2,3,4,5)
            +- 'UnresolvedRelation [orders], [], false

== Analyzed Logical Plan ==
customer_id: bigint, count(1): bigint
Aggregate [customer_id#2L], [customer_id#2L, count(1) AS count(1)#172L]
+- Filter cast(customer_id#2L as bigint) IN (cast(1 as bigint),cast(2 as bigint),cast(3 as bigint))
   +- SubqueryAlias __auto_generated_subquery_name
      +- Project [order_id#0L, order_date#1, customer_id#2L, order_status#3]
         +- Filter cast(customer_id#2L as bigint) IN (cast(1 as bigint),cast(2 as bigint),cast(3 as bigint),cast(4 as bigint),cast(5 as bigint))
            +- SubqueryAlias orders
               +- Relation[order_id#0L,order_date#1,customer_id#2L,order_status#3] csv

== Optimized Logical Plan ==
Aggregate [customer_i