# Benchmark helper function

In [4]:
/** Runs `f` once and prints how long it took, in seconds.
  *
  * Elapsed time is measured with `System.nanoTime` taken immediately before
  * and after evaluating `f`. If `f` throws, the exception propagates and
  * nothing is printed.
  *
  * @param name label inserted into the printed message
  * @param f    by-name block to time; evaluated exactly once
  */
def benchmark(name: String)(f: => Unit): Unit = {
  val startTime = System.nanoTime()
  f
  val endTime = System.nanoTime()
  // Nanoseconds -> seconds; convert to Double first so the division is not integral.
  val elapsedSeconds = (endTime - startTime).toDouble / 1000000000
  println(s"Time taken in $name: $elapsedSeconds seconds")
}

benchmark: (name: String)(f: => Unit)Unit


# Get and set configuration

In [10]:
// Read the current value of the "spark.sql.codegen.wholeStage" setting.
spark.conf.get("spark.sql.codegen.wholeStage")

res8: String = true


In [9]:
// Set "spark.sql.codegen.wholeStage" to true through the runtime config.
spark.conf.set("spark.sql.codegen.wholeStage", true)

In [11]:
// Time a sum over the one billion ids produced by spark.range, using the
// benchmark helper; show() prints the single-row result.
benchmark("Spark 2.0") {
  spark.range(1000L * 1000 * 1000).selectExpr("sum(id)").show()
}

+------------------+
|           sum(id)|
+------------------+
|499999999500000000|
+------------------+

Time taken in Spark 2.0: 5.384864876 seconds


In [12]:
// Join a one-billion-row range with a 1000-row range on "id", count the rows,
// and print the parsed, analyzed, optimized and physical plans (explain(true)).
spark.range(1000L * 1000 * 1000).join(spark.range(1000L).toDF(), "id").selectExpr("count(*)").explain(true)

== Parsed Logical Plan ==
'Project [unresolvedalias('count(1), Some(org.apache.spark.sql.Column$$Lambda$2986/0x0000000841215040@2d4cb313))]
+- Project [id#33L]
   +- Join Inner, (id#33L = id#35L)
      :- Range (0, 1000000000, step=1, splits=Some(4))
      +- Range (0, 1000, step=1, splits=Some(4))

== Analyzed Logical Plan ==
count(1): bigint
Aggregate [count(1) AS count(1)#40L]
+- Project [id#33L]
   +- Join Inner, (id#33L = id#35L)
      :- Range (0, 1000000000, step=1, splits=Some(4))
      +- Range (0, 1000, step=1, splits=Some(4))

== Optimized Logical Plan ==
Aggregate [count(1) AS count(1)#40L]
+- Project
   +- Join Inner, (id#33L = id#35L)
      :- Range (0, 1000000000, step=1, splits=Some(4))
      +- Range (0, 1000, step=1, splits=Some(4))

== Physical Plan ==
*(3) HashAggregate(keys=[], functions=[count(1)], output=[count(1)#40L])
+- Exchange SinglePartition, true, [id=#94]
   +- *(2) HashAggregate(keys=[], functions=[partial_count(1)], output=[count#43L])
      +- *(2) Pro

# Spark Debug tools

In [19]:
import org.apache.spark.sql.execution.debug._

import org.apache.spark.sql.execution.debug._


In [22]:
// Small example query reused by the following cells:
// ids from range(1, 10), keep those with id > 3, then sum them.
val q = spark.range(1,10).toDF().filter('id>3).selectExpr("sum(id)")

q: org.apache.spark.sql.DataFrame = [sum(id): bigint]


In [23]:
// debug comes from the org.apache.spark.sql.execution.debug._ import.
// NOTE(review): in the recorded run this call ended in a
// java.lang.NullPointerException (see the output below) — cause not visible here.
q.debug

java.lang.NullPointerException: 

In [18]:
// Print each WholeStageCodegen subtree of q followed by its generated code.
q.debugCodegen

Found 2 WholeStageCodegen subtrees.
== Subtree 1 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:208(0.32% used); numInnerClasses:0) ==
*(1) HashAggregate(keys=[], functions=[partial_sum(id#44L)], output=[sum#51L])
+- *(1) Filter (id#44L > 3)
   +- *(1) Range (1, 10, step=1, splits=4)

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage1(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=1
/* 006 */ final class GeneratedIteratorForCodegenStage1 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private boolean agg_initAgg_0;
/* 010 */   private boolean agg_bufIsNull_0;
/* 011 */   private long agg_bufValue_0;
/* 012 */   private boolean range_initRange_0;
/* 013 */   private long range_nextIndex_0;
/* 014 */   private TaskContext range_taskContext_0;
/* 015 */   private InputMetrics

In [24]:
// Alternative entry point via queryExecution; the recorded output has the same
// shape as q.debugCodegen (codegen subtrees plus generated code).
q.queryExecution.debug.codegen

Found 2 WholeStageCodegen subtrees.
== Subtree 1 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:208(0.32% used); numInnerClasses:0) ==
*(1) HashAggregate(keys=[], functions=[partial_sum(id#52L)], output=[sum#59L])
+- *(1) Filter (id#52L > 3)
   +- *(1) Range (1, 10, step=1, splits=4)

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage1(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=1
/* 006 */ final class GeneratedIteratorForCodegenStage1 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private boolean agg_initAgg_0;
/* 010 */   private boolean agg_bufIsNull_0;
/* 011 */   private long agg_bufValue_0;
/* 012 */   private boolean range_initRange_0;
/* 013 */   private long range_nextIndex_0;
/* 014 */   private TaskContext range_taskContext_0;
/* 015 */   private InputMetrics




# New Explain Method

In [25]:
// "simple" mode: prints only the physical plan.
q.explain("simple")

== Physical Plan ==
*(2) HashAggregate(keys=[], functions=[sum(id#52L)])
+- Exchange SinglePartition, true, [id=#141]
   +- *(1) HashAggregate(keys=[], functions=[partial_sum(id#52L)])
      +- *(1) Filter (id#52L > 3)
         +- *(1) Range (1, 10, step=1, splits=4)




In [26]:
// "extended" mode: prints the parsed, analyzed and optimized logical plans
// followed by the physical plan.
q.explain("extended")

== Parsed Logical Plan ==
'Project [unresolvedalias('sum('id), Some(org.apache.spark.sql.Column$$Lambda$2986/0x0000000841215040@2d4cb313))]
+- Filter (id#52L > cast(3 as bigint))
   +- Range (1, 10, step=1, splits=Some(4))

== Analyzed Logical Plan ==
sum(id): bigint
Aggregate [sum(id#52L) AS sum(id)#56L]
+- Filter (id#52L > cast(3 as bigint))
   +- Range (1, 10, step=1, splits=Some(4))

== Optimized Logical Plan ==
Aggregate [sum(id#52L) AS sum(id)#56L]
+- Filter (id#52L > 3)
   +- Range (1, 10, step=1, splits=Some(4))

== Physical Plan ==
*(2) HashAggregate(keys=[], functions=[sum(id#52L)], output=[sum(id)#56L])
+- Exchange SinglePartition, true, [id=#141]
   +- *(1) HashAggregate(keys=[], functions=[partial_sum(id#52L)], output=[sum#59L])
      +- *(1) Filter (id#52L > 3)
         +- *(1) Range (1, 10, step=1, splits=4)



In [27]:
// "codegen" mode: prints the WholeStageCodegen subtrees and their generated code.
q.explain("codegen")

Found 2 WholeStageCodegen subtrees.
== Subtree 1 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:208(0.32% used); numInnerClasses:0) ==
*(1) HashAggregate(keys=[], functions=[partial_sum(id#52L)], output=[sum#59L])
+- *(1) Filter (id#52L > 3)
   +- *(1) Range (1, 10, step=1, splits=4)

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage1(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=1
/* 006 */ final class GeneratedIteratorForCodegenStage1 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private boolean agg_initAgg_0;
/* 010 */   private boolean agg_bufIsNull_0;
/* 011 */   private long agg_bufValue_0;
/* 012 */   private boolean range_initRange_0;
/* 013 */   private long range_nextIndex_0;
/* 014 */   private TaskContext range_taskContext_0;
/* 015 */   private InputMetrics

In [28]:
// "cost" mode: prints the optimized logical plan annotated with statistics
// (sizeInBytes, rowCount), followed by the physical plan.
q.explain("cost")

== Optimized Logical Plan ==
Aggregate [sum(id#52L) AS sum(id)#56L], Statistics(sizeInBytes=16.0 B, rowCount=1)
+- Filter (id#52L > 3)
   +- Range (1, 10, step=1, splits=Some(4))

== Physical Plan ==
*(2) HashAggregate(keys=[], functions=[sum(id#52L)], output=[sum(id)#56L])
+- Exchange SinglePartition, true, [id=#141]
   +- *(1) HashAggregate(keys=[], functions=[partial_sum(id#52L)], output=[sum#59L])
      +- *(1) Filter (id#52L > 3)
         +- *(1) Range (1, 10, step=1, splits=4)




In [29]:
// "formatted" mode: prints a numbered outline of the physical plan, then a
// detail section per operator (input, output, keys, functions, arguments).
q.explain("formatted")

== Physical Plan ==
* HashAggregate (5)
+- Exchange (4)
   +- * HashAggregate (3)
      +- * Filter (2)
         +- * Range (1)


(1) Range [codegen id : 1]
Output [1]: [id#52L]
Arguments: Range (1, 10, step=1, splits=Some(4))

(2) Filter [codegen id : 1]
Input [1]: [id#52L]
Condition : (id#52L > 3)

(3) HashAggregate [codegen id : 1]
Input [1]: [id#52L]
Keys: []
Functions [1]: [partial_sum(id#52L)]
Aggregate Attributes [1]: [sum#58L]
Results [1]: [sum#59L]

(4) Exchange
Input [1]: [sum#59L]
Arguments: SinglePartition, true, [id=#141]

(5) HashAggregate [codegen id : 2]
Input [1]: [sum#59L]
Keys: []
Functions [1]: [sum(id#52L)]
Aggregate Attributes [1]: [sum(id#52L)#55L]
Results [1]: [sum(id#52L)#55L AS sum(id)#56L]


