In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [5]:
# Obtain the Spark session for this notebook (an existing one is reused by
# getOrCreate()); the application is registered under the name 'cache'.
spark = (
    SparkSession.builder
    .appName('cache')
    .getOrCreate()
)

In [6]:
# Toy dataset: five (id, name) tuples with ids numbered consecutively from 1.
d1 = list(enumerate(["Alice", "Bob", "Charlie", "David", "Eve"], start=1))

# Turn the tuples into a DataFrame with columns "id" and "name".
df1 = spark.createDataFrame(d1, schema=["id", "name"])

In [None]:
# Print the rows of df1 as a text table to check the freshly created data.
df1.show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
|  4|  David|
|  5|    Eve|
+---+-------+



In [None]:
# Add a 'flag' column holding the literal string 'Yes' on every row.
# Note that df1 is rebound here, so later cells see the three-column frame.
df1 = df1.withColumn('flag',lit('Yes'))

In [None]:
# Show df1 via display(); in this environment that renders only the schema
# repr (column names and types), not the rows.
display(df1)

DataFrame[id: bigint, name: string, flag: string]

In [None]:
# Print df1 again to confirm the new 'flag' column is present on every row.
df1.show()

+---+-------+----+
| id|   name|flag|
+---+-------+----+
|  1|  Alice| Yes|
|  2|    Bob| Yes|
|  3|Charlie| Yes|
|  4|  David| Yes|
|  5|    Eve| Yes|
+---+-------+----+



In [None]:
# Mark df1 for caching with the default storage level (the plan printed
# further down reports it as disk + memory, deserialized, 1 replica).
# The call returns the DataFrame itself, which is what the cell displays.
df1.cache()

DataFrame[id: bigint, name: string, flag: string]

In [None]:
# Derive df2 from the cached df1: keep only the row(s) whose id equals 2.
df2 = df1.where(col('id') == 2)

In [None]:
# Run the filter and print its result.
df2.show()

+---+----+----+
| id|name|flag|
+---+----+----+
|  2| Bob| Yes|
+---+----+----+



In [None]:
# Print df2's physical plan; an InMemoryTableScan / InMemoryRelation node in
# the output indicates the filter reads from df1's cache.
df2.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (isnotnull(id#43L) AND (id#43L = 2))
   +- InMemoryTableScan [id#43L, name#44, flag#56], [isnotnull(id#43L), (id#43L = 2)]
         +- InMemoryRelation [id#43L, name#44, flag#56], StorageLevel(disk, memory, deserialized, 1 replicas)
               +- *(1) Project [id#43L, name#44, Yes AS flag#56]
                  +- *(1) Scan ExistingRDD[id#43L,name#44]




In [8]:
from pyspark.storagelevel import StorageLevel

In [10]:
# persist() can only assign a storage level to a DataFrame that does not
# already have one. On a top-to-bottom run df1 was already cached by the
# earlier df1.cache() cell, so calling persist(MEMORY_ONLY) directly would
# leave the old storage level in place. Drop any existing cache entry first
# (unpersisting a DataFrame that is not cached is harmless), then persist df1
# in memory only.
df1.unpersist()
df1.persist(StorageLevel.MEMORY_ONLY)

DataFrame[id: bigint, name: string]

In [11]:
# Rebuild df2 on top of the re-persisted df1: keep only the row(s) with id 2.
df2 = df1.where(col('id') == 2)

In [12]:
# Run the filter and print its result.
df2.show()

+---+----+
| id|name|
+---+----+
|  2| Bob|
+---+----+



In [14]:
# Print the full set of plans for df2 (parsed, analyzed and optimized logical
# plans plus the physical plan) to see where the cached relation is used.
df2.explain(extended=True)

== Parsed Logical Plan ==
'Filter ('id = 2)
+- LogicalRDD [id#0L, name#1], false

== Analyzed Logical Plan ==
id: bigint, name: string
Filter (id#0L = cast(2 as bigint))
+- LogicalRDD [id#0L, name#1], false

== Optimized Logical Plan ==
Filter (isnotnull(id#0L) AND (id#0L = 2))
+- InMemoryRelation [id#0L, name#1], StorageLevel(memory, 1 replicas)
      +- *(1) Scan ExistingRDD[id#0L,name#1]

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (isnotnull(id#0L) AND (id#0L = 2))
   +- InMemoryTableScan [id#0L, name#1], [isnotnull(id#0L), (id#0L = 2)]
         +- InMemoryRelation [id#0L, name#1], StorageLevel(memory, 1 replicas)
               +- *(1) Scan ExistingRDD[id#0L,name#1]



In [15]:
# The cached data belongs to df1, the DataFrame that was persisted. df2 is
# only a filter over it and was never persisted itself, so unpersisting df2
# releases nothing. Unpersist df1 to actually drop the cache.
# NOTE(review): a DataFrame whose plan was already built (such as the existing
# df2) may keep showing InMemoryRelation in explain(); re-create df2 from df1
# to inspect an uncached plan. Confirm on your Spark version.
df1.unpersist()

DataFrame[id: bigint, name: string]

In [16]:
# Re-run the filter after the unpersist call; the result rows are unchanged.
df2.show()

+---+----+
| id|name|
+---+----+
|  2| Bob|
+---+----+



In [18]:
# Print every plan stage for df2 again to check whether the plan still reads
# from an InMemoryRelation after the unpersist call.
df2.explain(extended=True)

== Parsed Logical Plan ==
'Filter ('id = 2)
+- LogicalRDD [id#0L, name#1], false

== Analyzed Logical Plan ==
id: bigint, name: string
Filter (id#0L = cast(2 as bigint))
+- LogicalRDD [id#0L, name#1], false

== Optimized Logical Plan ==
Filter (isnotnull(id#0L) AND (id#0L = 2))
+- InMemoryRelation [id#0L, name#1], StorageLevel(memory, 1 replicas)
      +- *(1) Scan ExistingRDD[id#0L,name#1]

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (isnotnull(id#0L) AND (id#0L = 2))
   +- InMemoryTableScan [id#0L, name#1], [isnotnull(id#0L), (id#0L = 2)]
         +- InMemoryRelation [id#0L, name#1], StorageLevel(memory, 1 replicas)
               +- *(1) Scan ExistingRDD[id#0L,name#1]

