In [1]:
import findspark
# Runs before the pyspark import in the next cell — presumably this is what makes
# the local Spark installation importable; confirm against the findspark docs.
findspark.init()

In [2]:
from pyspark import SparkContext
# Single Spark context shared by every cell below; logging is limited to WARN.
sc = SparkContext(master="local[1]", appName="CACHE")
sc.setLogLevel("WARN")

In [3]:
# Source RDD holding the integers 1 through 10.
rdd = sc.parallelize(range(1, 11))

In [4]:
def transform1(n):
    """Announce that ``n`` reached stage 1, then hand it back unchanged.

    The print is the only side effect; it makes each evaluation of this
    stage visible so repeated computation can be observed.
    """
    message = f"Transform1 {n}"
    print(message)
    return n

# transform1 already takes a single element, so it is handed to map() directly;
# the wrapper lambda added nothing.
rdd_transform1 = rdd.map(transform1)

In [5]:
def transform2(n):
    """Announce that ``n`` reached stage 2, then hand it back unchanged.

    The print is the only side effect; it makes each evaluation of this
    stage visible so repeated computation can be observed.
    """
    message = f"Transform2 {n}"
    print(message)
    return n

# transform2 already takes a single element, so it is handed to map() directly;
# the wrapper lambda added nothing.
rdd_transform2 = rdd_transform1.map(transform2)

In [6]:
# Action #1: bring every element of the mapped RDD back to the driver.
rdd_result1 = rdd_transform2.collect()

In [7]:
# Action #2: the same collect() again — with no caching in place the
# transformations are invoked again (see the note in the persist() cell below).
rdd_result2 = rdd_transform2.collect()

In [8]:
# Action #3: a different action (sum) on the same un-cached RDD.
rdd_result3 = rdd_transform2.sum()

In [9]:
"""
Every call to rdd_transform2 results in an expensive call to its transformations.
Solution: Caching with .persist()
"""
# persist() hands back the RDD itself, which is why the cell echoes its repr.
rdd_transform2.persist()

PythonRDD[1] at collect at <ipython-input-6-7aac3a24c8a2>:1

In [12]:
# Run an action now that the RDD has been marked as persisted.
rdd_transform2.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [11]:
# Inspect the RDD's cached flag.
rdd_transform2.is_cached

True

"""
Uncaching: 2 ways:
1. Upon Driver exiting naturally.
2. rdd.unpersist()
"""

In [13]:
# Drop the cached data explicitly, then confirm the cached flag is cleared.
rdd_transform2.unpersist()
rdd_transform2.is_cached

False

---

```text
Caching Storage Levels
1. DISK ONLY - slower I/O
2. MEMORY ONLY - faster I/O
3. MEMORY and DISK - balances the above two (MEMORY first, spills over to DISK)


rdd.persist(StorageLevel.DISK_ONLY)
rdd.cache() internally calls the persist function (with MEMORY_ONLY); cache() itself takes no StorageLevel argument.
```

In [None]:
"""
.cache() calls .persist() internally, with MEMORY_ONLY.

Call to .cache() is simple; you can't pass StorageLevel options.
Want to control where data's cached? .persist(StorageLevel.<option>)
"""
# Argument-free shorthand for persisting at the MEMORY_ONLY level.
rdd_transform2.cache()


In [36]:
from pyspark import StorageLevel

# Spark refuses to change the storage level of an RDD that already has one, and
# the cache() cell above assigns MEMORY_ONLY. Clear any existing level first so
# this cell works on a top-to-bottom run and can be re-run safely; unpersist()
# on an RDD that is not currently cached has nothing to remove.
rdd_transform2.unpersist()
rdd_transform2.persist(StorageLevel.DISK_ONLY)

"""
To alter StorageLevel, .unpersist() first.
"""

PythonRDD[2] at collect at <ipython-input-9-f26e13ae07bc>:1

In [38]:
# Print the class documentation, including the constructor flags
# (useDisk, useMemory, useOffHeap, deserialized, replication).
help(StorageLevel)

Help on class StorageLevel in module pyspark.storagelevel:

class StorageLevel(builtins.object)
 |  StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication=1)
 |  
 |  Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory,
 |  whether to drop the RDD to disk if it falls out of memory, whether to keep the data in memory
 |  in a JAVA-specific serialized format, and whether to replicate the RDD partitions on multiple
 |  nodes. Also contains static constants for some commonly used storage levels, MEMORY_ONLY.
 |  Since the data is always serialized on the Python side, all the constants use the serialized
 |  formats.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, useDisk, useMemory, useOffHeap, deserialized, replication=1)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  __str__(self)
 |      Return str(self).
 |  
 |  ---------------------------