In [1]:
import sys
import os

In [2]:
# Sanity check: show which JDK JAVA_HOME points to (evaluates to None if the variable is unset).
os.environ.get('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
import findspark
# Make the local Spark installation importable; this runs before the pyspark imports in the next cell.
findspark.init()

In [4]:
from pyspark import SparkContext
import pyspark
import numpy as np

In [5]:
# Create the local SparkContext, or reuse the one already running in this kernel.
# Calling the SparkContext constructor a second time (e.g. when this cell is
# re-executed) raises "ValueError: Cannot run multiple SparkContexts at once";
# getOrCreate() makes the cell safe to re-run. If a context already exists, it is
# returned as-is and the configuration below is not re-applied.
sc = SparkContext.getOrCreate(
    conf=pyspark.SparkConf().setMaster("local[*]").setAppName("SparkApp")
)

In [6]:
# Distribute a small in-memory list as an RDD to experiment with storage levels.
rdd = sc.parallelize(["Hello", "World", "Spark", "Hadoop"])

#### In PySpark, stored objects are always serialized with the Pickle library, so choosing a serialized or a deserialized storage level makes no difference.

In [7]:
# Inspect the storage level before any persist() call (no Memory/Disk flag appears in the output yet).
print(rdd.getStorageLevel())

Serialized 1x Replicated


# MEMORY_ONLY
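* Store the RDD in JVM memory. (The Scala/Java API keeps deserialized Java objects; PySpark always stores pickled data, which is why the level printed below reads `Memory Serialized`.)
* If the RDD does not fit in memory, some partitions will not be cached and will be recomputed on the fly each time they are needed.
* This is the default level, i.e. the one used when `persist()` is called without arguments.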

In [8]:
# Cache the RDD in memory. MEMORY_ONLY is also what a bare rdd.persist() uses;
# it is spelled out here so the cell matches the section heading.
rdd.persist(pyspark.StorageLevel.MEMORY_ONLY)

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

#### Run an action to trigger persistence (`persist()` is lazy: nothing is cached until the RDD is first computed)

In [9]:
# persist() only marks the RDD; running an action such as collect() is what actually computes and caches it.
rdd.collect()

['Hello', 'World', 'Spark', 'Hadoop']

In [10]:
# The level now includes the Memory flag assigned by the persist() call above.
print(rdd.getStorageLevel())

Memory Serialized 1x Replicated


In [11]:
# Release the cached data and reset the storage level so a different level can be assigned next.
rdd.unpersist()

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

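# MEMORY_AND_DISK
* Store RDD as deserialized Java objects in the JVM.
* If the RDD does not fit in memory, store the partitions that don't fit on disk, and read them from there when they're needed.
* The cell below uses the `MEMORY_AND_DISK_2` variant, which behaves the same way but replicates each partition on two cluster nodes (hence `2x Replicated` in the output).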

In [12]:
# NOTE: the _2 suffix selects the replicated variant of MEMORY_AND_DISK (the output below shows "2x Replicated").
rdd.persist(pyspark.StorageLevel.MEMORY_AND_DISK_2)

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [13]:
# Trigger the computation so the partitions are stored under the newly assigned level.
rdd.collect()

['Hello', 'World', 'Spark', 'Hadoop']

In [14]:
# Expect both the Disk and Memory flags, with 2x replication.
print(rdd.getStorageLevel())

Disk Memory Serialized 2x Replicated


In [15]:
# Clear the cache again before switching to the next storage level.
rdd.unpersist()

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

# DISK_ONLY
Store the RDD partitions only on disk.

In [16]:
# Keep the persisted partitions on disk only (no in-memory copy).
rdd.persist(pyspark.StorageLevel.DISK_ONLY)

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [17]:
# Run an action so the RDD is computed and its partitions are written out under DISK_ONLY.
rdd.collect()

['Hello', 'World', 'Spark', 'Hadoop']

In [18]:
# Expect only the Disk flag, with a single replica.
print(rdd.getStorageLevel())

Disk Serialized 1x Replicated


In [19]:
# Final cleanup: remove the persisted blocks for this RDD.
rdd.unpersist()

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195