In [None]:
# Runs before the first `pyspark` import in this notebook.
# Per findspark's README, init() makes the local Spark installation's
# `pyspark` package importable from this kernel.
import findspark
findspark.init()

In [None]:
"""
Customize before creating SparkSession

NOTE: Executor options should not exceed the physical memory available to the executors.
"""

from pyspark.conf import SparkConf

# Shared configuration object, referenced later when the SparkSession is built.
conf = SparkConf()
conf.setMaster("spark://192.168.11.77:7077")
conf.setAppName("SS:CONFIG")

# Executor / driver resource settings, applied as (key, value) pairs.
# setAll() returns the conf itself, so the cell still displays the SparkConf object.
conf.setAll(
    [
        ("spark.executor.memory", "2G"),
        ("spark.executor.cores", 2),
        ("spark.cores.max", 2),
        ("spark.driver.memory", "2G"),
    ]
)

<br>

# RDD `pyspark.SparkContext`

In [2]:
from pyspark import SparkContext

# Build the context from `conf` (defined in the configuration cell earlier in this
# notebook). A bare SparkContext() starts with default settings, and the
# SparkSession built later reuses whichever SparkContext already exists -- so the
# master and executor/driver settings in `conf` would otherwise never take effect.
#
# getOrCreate() also keeps this cell safe to re-run: calling SparkContext() a
# second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# Reduce log noise in the notebook output.
sc.setLogLevel("WARN")

In [5]:
# Distribute a small local range through the SparkContext and inspect the class of
# the object that comes back; the recorded output for this cell is
# `pyspark.rdd.PipelinedRDD`.
type(sc.parallelize(range(1, 5)))

pyspark.rdd.PipelinedRDD

<br>

# DF `pyspark.sql.SparkSession`

In [7]:
from pyspark.sql import SparkSession

# Session for the DataFrame API: start from the shared `conf`, then set the master
# URL and application name on the builder before obtaining the session.
ss = (
    SparkSession
    .builder
    .config(conf=conf)
    .master("spark://192.168.11.77:7077")
    .appName("DF")
    .getOrCreate()
)

# Reader handle used by the next cell to load data.
dataframe_reader = ss.read

In [8]:
# Load CSV data from the HDFS input directory with the `header` and `inferSchema`
# reader options switched on, then inspect the type of the result.
dataframe_data = dataframe_reader.options(header=True, inferSchema=True).csv(
    "hdfs://192.168.93.128:9000/input/"
)

type(dataframe_data)

pyspark.sql.dataframe.DataFrame

In [9]:
# Generic loader: the source format is named explicitly instead of using the
# format-specific `csv()` shortcut shown in the next cell.
df_us_states = ss.read.load("../data/all_us_states.csv", format="csv")
type(df_us_states)

pyspark.sql.dataframe.DataFrame

In [10]:
# CSV shortcut reader. No `header` option is set here; in the recorded output the
# file's header row (abbr, name) comes back as a data row under the column names
# _c0 / _c1.
df_us_states = ss.read.csv(path="../data/all_us_states.csv")
df_us_states.show(n=2)

+----+-------+
| _c0|    _c1|
+----+-------+
|abbr|   name|
|  AL|Alabama|
+----+-------+
only showing top 2 rows



<br>

# Dataset
Dataset APIs are not available in Python yet.

<br>

# Optimization
## DF optimization

In [None]:
"""
OPTION 1 : spark.sql.codegen

This will ask Spark to compile each SQL query into Java bytecode before executing it.
This codegen option could make longer or repeated queries substantially faster as Spark generates specific code to run them.

BAD: Setting this option for short or infrequently used queries adds compiler overhead
(because the compiler is invoked for each query to generate the Java bytecode).
"""

# Obtain the session with `spark.sql.codegen` explicitly set to False.
# NOTE(review): this key looks like a Spark 1.x SQL option -- confirm it still has
# an effect on the Spark version in use.
ss = SparkSession.builder.config("spark.sql.codegen", False).getOrCreate()

In [None]:
"""
OPTION 2 : spark.sql.inMemoryColumnarStorage.batchSize

Default value: 10000 in current Spark releases (verify for your version); the code below sets 1000.
While caching a DF, Spark groups rows together in batches of this size and compresses them.
Small batch size --> low compression ratio.
Larger batch size --> better memory utilization and compression.
"""

# Obtain the session with the columnar-cache batch size set to 1000 rows.
ss = SparkSession.builder.config("spark.sql.inMemoryColumnarStorage.batchSize", 1000).getOrCreate()

<br>

## `spark-submit` options

```bash
$ spark-submit --executor-memory 20G --total-executor-cores 100 filename.py
```