In [2]:
import pyspark

from src.utils import create_spark_session

In [3]:
# Build the notebook's SparkSession through the project helper.
# "Intro" shows up as spark.app.name in the config dump further down.
spark = create_spark_session("Intro")

## Spark Context, UI and Config

### Spark UI

More details about the UI will be covered in Chap 18: Monitoring and Debugging

To access the UI

- The Spark UI is available on port 4040 of the driver node, e.g. `localhost:4040` when running locally.
- Print out the `sparkContext` object: its output includes a hyperlink to the UI (see below).

In [4]:
# Display the SparkSession object created earlier
spark

In [5]:
# Display the SparkContext held by the session; per the notes above,
# its printed form includes a hyperlink to the Spark UI
spark.sparkContext

In [6]:
# List every configuration entry of the running SparkContext
# as (key, value) string pairs
spark.sparkContext.getConf().getAll()

[('spark.driver.port', '36853'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.id', 'local-1585799269671'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', '172.27.141.95'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'Intro')]

--- 

In [7]:
# Build a 1000-row DataFrame and name its single column "number"
myRange = (
    spark
    .range(1000)
    .toDF("number")
)

## Parallelism: computation resources vs. partitions

- `partitions`: how data is physically distributed on a cluster (*A partition is a collection of rows that sit on one physical machine*)
- `executors - computation resources`: how much you can compute in parallel
- `parallelism` ~ `min(# of executors, # of partitions)`

## Transformations vs Actions

- core data structures are `immutable`

### `transformation` 

instructions on how to `modify` (derive) the core data structures -> a logical execution plan

- `narrow transformation (pipelining)`: 
    - 1-to-1 transformation (i.e. each input partition contributes to at most one output partition); 
    - `pipelining`: performed in *memory*; 
    - e.g. filter/map
- `wide transformation (shuffle)`: 
    - 1-to-N transformation (i.e. one input partition contributes to more than one output partition); 
    - `shuffle`: results are written to *disk*;
    - e.g. aggregation

#### Lazy Evaluation

- build up a plan of logical transformations (Directed Acyclic Graph)
- compile DAG to optimized physical plan
    - e.g. `predicate pushdown`: a filter can still be pushed down to the data source even if it is specified at the end of the chain of transformations

### `action`: perform the transformations

3 types of Actions:

- view data in the console
- collect data to native objects in the respective language
- write to output data sources


In [8]:
# Transformation: only declares the "even numbers" filter on myRange
divisBy2 = myRange.filter("number % 2 = 0")

# Action: executes the plan and returns the number of matching rows
divisBy2.count()

500

In [9]:
# Action: write the even numbers out as CSV under 'ha.csv'.
# The default save mode raises if the target path already exists, which makes
# this cell fail on a second run (e.g. Restart & Run All); overwrite instead so
# the cell is re-runnable.
divisBy2.write.mode("overwrite").csv('ha.csv')

## End-to-end example

In [10]:
# Load the 2015 flight summary CSV: treat the first line as a header and
# let Spark infer the column types from the data.
flightData2015 = (
    spark.read
    .option('inferSchema', 'true')
    .option('header', 'true')
    .csv('data/flight-data/csv/2015-summary.csv')
)

In [11]:
# Action: return the first 5 rows as a list of Row objects
flightData2015.take(5)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62)]

In [12]:
# Displaying the DataFrame itself shows only its column names and types, not its rows
flightData2015

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]

## the Explain plan

- From top to bottom: end result -> source of data
- Exchange: wide transformation
- File scan: narrow transformation

In [13]:
# Print the physical plan for sorting by `count`
# (read it bottom-up: file scan -> exchange -> sort)
flightData2015.sort('count').explain()

== Physical Plan ==
*(2) Sort [count#23 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#23 ASC NULLS FIRST, 200)
   +- *(1) FileScan csv [DEST_COUNTRY_NAME#21,ORIGIN_COUNTRY_NAME#22,count#23] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/guava/projects/learn-spark/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


In [14]:
# The default number of shuffle output partitions is 200 (see the Exchange step
# in the plan above) -> too many for this small dataset, so lower it to 5
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [15]:
# Action: sort by `count` and return the first 2 rows
flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

#### What just happened

- *Logical*: csv: read (narrow) -> DataFrame: sort (wide) -> DataFrame: take(2) -> Array
- *Physical*: 1 partition -> 5 partitions

### Dataframe and SQL

In [16]:
# Register the DataFrame as the temp view "flight_data_2015" so that the
# spark.sql(...) queries below can refer to it by name
flightData2015.createOrReplaceTempView("flight_data_2015")

In [17]:
# SQL flavour of "number of rows per destination country"
dest_count_query = """
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
"""
sqlWay = spark.sql(dest_count_query)

In [18]:
# DataFrame-API flavour of the same per-destination row count
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

In [19]:
# Print the physical plan of the SQL version
sqlWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#21], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#21, 5)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#21], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#21] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/guava/projects/learn-spark/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [20]:
# Print the physical plan of the DataFrame version; the output below matches
# the plan printed for sqlWay
dataFrameWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#21], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#21, 5)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#21], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#21] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/guava/projects/learn-spark/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [21]:
# Max( count )
spark.sql("SELECT max(count) FROM flight_data_2015").take(1)

[Row(max(count)=370002)]

In [22]:
# Maximum of the `count` column, DataFrame version.
# Import Spark's aggregate under an alias: a bare `import max` would shadow
# Python's built-in max() for every cell that runs after this one.
from pyspark.sql.functions import max as spark_max

flightData2015.select(spark_max("count")).take(1)

[Row(max(count)=370002)]

In [23]:
# Top five destination countries by total flight count, computed twice:
# once through SQL and once through the DataFrame API.

## sql
# The selected column is spelled DEST_COUNTRY_NAME (it was DEST_COUNTRy_NAME) so
# the output column name matches the DataFrame version below.
maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, SUM(count) AS destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY destination_total DESC
LIMIT 5
""")
maxSql.show()

## data frame
from pyspark.sql.functions import desc  # imported here; a later cell also uses desc()

(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)



+-----------------+-----------------+
|DEST_COUNTRy_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



Dataframe methods accept:
1. strings ( as column names)
2. `Column` types or expressions (e.g. `desc("destination_total")`)

In [24]:
# Print the physical plan of the "top five destinations" DataFrame query
# instead of executing it.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#104L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#21,destination_total#104L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#21], functions=[sum(cast(count#23 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#21, 5)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#21], functions=[partial_sum(cast(count#23 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#21,count#23] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/guava/projects/learn-spark/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>
