To start a Jupyter notebook through the `pyspark` launcher (the bracketed `--packages` argument is optional):
```
$ PYSPARK_DRIVER_PYTHON="jupyter" PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark
[--packages graphframes:graphframes:0.8.1-spark3.0-s_2.12]
```

The GraphFrames dependency referenced by `--packages` is installed at:
```
$SPARK_HOME/jars/graphframes-0.8.1-spark3.0-s_2.12.jar
```

See https://github.com/wgong/py4kids/blob/master/lesson-17-pyspark/spark-guide/notebook/chapter-02-intro.ipynb

In [1]:
# `spark` is not defined in this notebook; presumably it is the session object
# pre-created by the `pyspark` launcher used to start Jupyter (TODO confirm).
# Displaying it checks that the session is available.
spark

### A simple example

In [2]:
# Build a single-column DataFrame named "number" (rows 0 through 9) and print it.
myRange = spark.range(10).toDF("number")
myRange.show()

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
+------+



In [4]:
# Keep only the rows whose "number" is even, using a SQL-style predicate string.
divisBy2 = myRange.where("number % 2 = 0")
# Alternative way to inspect the result: divisBy2.collect()
divisBy2.show()

+------+
|number|
+------+
|     0|
|     2|
|     4|
|     6|
|     8|
+------+



In [5]:
from pyspark.sql.types import *

In [6]:
# Show the Python type of the object returned by toDF().
type(myRange)

pyspark.sql.dataframe.DataFrame

### An end-to-end example

In [4]:
import os
# Base directory from which the data paths further down are built, read from
# the environment. Indexing os.environ raises KeyError when the variable is
# not set.
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

In [6]:
# Honour the SPARK_BOOK_DATA_PATH environment variable when it is set and only
# fall back to the author's local directory otherwise, so this cell no longer
# silently overrides the value configured in the environment.
SPARK_BOOK_DATA_PATH = os.environ.get('SPARK_BOOK_DATA_PATH', '/home/wengong/spark_data/')

#### read from file

In [7]:
# Join the base directory and the CSV location with exactly one "/", whether or
# not SPARK_BOOK_DATA_PATH ends with a slash (plain concatenation produced
# ".../spark_data//data/..." for a base path that ends in "/").
file_path = SPARK_BOOK_DATA_PATH.rstrip("/") + "/data/flight-data/csv/2015-summary.csv"

# Read the CSV, taking column names from the header row and letting Spark infer
# the column types from the data.
flightData2015 = (
    spark
    .read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(file_path)
)

short form:

`flightData2015 = spark.read.csv(file_path, header=True, inferSchema=True)`

[Spark SQL data sources](https://spark.apache.org/docs/latest/sql-data-sources.html)

#### check schema

In [16]:
# Print the schema as a tree: column name, type and nullability.
flightData2015.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [17]:
# The same schema as a StructType object rather than printed text.
flightData2015.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,IntegerType,true)))

In [18]:
# Column names as a plain Python list.
flightData2015.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

#### Check data

In [14]:
# Number of rows in the DataFrame.
flightData2015.count()

256

In [9]:
# Preview the first 5 rows.
flightData2015.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



#### write out to parquet

In [10]:
# Destination for the Parquet copy; rstrip("/") keeps a single separator even
# when SPARK_BOOK_DATA_PATH ends with a slash.
out_path = SPARK_BOOK_DATA_PATH.rstrip("/") + "/data/flight-data/parquet/2015-summary.parquet"

In [11]:
# Persist the flight data as Parquet, overwriting whatever is already at out_path.
flightData2015.write.mode("overwrite").parquet(out_path)

In [12]:
# Load the Parquet data stored at out_path back into a new DataFrame.
flightData2015_2 = spark.read.parquet(out_path)

In [13]:
# Preview the first 5 rows of the reloaded data.
flightData2015_2.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [15]:
# Row count of the reloaded data, for comparison with the CSV-backed DataFrame.
flightData2015_2.count()

256

#### configure shuffle partition

By default Spark uses 200 shuffle partitions; the setting can be changed at runtime:

In [33]:
# Set the number of shuffle partitions to 5 for this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")

#### Transformation and Action in DataFrames

In [34]:
from pyspark.sql.functions import desc, max

In [35]:
# Top 5 destination countries by the total of the "count" column.

# Transformations: total "count" per destination, give the total column a
# readable name, order from largest to smallest, keep the first five rows.
top5_destDF = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .orderBy("destination_total", ascending=False)
    .limit(5)
)

# Action: print the result.
top5_destDF.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [36]:
# Print the physical plan for top5_destDF.
top5_destDF.explain()

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#186L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#16,destination_total#186L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[sum(cast(count#18 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#16, 5), true, [id=#411]
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[partial_sum(cast(count#18 as bigint))])
         +- FileScan csv [DEST_COUNTRY_NAME#16,count#18] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/wengong/spark_data/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




In [23]:
# Largest value in the "count" column. `max` here is the function imported from
# pyspark.sql.functions earlier in the notebook (it shadows Python's builtin
# max); take(1) returns the result as a one-element list of Row objects.
flightData2015.select(max("count")).take(1)

[Row(max(count)=370002)]

#### Transformation and Action in SQL

In [24]:
# Register the DataFrame as a temporary view so it can be queried by name in SQL.
flightData2015.createOrReplaceTempView("flight_data_2015")

In [30]:
# SQL version of the top-5-destinations query: total "count" per destination,
# ordered by that total in descending order, first five rows only.
sqlDF = spark.sql("""
    SELECT DEST_COUNTRY_NAME, sum(count)
    FROM flight_data_2015
    GROUP BY DEST_COUNTRY_NAME
    ORDER BY sum(count) desc
    LIMIT 5
""")

In [31]:
# Print the SQL query result.
sqlDF.show()

+-----------------+----------+
|DEST_COUNTRY_NAME|sum(count)|
+-----------------+----------+
|    United States|    411352|
|           Canada|      8399|
|           Mexico|      7140|
|   United Kingdom|      2025|
|            Japan|      1548|
+-----------------+----------+



In [32]:
# Print the physical plan for the SQL query, for comparison with the DataFrame version.
sqlDF.explain()

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[aggOrder#162L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#16,sum(count)#161L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[sum(cast(count#18 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#16, 200), true, [id=#349]
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#16], functions=[partial_sum(cast(count#18 as bigint))])
         +- FileScan csv [DEST_COUNTRY_NAME#16,count#18] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/wengong/spark_data/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




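__Note__: `top5_destDF` and `sqlDF` compile to the same physical plan (the same operators in the same order). The one visible difference in the outputs above is the partition count in `Exchange hashpartitioning` (5 vs. 200): `sqlDF.explain()` was executed as `In [32]`, before `spark.sql.shuffle.partitions` was set to 5 in `In [33]`.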

### UDF

Create an age-group column with a Python UDF.

In [16]:
# One-column DataFrame "age" built from spark.range(100).
ageDF = spark.range(100).toDF("age")

In [17]:
# Preview the first 5 rows.
ageDF.show(5)

+---+
|age|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows



In [18]:
# Print the schema of ageDF.
ageDF.printSchema()

root
 |-- age: long (nullable = false)



In [19]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def _age_group(age):
    """Return the 20-year bucket label for ``age``.

    Returns None for a null age so that a missing value yields a null
    ``age_group`` instead of raising TypeError on ``None < 20``.
    """
    if age is None:
        return None
    # Each test runs only after the lower buckets have been ruled out, so the
    # explicit ``age >= N`` lower-bound checks are implied by the ordering.
    if age < 20:
        return '< 20'
    if age < 40:
        return '20-40'
    if age < 60:
        return '40-60'
    if age < 80:
        return '60-80'
    # '' is kept for values for which every comparison is False (e.g. NaN).
    return '80+' if age >= 80 else ''


# Declare the string return type explicitly instead of leaving it implicit.
age_range = udf(_age_group, StringType())

ageDF = ageDF.withColumn('age_group', age_range(ageDF.age))

In [20]:
# Show the first 50 rows to check the bucket boundaries at ages 20 and 40.
ageDF.show(50)

+---+---------+
|age|age_group|
+---+---------+
|  0|     < 20|
|  1|     < 20|
|  2|     < 20|
|  3|     < 20|
|  4|     < 20|
|  5|     < 20|
|  6|     < 20|
|  7|     < 20|
|  8|     < 20|
|  9|     < 20|
| 10|     < 20|
| 11|     < 20|
| 12|     < 20|
| 13|     < 20|
| 14|     < 20|
| 15|     < 20|
| 16|     < 20|
| 17|     < 20|
| 18|     < 20|
| 19|     < 20|
| 20|    20-40|
| 21|    20-40|
| 22|    20-40|
| 23|    20-40|
| 24|    20-40|
| 25|    20-40|
| 26|    20-40|
| 27|    20-40|
| 28|    20-40|
| 29|    20-40|
| 30|    20-40|
| 31|    20-40|
| 32|    20-40|
| 33|    20-40|
| 34|    20-40|
| 35|    20-40|
| 36|    20-40|
| 37|    20-40|
| 38|    20-40|
| 39|    20-40|
| 40|    40-60|
| 41|    40-60|
| 42|    40-60|
| 43|    40-60|
| 44|    40-60|
| 45|    40-60|
| 46|    40-60|
| 47|    40-60|
| 48|    40-60|
| 49|    40-60|
+---+---------+
only showing top 50 rows

