In [1]:
# Setup: make the workshop's `helpers` package importable, create the Spark
# session, and load the datasets used in the rest of the notebook.
import sys
# NOTE(review): absolute path tied to the workshop environment — adjust it if
# the repository is checked out somewhere else.
sys.path.append('/home/iceberg/notebooks/PyCon_LT_Workshop')

from helpers.utils import get_spark_session, get_yellow_taxi_data, get_dim_data, ROOT_DIR
from pyspark.sql import functions as f
# "caching" is handed to the helper — presumably the Spark application name;
# confirm in helpers.utils.
spark = get_spark_session("caching")

# Main dataset queried by the hands-on cells below.
yellow_taxi_data = get_yellow_taxi_data(spark=spark)

# The helper returns four objects — presumably dimension/lookup DataFrames
# (verify in helpers.utils).
dim_taxi_zones, dim_rates, dim_payments, dim_vendor = get_dim_data(spark)

24/02/17 09:21:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Persisting

We can persist a DataFrame by calling:
```python
df.persist()
```

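**For RDDs:** `rdd.cache()` = `rdd.persist(StorageLevel.MEMORY_ONLY)`

**For DataFrames:** `df.cache()` = `df.persist()`, whose default storage level is `MEMORY_AND_DISK` (`MEMORY_AND_DISK_DESER` in Spark 3.x) — memory first, spilling to disk when the data does not fit.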

There are several different kinds of persisting in PySpark:

*Source: http://spark.apache.org/docs/latest/rdd-programming-guide.html*


<table class="table">
<tr><th style="width:23%">Storage Level</th><th>Meaning</th></tr>
<tr>
  <td> MEMORY_ONLY </td>
  <td> Store RDD as deserialized Java objects in the JVM. If the RDD does not fit in memory, some partitions will
    not be cached and will be recomputed on the fly each time they're needed. This is the default level for RDDs (<code>rdd.cache()</code>). </td>
</tr>
<tr>
  <td> MEMORY_AND_DISK </td>
  <td> Store RDD as deserialized Java objects in the JVM. If the RDD does not fit in memory, store the
    partitions that don't fit on disk, and read them from there when they're needed. </td>
</tr>

<tr>
  <td> DISK_ONLY </td>
  <td> Store the RDD partitions only on disk. </td>
</tr>
<tr>
  <td> MEMORY_ONLY_2, MEMORY_AND_DISK_2, etc.  </td>
  <td> Same as the levels above, but replicate each partition on two cluster nodes. </td>
</tr>
</table>

## Reminder: how it looks in the Spark UI
<img src="https://miro.medium.com/max/2864/1*qWs3f3vaxTenH3WrGmlBlg.png">

## Why persist/cache?
<img src="https://miro.medium.com/max/3288/1*aWEf5_y4SuqkgpbdbZkZEA.png">

# Hands on!

In [7]:
# List the column names to see which fields are available for the exercises.
yellow_taxi_data.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee']

In [4]:
from pyspark.sql.functions import countDistinct

# Empty the session cache first so the timing that follows runs un-persisted.
spark.catalog.clearCache()

# Keep only the rows whose VendorID equals 1.
yellow_taxi_cmt = yellow_taxi_data.where(f.col("VendorID") == 1)

In [16]:
%timeit -n 10 yellow_taxi_cmt.count(), yellow_taxi_cmt.groupBy("RatecodeID").count().show(10, False)

+----------+-------+
|RatecodeID|count  |
+----------+-------+
|NULL      |37895  |
|1.0       |1529513|
|4.0       |1609   |
|3.0       |2422   |
|2.0       |37115  |
|99.0      |19577  |
|6.0       |20     |
|5.0       |4810   |
+----------+-------+

+----------+-------+
|RatecodeID|count  |
+----------+-------+
|NULL      |37895  |
|1.0       |1529513|
|4.0       |1609   |
|3.0       |2422   |
|2.0       |37115  |
|99.0      |19577  |
|6.0       |20     |
|5.0       |4810   |
+----------+-------+

+----------+-------+
|RatecodeID|count  |
+----------+-------+
|NULL      |37895  |
|1.0       |1529513|
|4.0       |1609   |
|3.0       |2422   |
|2.0       |37115  |
|99.0      |19577  |
|6.0       |20     |
|5.0       |4810   |
+----------+-------+

+----------+-------+
|RatecodeID|count  |
+----------+-------+
|NULL      |37895  |
|1.0       |1529513|
|4.0       |1609   |
|3.0       |2422   |
|2.0       |37115  |
|99.0      |19577  |
|6.0       |20     |
|5.0       |4810   |
+---------

In [10]:
# Drop everything cached in this Spark session so the persist demo starts clean.
spark.catalog.clearCache()

In [11]:
from pyspark.storagelevel import StorageLevel
# Mark the DataFrame for in-memory-only persistence (lazy: nothing is stored yet).
# NOTE: this is NOT the same as yellow_taxi_cmt.cache() — DataFrame.cache() uses the
# DataFrame default level (MEMORY_AND_DISK_DESER in Spark 3.x); only RDD.cache()
# defaults to MEMORY_ONLY.
yellow_taxi_cmt.persist(StorageLevel.MEMORY_ONLY)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp_ntz, tpep_dropoff_datetime: timestamp_ntz, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

**DataFrames are only actually persisted once the first action is called on them.**

In [15]:
%timeit -n 10 yellow_taxi_cmt.count(), yellow_taxi_cmt.groupBy("RatecodeID").count().show(10, False)

+----------+-------+
|RatecodeID|count  |
+----------+-------+
|NULL      |37895  |
|1.0       |1529513|
|4.0       |1609   |
|3.0       |2422   |
|2.0       |37115  |
|99.0      |19577  |
|6.0       |20     |
|5.0       |4810   |
+----------+-------+

+----------+-------+
|RatecodeID|count  |
+----------+-------+
|NULL      |37895  |
|1.0       |1529513|
|4.0       |1609   |
|3.0       |2422   |
|2.0       |37115  |
|99.0      |19577  |
|6.0       |20     |
|5.0       |4810   |
+----------+-------+

+----------+-------+
|RatecodeID|count  |
+----------+-------+
|NULL      |37895  |
|1.0       |1529513|
|4.0       |1609   |
|3.0       |2422   |
|2.0       |37115  |
|99.0      |19577  |
|6.0       |20     |
|5.0       |4810   |
+----------+-------+

+----------+-------+
|RatecodeID|count  |
+----------+-------+
|NULL      |37895  |
|1.0       |1529513|
|4.0       |1609   |
|3.0       |2422   |
|2.0       |37115  |
|99.0      |19577  |
|6.0       |20     |
|5.0       |4810   |
+---------

# Unpersist when you no longer need the data!

In [17]:
# Release the persisted data now that the timing comparison is done.
yellow_taxi_cmt.unpersist()

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp_ntz, tpep_dropoff_datetime: timestamp_ntz, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]