# Intro to Spark and PySpark


In [1]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
from pathlib import Path

## Reading parquets

Instantiate a spark session

In [2]:
# Local-mode session; `local[*]` asks Spark to use all available cores on this machine.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

23/02/17 22:42:34 WARN Utils: Your hostname, Kohada resolves to a loopback address: 127.0.1.1; using 172.30.125.167 instead (on interface eth0)
23/02/17 22:42:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/17 22:42:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# NOTE: parquet is a binary columnar format, so `wc -l` only counts newline bytes
# in the file; the number printed is NOT the number of rows.
!wc -l ../data/taxi_ingest_data/fhvhv_tripdata_2021-01.parquet

1006794 ../data/taxi_ingest_data/fhvhv_tripdata_2021-01.parquet


The Spark session reads the parquet in as a `pyspark.sql.DataFrame`

In [6]:
fpath = Path("../data/taxi_ingest_data/fhvhv_tripdata_2021-01.parquet")
# Parquet files carry their own schema, so no CSV-style reader options
# (such as `header`) are needed; the previous `header` option had no effect here.
df = spark.read.parquet(str(fpath))
# Action: prints the first 10 rows
df.show(10)



+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+--

                                                                                

The Spark UI can be accessed at http://localhost:4040/jobs/

![spark-ui](./img/spark-ui-1.png)

In [None]:
# Inspect the schema Spark picked up for the parquet file (a StructType of StructFields)
df.schema

StructType(List(StructField(hvfhs_license_num,StringType,true),StructField(dispatching_base_num,StringType,true),StructField(originating_base_num,StringType,true),StructField(request_datetime,TimestampType,true),StructField(on_scene_datetime,TimestampType,true),StructField(pickup_datetime,TimestampType,true),StructField(dropoff_datetime,TimestampType,true),StructField(PULocationID,LongType,true),StructField(DOLocationID,LongType,true),StructField(trip_miles,DoubleType,true),StructField(trip_time,LongType,true),StructField(base_passenger_fare,DoubleType,true),StructField(tolls,DoubleType,true),StructField(bcf,DoubleType,true),StructField(sales_tax,DoubleType,true),StructField(congestion_surcharge,DoubleType,true),StructField(airport_fee,DoubleType,true),StructField(tips,DoubleType,true),StructField(driver_pay,DoubleType,true),StructField(shared_request_flag,StringType,true),StructField(shared_match_flag,StringType,true),StructField(access_a_ride_flag,StringType,true),StructField(wav_req

Formatted:

```scala
StructType(List(
    StructField(hvfhs_license_num,StringType,true),
    StructField(dispatching_base_num,StringType,true),
    StructField(originating_base_num,StringType,true),
    StructField(request_datetime,TimestampType,true),
    StructField(on_scene_datetime,TimestampType,true),
    StructField(pickup_datetime,TimestampType,true),
    StructField(dropoff_datetime,TimestampType,true),
    StructField(PULocationID,LongType,true),
    StructField(DOLocationID,LongType,true),
    StructField(trip_miles,DoubleType,true),
    StructField(trip_time,LongType,true),
    StructField(base_passenger_fare,DoubleType,true),
    StructField(tolls,DoubleType,true),
    StructField(bcf,DoubleType,true),
    StructField(sales_tax,DoubleType,true),
    StructField(congestion_surcharge,DoubleType,true),
    StructField(airport_fee,DoubleType,true),
    StructField(tips,DoubleType,true),
    StructField(driver_pay,DoubleType,true),
    StructField(shared_request_flag,StringType,true),
    StructField(shared_match_flag,StringType,true),
    StructField(access_a_ride_flag,StringType,true),
    StructField(wav_request_flag,StringType,true),
    StructField(wav_match_flag,StringType,true)
    )
)
```

- `scala` syntax
- Double: float (8 bytes)
- Long: int64 (8 bytes)
- `true` refers to `is_nullable`

So contrary to the `.csv`, the parquet types seem to be correct.

Otherwise, Spark can read directly from a `pd.DataFrame`, where we can enforce the schema before feeding to spark

In [14]:
# Naive attempt to cut a "sample" the way one would for a CSV.
# head/tail split on newline bytes, so for a binary parquet this yields a truncated blob
# rather than a valid 1000-row parquet file (the next cell shows the read failing).
!head -n 1001 ../data/taxi_ingest_data/fhvhv_tripdata_2021-01.parquet > ../data/taxi_ingest_data/fhvhv_tripdata_sample.parquet
!tail -n 1 ../data/taxi_ingest_data/fhvhv_tripdata_2021-01.parquet >> ../data/taxi_ingest_data/fhvhv_tripdata_sample.parquet


In [16]:
try:
    # Read the sample produced by the head/tail cell above; the file name must match
    # what that cell wrote (`fhvhv_...`, not `fhv_...`).
    df_pd = pd.read_parquet(fpath.parent / 'fhvhv_tripdata_sample.parquet')
    # A bare expression inside `try` is never displayed, so print explicitly.
    print(df_pd.dtypes)
except OSError as e:
    # Expected outcome: the byte-truncated file is not a valid parquet.
    print("Difficult to read only parts of a parquet with only pandas\n", e)

Difficult to read only parts of a parquet with only pandas
 Invalid column metadata (corrupt file?)


Use `pyarrow` directly to read only the first *n* rows, via `ParquetFile.iter_batches(batch_size=n)`

[SOF source](https://stackoverflow.com/a/69888274/5496416)

In [18]:
from pyarrow.parquet import ParquetFile
import pyarrow as pa

# Open the parquet through pyarrow's reader interface and pull only the first
# record batch (at most 1000 rows) instead of loading the whole file.
pf = ParquetFile(fpath)
batch_iter = pf.iter_batches(batch_size=1000)
first_nrows = next(batch_iter)
first_table = pa.Table.from_batches([first_nrows])
df_pd = first_table.to_pandas()
df_pd.dtypes

hvfhs_license_num               object
dispatching_base_num            object
originating_base_num            object
request_datetime        datetime64[ns]
on_scene_datetime       datetime64[ns]
pickup_datetime         datetime64[ns]
dropoff_datetime        datetime64[ns]
PULocationID                     int64
DOLocationID                     int64
trip_miles                     float64
trip_time                        int64
base_passenger_fare            float64
tolls                          float64
bcf                            float64
sales_tax                      float64
congestion_surcharge           float64
airport_fee                    float64
tips                           float64
driver_pay                     float64
shared_request_flag             object
shared_match_flag               object
access_a_ride_flag              object
wav_request_flag                object
wav_match_flag                  object
dtype: object

In [22]:
# Build a Spark DataFrame from the pandas sample
df_from_pd = spark.createDataFrame(df_pd)
# Action: returns the first 3 rows as a list of Row objects
df_from_pd.head(3)

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', originating_base_num='B02682', request_datetime=datetime.datetime(2021, 1, 1, 0, 28, 9), on_scene_datetime=datetime.datetime(2021, 1, 1, 0, 31, 42), pickup_datetime=datetime.datetime(2021, 1, 1, 0, 33, 44), dropoff_datetime=datetime.datetime(2021, 1, 1, 0, 49, 7), PULocationID=230, DOLocationID=166, trip_miles=5.26, trip_time=923, base_passenger_fare=22.28, tolls=0.0, bcf=0.67, sales_tax=1.98, congestion_surcharge=2.75, airport_fee=nan, tips=0.0, driver_pay=14.99, shared_request_flag='N', shared_match_flag='N', access_a_ride_flag=' ', wav_request_flag='N', wav_match_flag='N'),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', originating_base_num='B02682', request_datetime=datetime.datetime(2021, 1, 1, 0, 45, 56), on_scene_datetime=datetime.datetime(2021, 1, 1, 0, 55, 19), pickup_datetime=datetime.datetime(2021, 1, 1, 0, 55, 19), dropoff_datetime=datetime.datetime(2021, 1, 1, 1, 18, 21), PULocationID=152, DOL

We can extract the `scala` types above and convert it to python via `pyspark.sql.types`, and revise the schema during read. E.g. change `locationID` to `Integer` instead of `Long` to reduce storage requirements, since `Long` takes up 8 bytes (int64) and `Integer` takes up 4 (int32)

In [46]:
from pyspark.sql import types
# (column name, Spark type class) pairs, in the same order as the parquet columns.
# Every column is declared nullable.
_fhvhv_columns = [
    ('hvfhs_license_num', types.StringType),
    ('dispatching_base_num', types.StringType),
    ('originating_base_num', types.StringType),
    ('request_datetime', types.TimestampType),
    ('on_scene_datetime', types.TimestampType),
    ('pickup_datetime', types.TimestampType),
    ('dropoff_datetime', types.TimestampType),
    ('PULocationID', types.LongType),
    ('DOLocationID', types.LongType),
    ('trip_miles', types.DoubleType),
    ('trip_time', types.LongType),
    ('base_passenger_fare', types.DoubleType),
    ('tolls', types.DoubleType),
    ('bcf', types.DoubleType),
    ('sales_tax', types.DoubleType),
    ('congestion_surcharge', types.DoubleType),
    ('airport_fee', types.DoubleType),
    ('tips', types.DoubleType),
    ('driver_pay', types.DoubleType),
    ('shared_request_flag', types.StringType),
    ('shared_match_flag', types.StringType),
    ('access_a_ride_flag', types.StringType),
    ('wav_request_flag', types.StringType),
    ('wav_match_flag', types.StringType),
]
schema = types.StructType([
    types.StructField(col_name, col_type(), True)
    for col_name, col_type in _fhvhv_columns
])


In [47]:
# Read the parquet with the explicit schema defined above.
# `header` is a CSV-only reader option and has no effect on parquet, so it is dropped.
df_with_schema = (
    spark.read
    .schema(schema=schema)
    .parquet(str(fpath))
)

In [48]:
# Action: fetch the first 3 rows read with the explicit schema
df_with_schema.head(3)

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', originating_base_num='B02682', request_datetime=datetime.datetime(2021, 1, 1, 8, 28, 9), on_scene_datetime=datetime.datetime(2021, 1, 1, 8, 31, 42), pickup_datetime=datetime.datetime(2021, 1, 1, 8, 33, 44), dropoff_datetime=datetime.datetime(2021, 1, 1, 8, 49, 7), PULocationID=230, DOLocationID=166, trip_miles=5.26, trip_time=923, base_passenger_fare=22.28, tolls=0.0, bcf=0.67, sales_tax=1.98, congestion_surcharge=2.75, airport_fee=None, tips=0.0, driver_pay=14.99, shared_request_flag='N', shared_match_flag='N', access_a_ride_flag=' ', wav_request_flag='N', wav_match_flag='N'),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', originating_base_num='B02682', request_datetime=datetime.datetime(2021, 1, 1, 8, 45, 56), on_scene_datetime=datetime.datetime(2021, 1, 1, 8, 55, 19), pickup_datetime=datetime.datetime(2021, 1, 1, 8, 55, 19), dropoff_datetime=datetime.datetime(2021, 1, 1, 9, 18, 21), PULocationID=152, DO

Couldn't convert from `LongType` to `IntegerType`; parquet complained about expecting `INT64`. Stay as `Long`.

## Repartition

The whole idea of Spark is distributing the dataset and parallelizing the task between the executors in the cluster. Thus we should break our parquet into smaller *partitions* so that the workload can be more easily distributed.

Otherwise, one executor in the cluster will receive the one dataset, work on it by itself, while the other executors sit idle.

Use `df.repartition(int)`

In [49]:
# Transformation (lazy): nothing is executed until an action such as a write is triggered.
# Returns a new DataFrame split into 24 partitions; `df_with_schema` itself is unchanged.
df_parts = df_with_schema.repartition(24)

Do not create the directory for `df.write.parquet()`; it will raise `Path already exists` error. Let it create it on its own

In [5]:
fpath = Path("../data/taxi_ingest_data/fhvhv_tripdata_2021-01.parquet")
partition_dir = fpath.parent / 'fhvhv' / 'parts'
# Intentionally NOT creating `partition_dir` here: the parquet writer creates it itself,
# and in its default mode it raises "path already exists" if the directory is present.

In [51]:
# Write the DataFrame out as 24 part-files, one per partition.
# mode='overwrite' keeps this cell re-runnable: with the default mode a second run
# fails with "path already exists" because the output directory is already there.
df_parts.write.parquet(str(partition_dir), mode='overwrite')

                                                                                

Output:

```bash
kohada@Kohada:~/de-zoomcamp/data/taxi_ingest_data$ ls fhvhv/parts/ -lh
total 526M
-rw-r--r-- 1 kohada kohada   0 Feb 18 00:40 _SUCCESS
-rw-r--r-- 1 kohada kohada 22M Feb 18 00:39 part-00000-0c7da773-7f90-44bc-9962-54679a206388-c000.snappy.parquet
-rw-r--r-- 1 kohada kohada 22M Feb 18 00:39 part-00001-0c7da773-7f90-44bc-9962-54679a206388-c000.snappy.parquet
-rw-r--r-- 1 kohada kohada 22M Feb 18 00:39 part-00002-0c7da773-7f90-44bc-9962-54679a206388-c000.snappy.parquet
-rw-r--r-- 1 kohada kohada 22M Feb 18 00:39 part-00003-0c7da773-7f90-44bc-9962-54679a206388-c000.snappy.parquet
...
-rw-r--r-- 1 kohada kohada 22M Feb 18 00:40 part-00023-0c7da773-7f90-44bc-9962-54679a206388-c000.snappy.parquet
```

Partitions can be read by pointing to the folder

In [3]:
# Re-create the local session (or reuse a running one, via getOrCreate) before reading the partitions back.
session_builder = SparkSession.builder.master('local[*]').appName('test')
spark = session_builder.getOrCreate()

23/02/18 09:57:07 WARN Utils: Your hostname, Kohada resolves to a loopback address: 127.0.1.1; using 172.30.114.23 instead (on interface eth0)
23/02/18 09:57:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/18 09:57:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
fpath = Path("../data/taxi_ingest_data/fhvhv_tripdata_2021-01.parquet")
partition_dir = fpath.parent.joinpath('fhvhv', 'parts')
# Pointing the reader at the directory loads the part-files written there as one DataFrame.
df = spark.read.parquet(str(partition_dir))

                                                                                

In [8]:
# Print the schema of the DataFrame read back from the partition directory
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp (nullable = true)
 |-- on_scene_datetime: timestamp (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_flag: string (nul

## Basic Manipulation

In [10]:
# Transformations only (lazy): project the four trip columns, then keep HV0003 rows.
trip_columns = ['pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID']
is_hv0003 = df.hvfhs_license_num == 'HV0003'
df_filter = df.select(*trip_columns).filter(is_hv0003)

### Action vs Transformation

#### Transformations

Lazy eval; not executed until triggered by action

- select
- filter
- join
- group by

#### Actions

Executes the transformations (eager, as opposed to lazy)

- show
- take (similar to `.head()`)
- write

In [11]:
# Action: triggers execution of the lazy select/filter above and prints the resulting rows
df_filter.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-------------------+-------------------+------------+------------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|
+-------------------+-------------------+------------+------------+
|2021-01-12 02:40:22|2021-01-12 03:15:49|         262|         231|
|2021-01-05 23:13:22|2021-01-05 23:27:50|          61|         181|
|2021-02-01 02:42:09|2021-02-01 02:59:52|         232|           4|
|2021-01-28 06:24:36|2021-01-28 06:26:43|          68|          68|
|2021-01-30 16:35:46|2021-01-30 16:39:42|         256|         255|
|2021-01-16 10:25:35|2021-01-16 10:34:21|          89|          91|
|2021-01-11 19:58:23|2021-01-11 20:14:19|          97|          61|
|2021-01-03 15:44:58|2021-01-03 16:04:45|          26|         178|
|2021-01-15 02:52:00|2021-01-15 03:19:00|         181|         198|
|2021-01-09 04:35:35|2021-01-09 05:06:33|          76|          91|
|2021-01-15 21:49:48|2021-01-15 22:35:23|         246|          16|
|2021-01-27 18:37:56|2021-01-27 18:53:35|       

                                                                                

### Spark vs SQL

The strength of Spark is in the flexibility given by whichever language we're writing our Spark workload in. The above snippet is easily achievable in SQL, but with UDFs, there's a lot more versatility in what we can do in our transformations

In [12]:
from pyspark.sql import functions as F

In [13]:
# Derive date-only columns from the pickup/dropoff timestamps, then preview them
# next to the location IDs.
with_dates = (
    df
    .withColumn('pickup_date', F.to_date(df.pickup_datetime))
    .withColumn('dropoff_date', F.to_date(df.dropoff_datetime))
)
with_dates.select('pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID').show()

+-----------+------------+------------+------------+
|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------+------------+------------+------------+
| 2021-01-12|  2021-01-12|         262|         231|
| 2021-01-05|  2021-01-05|          61|         181|
| 2021-01-02|  2021-01-02|         100|           1|
| 2021-02-01|  2021-02-01|         232|           4|
| 2021-01-06|  2021-01-06|         162|           1|
| 2021-01-28|  2021-01-28|          68|          68|
| 2021-01-18|  2021-01-18|         205|         205|
| 2021-01-30|  2021-01-30|         256|         255|
| 2021-01-16|  2021-01-16|          89|          91|
| 2021-01-05|  2021-01-05|         132|         102|
| 2021-01-11|  2021-01-11|          97|          61|
| 2021-01-22|  2021-01-22|          79|          37|
| 2021-01-03|  2021-01-03|          26|         178|
| 2021-01-15|  2021-01-15|         181|         198|
| 2021-01-09|  2021-01-09|          76|          91|
| 2021-01-15|  2021-01-15|         246|       

There's a lot of pre-defined features in `pyspark.sql.functions` already, but the real strength is in UDFs

In [14]:
# user-defined func
def not_for_sql(base_num: str) -> str:
    """Encode a base number as ``<prefix>/<hex>``.

    The leading character of ``base_num`` is dropped and the rest is parsed as
    a decimal integer. The prefix depends on that integer's divisibility:

    - ``'s'`` if it is divisible by 7,
    - ``'a'`` otherwise if it is divisible by 3,
    - ``'b'`` in every other case.

    The integer is rendered as lowercase hex, zero-padded to at least 3 digits.

    Args:
        base_num: Base number in the form ``[A-Z][digits]``, e.g. ``'B3910'``.
            ``None`` is accepted so the function is safe to use as a Spark UDF
            on a nullable column.

    Returns:
        The encoded string, e.g. ``'b/f46'``, or ``None`` when ``base_num`` is ``None``.

    Raises:
        ValueError: If the part after the first character is not a valid integer.
    """
    # Spark passes SQL NULLs to Python UDFs as None; propagate them instead of
    # failing the whole job with a TypeError on the slice below.
    if base_num is None:
        return None

    # base_num format: [A-Z][some_numbers]
    num = int(base_num[1:])
    if num % 7 == 0:
        prefix = 's'
    elif num % 3 == 0:
        prefix = 'a'
    else:
        prefix = 'b'
    return f'{prefix}/{num:03x}'

In [15]:
# Sanity check in plain Python: 3910 is divisible by neither 7 nor 3, and 3910 == 0xf46
not_for_sql('B3910')

'b/f46'

In [16]:
from pyspark.sql import types

In [17]:
# Wrap the plain Python function as a Spark UDF whose result column is a string
not_sql_udf = F.udf(not_for_sql, returnType=types.StringType())

In [19]:
# Add the date columns plus the UDF-derived `crazy_base`, then preview the result.
(
    df
    .withColumn('pickup_date', F.to_date(df.pickup_datetime))
    .withColumn('dropoff_date', F.to_date(df.dropoff_datetime))
    .withColumn('crazy_base', not_sql_udf(df.dispatching_base_num))
    .select('crazy_base', 'pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID')
    .show()
)

[Stage 3:>                                                          (0 + 1) / 1]

+----------+-----------+------------+------------+------------+
|crazy_base|pickup_date|dropoff_date|PULocationID|DOLocationID|
+----------+-----------+------------+------------+------------+
|     b/acc| 2021-01-12|  2021-01-12|         262|         231|
|     b/a39| 2021-01-05|  2021-01-05|          61|         181|
|     b/9ce| 2021-01-02|  2021-01-02|         100|           1|
|     b/b42| 2021-02-01|  2021-02-01|         232|           4|
|     s/af0| 2021-01-06|  2021-01-06|         162|           1|
|     a/b43| 2021-01-28|  2021-01-28|          68|          68|
|     b/9ce| 2021-01-18|  2021-01-18|         205|         205|
|     b/b35| 2021-01-30|  2021-01-30|         256|         255|
|     b/b3b| 2021-01-16|  2021-01-16|          89|          91|
|     b/9ce| 2021-01-05|  2021-01-05|         132|         102|
|     b/acc| 2021-01-11|  2021-01-11|          97|          61|
|     b/9ce| 2021-01-22|  2021-01-22|          79|          37|
|     b/b32| 2021-01-03|  2021-01-03|   

                                                                                