# Week 5 - homework

## 1. Spark.version

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pathlib import Path
from pyspark.sql import types
from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder.master("local[*]").appName("test").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/28 14:19:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.version

'3.3.1'

## 2. HVFHW june 2021

Output partition size?

Download raw dataset and repartition using `cast_partition.py`:

```bash
wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz
python cast_partition.py \
    --taxi_type fhvhv \
    --year 2021 \
    --month 6 \
    --num_partitions 12
```

24 MB per partition x 12 partitions

## 3. Count

Records on june 15th?

In [5]:
taxi_type = "fhvhv"
year = 2021
month = 6
parts_dir = Path("../data/taxi_ingest_data/parts/")
fpath = parts_dir / taxi_type / f"{year}" / f"{month:02d}"
df = spark.read.parquet(str(fpath))
df.createOrReplaceTempView("fhvhv_data")

                                                                                

In [6]:
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: float (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [9]:
spark.sql(
    """
    SELECT
        count(1)
    FROM
        fhvhv_data
    WHERE
        CAST(pickup_datetime AS DATE) = "2021-06-15"
        """
).show()

[Stage 1:>                                                          (0 + 4) / 4]

+--------+
|count(1)|
+--------+
|  452470|
+--------+



                                                                                

## 4. Longest trip in hours

In [45]:
spark.sql(
    """
    WITH duration as(
        SELECT
            (dropoff_datetime - pickup_datetime) AS trip_duration,
            pickup_datetime
        FROM
            fhvhv_data
        ) 
    SELECT
        EXTRACT(hour from trip_duration) + EXTRACT(day FROM trip_duration) * 24 + EXTRACT(minute FROM trip_duration)/60 AS hours ,
        pickup_datetime
    FROM
        duration
    ORDER BY hours DESC
    LIMIT 10
    """
).show()

[Stage 17:>                                                         (0 + 4) / 4]

+------------------+-------------------+
|             hours|    pickup_datetime|
+------------------+-------------------+
| 66.86666666666666|2021-06-25 13:55:41|
|25.533333333333335|2021-06-22 12:09:45|
|19.966666666666665|2021-06-27 10:32:29|
|18.183333333333334|2021-06-26 22:37:11|
|16.466666666666665|2021-06-23 20:40:43|
|14.266666666666667|2021-06-23 22:03:31|
|              13.9|2021-06-24 23:11:00|
|11.666666666666666|2021-06-04 20:56:02|
|             11.35|2021-06-27 07:45:19|
|10.983333333333333|2021-06-20 17:05:12|
+------------------+-------------------+



                                                                                

In [40]:
spark.sql(
    """
    SELECT EXTRACT(day FROM (timestamp '2020-1-20' - timestamp '2020-1-2'))*24+EXTRACT(hour FROM (timestamp '2020-1-20' - timestamp '2020-1-2'))
    """
).show()

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|((extract(day FROM (TIMESTAMP '2020-01-20 00:00:00' - TIMESTAMP '2020-01-02 00:00:00')) * 24) + extract(hour FROM (TIMESTAMP '2020-01-20 00:00:00' - TIMESTAMP '2020-01-02 00:00:00')))|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                                                                    432|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



## 5. Web UI Port

default: 4040

## 6. Most frequent pickup zone

Join with `taxi_zone_lookup.csv`

In [49]:
file_zone = parts_dir / "../taxi_zone_lookup.csv"
df_zone = spark.read \
    .option('header', 'true') \
    .option('inferSchema', 'true') \
    .csv(str(file_zone))
df_zone.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [50]:
df_zone.createOrReplaceTempView('zones')

In [59]:
df \
    .select('PULocationID') \
    .groupBy('PULocationID') \
    .count() \
    .orderBy('count', ascending=False) \
    .limit(1) \
    .join(
        df_zone, df.PULocationID == df_zone.LocationID, 'inner'
    ) \
    .drop('LocationID') \
    .show()

+------------+------+--------+-------------------+------------+
|PULocationID| count| Borough|               Zone|service_zone|
+------------+------+--------+-------------------+------------+
|          61|231279|Brooklyn|Crown Heights North|   Boro Zone|
+------------+------+--------+-------------------+------------+



Solution in SQL:

In [63]:
spark.sql(
    """
    with hot as (
        SELECT
            PULocationID,
            count(1) as count
        FROM
            fhvhv_data
        GROUP BY
            PULocationID
        ORDER BY
            count DESC
        LIMIT 5    
    )
    SELECT *
    FROM hot
    INNER JOIN
    zones
    ON hot.PULocationID = zones.LocationID
    """
).show()



+------------+------+----------+---------+-------------------+------------+
|PULocationID| count|LocationID|  Borough|               Zone|service_zone|
+------------+------+----------+---------+-------------------+------------+
|          37|187929|        37| Brooklyn|     Bushwick South|   Boro Zone|
|          61|231279|        61| Brooklyn|Crown Heights North|   Boro Zone|
|          76|186780|        76| Brooklyn|      East New York|   Boro Zone|
|          79|221244|        79|Manhattan|       East Village| Yellow Zone|
|         132|188867|       132|   Queens|        JFK Airport|    Airports|
+------------+------+----------+---------+-------------------+------------+



                                                                                