# PySpark Analysis of NYC TLC Trip Record Data, February 2021
---
<sub>Muhammad Difagama Ivanka</sub>

## Start

In [1]:
import numpy as np
np.random.seed(233)
import pandas as pd
import pyspark
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import types
from pyspark.sql import functions as F



In [2]:
# Create the notebook's Spark session in local mode, or reuse one that
# already exists (getOrCreate), under the application name 'nyc_spark'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('nyc_spark')
    .getOrCreate()
)

In [7]:
!curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet > yellow_tripdata_2021-02.parquet
!curl https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet > green_tripdata_2021-02.parquet
!curl https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2021-02.parquet > fhv_tripdata_2021-02.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0 20.7M    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  1 20.7M    1  326k    0     0   140k      0  0:02:31  0:00:02  0:02:29  140k
 12 20.7M   12 2737k    0     0   841k      0  0:00:25  0:00:03  0:00:22  842k
 17 20.7M   17 3655k    0     0   859k      0  0:00:24  0:00:04  0:00:20  859k
 17 20.7M   17 3767k    0     0   704k      0  0:00:30  0:00:05  0:00:25  760k
 19 20.7M   19 4068k    0     0   650k      0  0:00:32  0:00:06  0:00:26  829k
 20 20.7M   20 4416k    0     0   605k      0  0:00:35  0:00:07  0:00:28  821k
 22 20.7M   22 4832k    0     0   579k      0  0:00:36  0:00:08  0:00:28  411k
 25 20.7M   25 5326k    0     0   575k      0  0:00

In [8]:
# Read each downloaded parquet file into its own Spark DataFrame
# (order: yellow, green, FHV).
df_yellow, df_green, df_fhv = [
    spark.read.parquet(parquet_path)
    for parquet_path in (
        'yellow_tripdata_2021-02.parquet',
        'green_tripdata_2021-02.parquet',
        'fhv_tripdata_2021-02.parquet',
    )
]

In [21]:
# Print each DataFrame's repr (column names and types); the first two are
# followed by a blank separator line, the last one is not.
for frame in (df_yellow, df_green):
    print(frame, "\n")
print(df_fhv)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double] 

DataFrame[VendorID: bigint, lpep_pickup_datetime: timestamp, lpep_dropoff_datetime: timestamp, store_and_fwd_flag: string, RatecodeID: double, PULocationID: bigint, DOLocationID: bigint, passenger_count: double, trip_distance: double, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, ehail_fee: int, improvement_surcharge: double, total_amount: double, payment_type: double, trip_type: double, congestion_surcharge: double] 

DataFrame[dispatching_base_num: string, pickup_datetime: timestamp, dr

## 1. How many taxi trips were there on February 15?

In [24]:
def total_trips_cnt(the_df, pickup_time_col, day="2021-02-15"):
    """Count the rows whose pickup timestamp falls on one calendar day.

    The day is treated as the half-open interval
    [day 00:00:00, next day 00:00:00), so a pickup at exactly midnight
    belongs to the day that is starting.

    Parameters
    ----------
    the_df : DataFrame
        Frame to filter; must support ``the_df[col]``, ``.where(cond)``
        and ``.count()``.
    pickup_time_col : str
        Name of the pickup-timestamp column in ``the_df``.
    day : str, optional
        Calendar day in ``YYYY-MM-DD`` form. Defaults to ``"2021-02-15"``,
        the day this function was originally hard-coded to.

    Returns
    -------
    The value returned by ``.count()`` on the filtered frame.

    Raises
    ------
    ValueError
        If ``day`` is not a valid ``YYYY-MM-DD`` date.
    """
    from datetime import datetime, timedelta

    day_start = datetime.strptime(day, "%Y-%m-%d")
    next_day_start = day_start + timedelta(days=1)

    # Bounds are passed as strings in the same "YYYY-MM-DD HH:MM:SS" form the
    # original literals used, so the default call filters exactly as before.
    lower_bound = day_start.strftime("%Y-%m-%d %H:%M:%S")
    upper_bound = next_day_start.strftime("%Y-%m-%d %H:%M:%S")

    pickup = the_df[pickup_time_col]
    on_that_day = (pickup >= lower_bound) & (pickup < upper_bound)
    return the_df.where(on_that_day).count()

# Count the trips for each service; every dataset names its pickup-timestamp
# column differently, so the column is paired with its frame here.
yel_cnt, grn_cnt, fhv_cnt = [
    total_trips_cnt(frame, pickup_col)
    for frame, pickup_col in (
        (df_yellow, 'tpep_pickup_datetime'),
        (df_green, 'lpep_pickup_datetime'),
        (df_fhv, 'pickup_datetime'),
    )
]

# One line per service, then the combined total.
print(
    f"Yellow Taxi Trips on 15 February 2021\t\t\t: {yel_cnt}",
    f"Green Taxi Trips on 15 February 2021\t\t\t: {grn_cnt}",
    f"For-Hire Vehicle (FHV) Taxi Trips on 15 February 2021\t: {fhv_cnt}",
    f"All Taxi Total Trips on 15 February 2021\t\t: {np.sum([yel_cnt,grn_cnt,fhv_cnt])}",
    sep="\n",
)

Yellow Taxi Trips on 15 February 2021			: 43734
Green Taxi Trips on 15 February 2021			: 1798
For-Hire Vehicle (FHV) Taxi Trips on 15 February 2021	: 35523
All Taxi Total Trips on 15 February 2021		: 81055


## 2. The longest trip for each day

In [22]:
# Duration (minutes): dropoff minus pickup.
# Casting a timestamp column to long yields epoch seconds, so the difference
# is the trip length in seconds; dividing by 60 converts it to minutes.
# (Previously this column was just a copy of the pickup timestamp.)
df_yellow.withColumn(
    "trip_duration",
    (
        df_yellow.tpep_dropoff_datetime.cast("long")
        - df_yellow.tpep_pickup_datetime.cast("long")
    ) / 60,
).show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00013|2021-02-01 08:01:00|2021-02-01 09:33:00|        null|        null|   null|                B00014|
|     B00021         |2021-02-01 08:55:40|2021-02-01 09:06:20|       173.0|        82.0|   null|       B00021         |
|     B00021         |2021-02-01 08:14:03|2021-02-01 08:28:37|       173.0|        56.0|   null|       B00021         |
|     B00021         |2021-02-01 08:27:48|2021-02-01 08:35:45|        82.0|       129.0|   null|       B00021         |
|              B00037|2021-02-01 08:12:50|2021-02-01 08:26:38|        null|       225.0|   null|                B00037|
|              B00037|2021-02-01 08:00:3

## 3. Top 5 most frequent `dispatching_base_num`

## 4. Top 5 most common location pairs (`PUlocationID` and `DOlocationID`)