# Fare Analysis

## Part 1. Local Development

### Step 1. Set up the environment

In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session when there is one; otherwise start a new
# session registered under the application name "Fare Analysis".
spark = (
    SparkSession.builder
    .appName("Fare Analysis")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/25 11:58:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Step 2. Load the processed data

In [4]:
# Relative path to the processed trip data (Parquet).
TRAIN_PROCESSED = "../../data/processed/yellow_tripdata_2024-01.parquet"

# Load the Parquet data into a Spark DataFrame.
df = spark.read.format("parquet").load(TRAIN_PROCESSED)

In [5]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



### Step 3. Calculate Average Fares by Pickup and Drop-off Locations


In [6]:
from pyspark.sql.functions import col, avg

# Mean fare_amount for every (pickup location, drop-off location) pair,
# exposed as the column "avg_fare".
avg_fares_location = (
    df.groupBy("PULocationID", "DOLocationID")
    .agg(avg("fare_amount").alias("avg_fare"))
)

# Preview the aggregate (show() prints only the first rows).
avg_fares_location.show()



                                                                                

+------------+------------+------------------+
|PULocationID|DOLocationID|          avg_fare|
+------------+------------+------------------+
|         236|         238|10.041376664056385|
|         148|         229|17.747352245862885|
|         107|         161|13.325880842321519|
|         229|         239| 17.84373188405798|
|         231|         140|31.463979933110366|
|         163|         263|14.274155279503104|
|         148|         146|28.083333333333332|
|         163|           7|22.235064935064933|
|         151|         116|15.101366906474821|
|          75|          97| 50.36666666666667|
|         114|         151|30.727027027027034|
|         231|          41| 39.28328358208955|
|         232|          45|10.641904761904764|
|         116|         229|32.286249999999995|
|         132|         107| 68.83601078167115|
|         264|         107|14.731111111111112|
|         229|          36|              33.9|
|          43|           7| 24.35576923076923|
|          49

### Step 3 (continued). Calculate Average Fares by Passenger Count

In [7]:
# Mean fare_amount per passenger_count value, exposed as "avg_fare".
# Rows whose passenger_count is missing end up in their own NULL group.
avg_fares_passenger = (
    df.groupBy("passenger_count")
    .agg(avg("fare_amount").alias("avg_fare"))
)

avg_fares_passenger.show()




+---------------+------------------+
|passenger_count|          avg_fare|
+---------------+------------------+
|              0|17.075336405529956|
|              7|45.411249999999995|
|              6|17.228569319554442|
|              5|17.511869814361592|
|              1|17.557051804714995|
|              3|20.041298568955217|
|              8| 81.39098039215688|
|              2|20.171285105269558|
|              4|21.833799014892033|
|              9|              11.4|
|           NULL|20.016193904200065|
+---------------+------------------+



                                                                                

### Step 4. Explore Correlations Between Fare Amounts and Trip Distances

In [8]:
from pyspark.sql.functions import corr

# Single-row DataFrame holding the correlation coefficient between
# fare_amount and trip_distance over the whole data set.
# NOTE(review): the value printed below (~0.016) is surprisingly low for fare
# vs. distance; presumably extreme trip_distance values dominate - verify.
fare_distance_correlation = df.select(
    corr("fare_amount", "trip_distance").alias("correlation")
)

fare_distance_correlation.show()




+--------------------+
|         correlation|
+--------------------+
|0.016064980070449663|
+--------------------+



                                                                                

### Step 5. Save and Visualize the Results

In [11]:
# Persist both aggregates as Parquet under the results directory.
# mode("overwrite") keeps this cell re-runnable: with the default save mode the
# write raises an error as soon as the target path already exists.
ANALYSED_DATA_DIR = "../../results/analysed_data/"

avg_fares_location.write.mode("overwrite").format("parquet").save(f"{ANALYSED_DATA_DIR}save_avg_fares_location.parquet")
avg_fares_passenger.write.mode("overwrite").format("parquet").save(f"{ANALYSED_DATA_DIR}save_avg_fares_passenger.parquet")


                                                                                

## Part 2. Running on GCP

In [12]:
# Set bucket name
bucket_name = "spbd-nyc-taxi-bucket"

# Upload the Python script
!gsutil cp ./fare_analysis.py gs://{bucket_name}/scripts/

# Upload processed data
!gsutil -m cp -r ../../data/processed/yellow_tripdata_2024-01.parquet gs://{bucket_name}/data/processed

Copying file://./fare_analysis.py [Content-Type=text/x-python]...
/ [1 files][  1.5 KiB/  1.5 KiB]                                                
Operation completed over 1 objects/1.5 KiB.                                      
If you experience problems with multiprocessing on MacOS, they might be related to https://bugs.python.org/issue33725. You can disable multiprocessing by editing your .boto config or by adding the following flag to your command: `-o "GSUtil:parallel_process_count=1"`. Note that multithreading is still available even if you disable multiprocessing.

Copying file://../../data/processed/yellow_tripdata_2024-01.parquet [Content-Type=application/octet-stream]...
\ [1/1 files][ 47.6 MiB/ 47.6 MiB] 100% Done                                    
Operation completed over 1 objects/47.6 MiB.                                     


In [13]:
# Dataproc cluster settings, interpolated into the gcloud command below.
cluster_name = "spbd-nyc-taxi-cluster"
region = "europe-west9"
machine_type="n2-standard-2"

# Create the cluster in zone "<region>-a": the master and the 2 workers all use
# machine_type.
# NOTE(review): confirm the Spark version shipped with image 2.0-debian10 is
# compatible with what fare_analysis.py expects.
!gcloud dataproc clusters create {cluster_name} \
    --region={region} \
    --zone={region}-a \
    --master-machine-type={machine_type} \
    --worker-machine-type={machine_type} \
    --num-workers=2 \
    --image-version=2.0-debian10 \
    --scopes=default


Waiting on operation [projects/epita-spbd-nyc-da/regions/europe-west9/operations/79225936-e5f7-3e92-98cd-6b458a09d76c].
Waiting for cluster creation operation...                                      
Waiting for cluster creation operation...done.                                 
Created [https://dataproc.googleapis.com/v1/projects/epita-spbd-nyc-da/regions/europe-west9/clusters/spbd-nyc-taxi-cluster] Cluster placed in zone [europe-west9-a].


In [14]:
!gcloud dataproc jobs submit pyspark \
    gs://spbd-nyc-taxi-bucket/scripts/fare_analysis.py \
    --cluster={cluster_name} \
    --region={region}

Job [d15df5b9e4684d76afc7b7ec6b5534ea] submitted.
Waiting for job output...
24/05/25 11:38:53 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
24/05/25 11:38:53 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
24/05/25 11:38:53 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
24/05/25 11:38:54 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator
24/05/25 11:38:54 INFO org.sparkproject.jetty.util.log: Logging initialized @4352ms to org.sparkproject.jetty.util.log.Slf4jLog
24/05/25 11:38:54 INFO org.sparkproject.jetty.server.Server: jetty-9.4.40.v20210413; built: 2021-04-13T20:42:42.668Z; git: b881a572662e1943a14ae12e7e1207989f218b74; jvm 1.8.0_412-b08
24/05/25 11:38:54 INFO org.sparkproject.jetty.server.Server: Started @4479ms
24/05/25 11:38:54 INFO org.sparkproject.jetty.server.AbstractConnector: Started ServerConnector@a1a67f3{HTTP/1.1, (http/1.1)}{0.0.0.0:37269}
24/05/25 11:38:55 INFO org.apache.hadoop.yarn.client.RMProx

In [15]:
!gcloud dataproc jobs list --cluster=spbd-nyc-taxi-cluster --region=europe-west9

JOB_ID                            TYPE     STATUS
d15df5b9e4684d76afc7b7ec6b5534ea  pyspark  DONE


In [16]:
!gcloud dataproc clusters delete spbd-nyc-taxi-cluster --region=europe-west9 --quiet

Waiting on operation [projects/epita-spbd-nyc-da/regions/europe-west9/operations/39f55afd-d2ae-306a-8858-61393d54b6c8].
Waiting for cluster deletion operation...done.                                 
Deleted [https://dataproc.googleapis.com/v1/projects/epita-spbd-nyc-da/regions/europe-west9/clusters/spbd-nyc-taxi-cluster].
