# Trip Analysis

Tasks:
- Evaluate average trip durations and distances.
- Analyze these metrics by different times of day, days of the week, and months to uncover patterns.
- Identify the top 10 pickup and drop-off locations.


In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("NYC Taxi Trip Analysis") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/21 00:04:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load the processed data

In [2]:
TRAIN_PROCESSED = "../../data/processed/train_processed.parquet"

df = spark.read.parquet(TRAIN_PROCESSED)

                                                                                

## Evaluate Average Trip Durations and Distances

In [3]:
from pyspark.sql.functions import avg

# Calculate average trip duration and distance
avg_trip_stats = df.select(
    avg("trip_duration").alias("average_trip_duration"),
    avg("trip_distance_km").alias("average_trip_distance_km")
)
avg_trip_stats.show()

                                                                                

+---------------------+------------------------+
|average_trip_duration|average_trip_distance_km|
+---------------------+------------------------+
|    959.4922729603659|       3.440863902010865|
+---------------------+------------------------+



In [6]:
from pyspark.sql.functions import hour, dayofweek, month

# Add time columns (if not already added in previous feature engineering eda notebook)
df = df.withColumn("hour", hour("pickup_datetime"))
df = df.withColumn("day_of_week", dayofweek("pickup_datetime"))
df = df.withColumn("month", month("pickup_datetime"))

# Cache the df
df.cache()

# Trigger caching with an action
df.count()

# Group by new time columns and calculate averages
time_analysis = df.groupBy("hour", "day_of_week", "month").agg(
    avg("trip_duration").alias("avg_duration"),
    avg("trip_distance_km").alias("avg_distance")
).orderBy("day_of_week", "hour", "month")

time_analysis.show()


[Stage 17:>                                                         (0 + 8) / 8]

+----+-----------+-----+------------------+------------------+
|hour|day_of_week|month|      avg_duration|      avg_distance|
+----+-----------+-----+------------------+------------------+
|   0|          1|    1| 886.5953488372093|3.4501344315200257|
|   0|          1|    2| 869.2132132132132|3.4308650655189044|
|   0|          1|    3| 910.7371512481644|3.4837120887246047|
|   0|          1|    4|1017.5971153846153|3.5646453594082033|
|   0|          1|    5| 885.5396618985695|3.5978548803406696|
|   0|          1|    6| 909.3114840062926|3.5613075183708562|
|   1|          1|    1| 763.2528604118993|3.5783930983861048|
|   1|          1|    2|1054.0252525252524|3.4565260078008913|
|   1|          1|    3| 1038.832881172002| 3.585004270544865|
|   1|          1|    4| 886.9234943027673|  3.57010856369486|
|   1|          1|    5| 1020.668815071889| 3.706051193649675|
|   1|          1|    6| 871.3853718500308|3.6974887099880167|
|   2|          1|    1| 926.6111488783141|3.5897716444

                                                                                

## Identify Top 10 Pickup and Drop-off Locations

In [7]:
from pyspark.sql.functions import col

# Top 10 pickup locations
top_pickup_locations = df.groupBy("pickup_latitude", "pickup_longitude").count().orderBy(col("count").desc()).limit(10)
top_pickup_locations.show()

# Top 10 dropoff locations
top_dropoff_locations = df.groupBy("dropoff_latitude", "dropoff_longitude").count().orderBy(col("count").desc()).limit(10)
top_dropoff_locations.show()


                                                                                

+------------------+------------------+-----+
|   pickup_latitude|  pickup_longitude|count|
+------------------+------------------+-----+
| 40.82100296020508|-73.95466613769531|   39|
| 40.77378845214844|-73.87093353271484|   15|
| 40.77376937866211|-73.87093353271484|   14|
| 40.77376174926758| -73.8708724975586|   14|
|40.773738861083984| -73.8708724975586|   14|
|  40.7741584777832|-73.87303924560547|   14|
|  40.7741813659668|-73.87300872802734|   14|
| 40.77376174926758|-73.87091064453125|   13|
| 40.77381134033203|-73.87095642089844|   13|
| 40.77410888671875|-73.87303161621094|   13|
+------------------+------------------+-----+



[Stage 23:>                                                         (0 + 8) / 8]

+------------------+------------------+-----+
|  dropoff_latitude| dropoff_longitude|count|
+------------------+------------------+-----+
| 40.82100296020508|-73.95466613769531|   39|
|40.750389099121094|-73.99468231201172|   10|
| 40.76057815551758|-74.00276947021484|   10|
| 40.76055145263672|-74.00276947021484|    9|
|40.750370025634766|-73.99466705322266|    8|
| 40.75040817260742|-73.99465942382812|    8|
| 40.75046157836914|-73.99466705322266|    8|
|40.770591735839844|-73.86512756347656|    7|
| 40.75014877319336|-73.99126434326172|    7|
| 40.76839828491211|-73.86177825927734|    7|
+------------------+------------------+-----+



                                                                                

In [8]:
# unpersist the df to free up resources
df.unpersist()

DataFrame[id: string, vendor_id: int, pickup_datetime: timestamp, dropoff_datetime: timestamp, passenger_count: int, pickup_longitude: double, pickup_latitude: double, dropoff_longitude: double, dropoff_latitude: double, store_and_fwd_flag: string, trip_duration: int, pickup_dayofweek: int, pickup_hour: int, pickup_month: int, pickup_year: int, trip_distance_km: double, hour: int, day_of_week: int, month: int]

In [9]:
# Close the current session
spark.stop()