In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import Window, functions as F

# Start (or reuse) the Spark session for this notebook. The settings raise the
# driver/executor heap, enable off-heap memory, and pin the driver to loopback.
_spark_settings = [
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "4g"),
    ("spark.memory.offHeap.enabled", "true"),
    ("spark.memory.offHeap.size", "4g"),
    ("spark.driver.host", "127.0.0.1"),
    ("spark.driver.bindAddress", "127.0.0.1"),
]
_session_builder = SparkSession.builder.appName("CompareShortTrips")
for _setting_name, _setting_value in _spark_settings:
    # Applied in the listed order, one builder option at a time.
    _session_builder = _session_builder.config(_setting_name, _setting_value)
spark = _session_builder.getOrCreate()

In [12]:
def haversine(lat1_col, lon1_col, lat2_col, lon2_col, radius=6371000):
    """
    Calculate the great circle distance between two points on a sphere
    (coordinates specified in decimal degrees) with the haversine formula.

    Parameters
    ----------
    lat1_col, lon1_col : latitude / longitude of the first point, in degrees.
    lat2_col, lon2_col : latitude / longitude of the second point, in degrees.
        All four are passed straight to ``F.radians``, so they are expected to
        be Spark column expressions.
    radius : sphere radius; the result is expressed in the same unit.
        Defaults to 6371000 (Earth radius in meters), which preserves the
        original behaviour. Pass roughly 3959 to obtain miles instead.

    Returns
    -------
    An expression holding the distance (``central_angle * radius``).
    """
    # convert decimal degrees to radians
    lat1 = F.radians(lat1_col)
    lon1 = F.radians(lon1_col)
    lat2 = F.radians(lat2_col)
    lon2 = F.radians(lon2_col)

    # haversine formula: `a` is the squared half-chord length between the
    # points, `c` the central angle in radians.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = F.sin(dlat/2)**2 + F.cos(lat1) * F.cos(lat2) * F.sin(dlon/2)**2
    c = 2 * F.asin(F.sqrt(a))
    return c * radius


In [13]:
# Build a lookup with exactly one row per (LINE_ID, STOP_ID, DIRECTION_ID) giving
# the stop that follows STOP_ID within a trip (TRIP_STOP_ID_NEXT). It is used
# later to recognise rides that alight at the very next stop.

# Order the stops of each trip so lead() returns the following stop.
next_stop_window = Window.partitionBy("TRIP_ID").orderBy(F.col("STOP_SEQUENCE").asc())

# Different trips can report different next stops for the same key (and the last
# stop of a trip has none). Keeping an arbitrary row, as a plain dropDuplicates
# on the key does, makes the lookup change from run to run and may keep a null
# even though a real next stop exists. Rank the candidates instead:
# non-null first, then the most frequently observed, then the smallest id.
next_stop_rank_window = Window.partitionBy("LINE_ID", "STOP_ID", "DIRECTION_ID").orderBy(
    F.col("TRIP_STOP_ID_NEXT").isNull().asc(),
    F.col("N_OBSERVATIONS").desc(),
    F.col("TRIP_STOP_ID_NEXT").asc(),
)

stop_times = (
    spark.read.parquet("../data/02_intermediate/stop_times_avl/stop_times")
    # one row per stop visit of a trip
    .dropDuplicates(["TRIP_ID", "STOP_SEQUENCE"])
    .withColumn("STOP_ID_NEXT", F.lead(F.col("STOP_ID"), 1).over(next_stop_window))
    .select(
        F.col("ROUTE_ID_OLD").alias("LINE_ID"),
        F.col("STOP_ID"),
        F.col("STOP_ID_NEXT").alias("TRIP_STOP_ID_NEXT"),
        F.col("DIRECTION_ID"),
    )
    # count how often each candidate next stop is observed for a key
    .groupBy("LINE_ID", "STOP_ID", "TRIP_STOP_ID_NEXT", "DIRECTION_ID")
    .agg(F.count("*").alias("N_OBSERVATIONS"))
    # keep the best-ranked candidate, so every key still appears exactly once
    .withColumn("NEXT_STOP_RANK", F.row_number().over(next_stop_rank_window))
    .filter(F.col("NEXT_STOP_RANK") == 1)
    .select("LINE_ID", "STOP_ID", "TRIP_STOP_ID_NEXT", "DIRECTION_ID")
)

In [14]:
# Rider events: the input from which every per-journey feature below is derived.
journeys = spark.read.parquet("../data/03_primary/rider_events_partitioned")
# Events of one journey of one card, in chronological order.
journey_window = Window.partitionBy("CARD_ID", "JOURNEY_ID").orderBy(F.asc("DATETIME"))
# All events of one card (across its journeys), in chronological order.
card_window = Window.partitionBy("CARD_ID").orderBy(F.asc("DATETIME"))

# Per-journey flag (1/0): the journey's ORIGIN event is directly followed by its
# DESTINATION event, and that destination stop equals the next stop listed for
# the boarding (line, stop, direction) in the stop_times lookup.
single_stop_rides = (
    journeys
    # look one event ahead inside the journey
    .withColumn("STOP_ID_NEXT", F.lead("STOP_ID", 1).over(journey_window))
    .withColumn("EVENT_TYPE_NEXT", F.lead("EVENT_TYPE", 1).over(journey_window))
    # keep only origins whose very next event is the destination
    .filter(
        (F.col("EVENT_TYPE") == "ORIGIN")
        & (F.col("EVENT_TYPE_NEXT") == "DESTINATION")
    )
    # attach the next stop known for the boarding stop; unmatched rows stay null
    .join(
        stop_times.select("LINE_ID", "STOP_ID", "DIRECTION_ID", "TRIP_STOP_ID_NEXT"),
        on=["LINE_ID", "STOP_ID", "DIRECTION_ID"],
        how="left",
    )
    .withColumn(
        "IS_SINGLE_STOP_RIDE",
        F.when(F.col("STOP_ID_NEXT") == F.col("TRIP_STOP_ID_NEXT"), 1).otherwise(0),
    )
    # collapse to one row per journey
    .groupBy("JOURNEY_ID")
    .agg(F.max("IS_SINGLE_STOP_RIDE").alias("IS_SINGLE_STOP_RIDE"))
)
def _with_distance_to_next_event(events, window, distance_col):
    """
    Add the haversine distance from each event's stop to the next event's stop.

    Parameters
    ----------
    events : DataFrame with STOP_LAT and STOP_LON columns.
    window : window spec that defines which event counts as "next".
    distance_col : name of the distance column to add.

    Returns
    -------
    `events` restricted to rows that have a following event with a non-null
    STOP_LAT, with STOP_LAT_NEXT, STOP_LON_NEXT and `distance_col` added.
    The distance unit is whatever `haversine` returns by default.
    """
    return (
        events
        .withColumn("STOP_LAT_NEXT", F.lead(F.col("STOP_LAT"), 1).over(window))
        .withColumn("STOP_LON_NEXT", F.lead(F.col("STOP_LON"), 1).over(window))
        .filter(F.col("STOP_LAT_NEXT").isNotNull())
        .withColumn(
            distance_col,
            haversine(
                F.col("STOP_LAT"),
                F.col("STOP_LON"),
                F.col("STOP_LAT_NEXT"),
                F.col("STOP_LON_NEXT"),
            ),
        )
    )


# Boardings and alightings only; shared by the leg and transfer distances below.
boarding_alighting_events = journeys.filter(
    F.col("EVENT").isin(["BOARDED", "ALIGHTED"])
)

#get journey legs: distance from each boarding to the following event of the journey
legs = (
    _with_distance_to_next_event(
        boarding_alighting_events, journey_window, "LEG_DISTANCE_METERS"
    )
    .filter(F.col("EVENT") == "BOARDED")
    .groupBy("JOURNEY_ID")
    .agg(
        F.min("LEG_DISTANCE_METERS").alias("LEG_DISTANCE_METERS_MIN"),
        F.max("LEG_DISTANCE_METERS").alias("LEG_DISTANCE_METERS_MAX"),
        F.sum("LEG_DISTANCE_METERS").alias("TOTAL_JOURNEY_DISTANCE_TRAVELED_METERS"),
    )
)

#get number of legs (one per boarding)
n_legs = journeys.filter(
    F.col("EVENT").isin(["BOARDED"])
    ).groupBy(["JOURNEY_ID"]).agg(F.count("*").alias("N_LEGS"))

#get distance between alighting location and next boarding (within the same journey)
# `card_id` duplicates `card_window` and is not used in this cell; the name is
# retained only in case other cells reference it.
card_id = card_window
distances_from_alighting_to_next_boarding = (
    _with_distance_to_next_event(
        boarding_alighting_events,
        journey_window,
        "ALIGHTING_TO_NEXT_BOARDING_DISTANCE_METERS",
    )
    .filter(F.col("EVENT") == "ALIGHTED")
    .groupBy("JOURNEY_ID")
    .agg(
        F.max(
            F.col("ALIGHTING_TO_NEXT_BOARDING_DISTANCE_METERS")
        ).alias("ALIGHTING_TO_NEXT_BOARDING_DISTANCE_METERS_MAX"),
        F.min(
            F.col("ALIGHTING_TO_NEXT_BOARDING_DISTANCE_METERS")
        ).alias("ALIGHTING_TO_NEXT_BOARDING_DISTANCE_METERS_MIN"),
    )
)

#lines used
lines_used = journeys.select(
    "JOURNEY_ID",
    "LINE_ID"
    ).distinct().groupBy("JOURNEY_ID").agg(F.collect_list(F.col("LINE_ID")).alias("LINE_IDS_USED"))
# Line ids that set INCLUDES_MAX.
# NOTE(review): presumably the ids of the "MAX" lines -- confirm against the line catalogue.
MAX_LINE_IDS = (200, 100, 190, 90, 290)
_max_line_checks = [
    F.array_contains(F.col("LINE_IDS_USED"), _max_line_id)
    for _max_line_id in MAX_LINE_IDS
]
# OR the per-line checks together, in the listed order.
_includes_max_line = _max_line_checks[0]
for _max_line_check in _max_line_checks[1:]:
    _includes_max_line = _includes_max_line | _max_line_check
lines_used = lines_used.withColumn(
    "INCLUDES_MAX",
    F.when(_includes_max_line, 1).otherwise(0),
)

#confidences (taken from alighting events only)
confidences = journeys.filter(F.col("EVENT") == "ALIGHTED").groupBy(["JOURNEY_ID"]).agg(
    F.min(F.col("CONFIDENCE")).alias("CONFIDENCE_MIN"),
    F.max(F.col("CONFIDENCE")).alias("CONFIDENCE_MAX"),
    F.mean(F.col("CONFIDENCE")).alias("CONFIDENCE_MEAN"),
    )

#get distance from origin boarding to destination alighting
distances_from_origin_to_destination = (
    _with_distance_to_next_event(
        journeys.filter(F.col("EVENT_TYPE").isin(["ORIGIN", "DESTINATION"])),
        journey_window,
        "ORIGIN_DESTINATION_DISTANCE_METERS",
    )
    .filter(F.col("EVENT_TYPE") == "ORIGIN")
    .select("JOURNEY_ID", "ORIGIN_DESTINATION_DISTANCE_METERS")
)

#get distances from origin to origin (next ORIGIN event of the same card)
# The input holds ORIGIN events only, so no further EVENT_TYPE filter is needed.
distances_from_origin_to_origin = (
    _with_distance_to_next_event(
        journeys.filter(F.col("EVENT_TYPE").isin(["ORIGIN"])),
        card_window,
        "ORIGIN_ORIGIN_DISTANCE_METERS",
    )
    .select("JOURNEY_ID", "ORIGIN_ORIGIN_DISTANCE_METERS")
)
# Longest gap in seconds, per journey, between a boarding and the next boarding.
# NOTE(review): the lead runs over card_window (partitioned by CARD_ID only), so
# the last boarding of a journey is compared with the first boarding of the
# card's following journey -- confirm this is intended.
_seconds_to_next_boarding = (
    F.lead("DATETIME", 1).over(card_window).cast("long")
    - F.col("DATETIME").cast("long")
)
inter_boarding_times = (
    journeys
    .where(F.col("EVENT") == "BOARDED")
    .withColumn("INTER_BOARDING_TIME_SECONDS", _seconds_to_next_boarding)
    .groupBy("JOURNEY_ID")
    .agg(
        F.max("INTER_BOARDING_TIME_SECONDS").alias("MAX_INTER_BOARDING_TIME_SECONDS")
    )
)
# Journey duration in seconds: DESTINATION timestamp minus ORIGIN timestamp.
_origin_times = (
    journeys
    .where(F.col("EVENT_TYPE") == "ORIGIN")
    .select("JOURNEY_ID", F.col("DATETIME").alias("ORIGIN_DATETIME"))
)
_destination_times = (
    journeys
    .where(F.col("EVENT_TYPE") == "DESTINATION")
    .select("JOURNEY_ID", F.col("DATETIME").alias("DESTINATION_DATETIME"))
)
# Default (inner) join: journeys missing either endpoint produce no row here.
journey_times = (
    _origin_times
    .join(_destination_times, on=["JOURNEY_ID"])
    .withColumn(
        "JOURNEY_TIME_SECONDS",
        F.col("DESTINATION_DATETIME").cast("long") - F.col("ORIGIN_DATETIME").cast("long"),
    )
    .select("JOURNEY_ID", "JOURNEY_TIME_SECONDS")
)
# Assemble one profile row per journey by left-joining every per-journey
# feature table onto the distinct journey keys, then persist the result.
# NOTE(review): VALIDITY_SCORE is not among the columns printed by the
# journeys preview later in this notebook -- confirm the column exists in the input.
journey_profiles = journeys.select(
    "JOURNEY_ID",
    "CARD_ID",
    "VALIDITY_SCORE",
).distinct()

# The join order fixes the column order of the written dataset.
_profile_feature_tables = [
    distances_from_alighting_to_next_boarding,
    n_legs,
    legs,
    distances_from_origin_to_destination,
    lines_used,
    confidences,
    distances_from_origin_to_origin,
    single_stop_rides,
    inter_boarding_times,
    journey_times,
]
for _feature_table in _profile_feature_tables:
    journey_profiles = journey_profiles.join(
        _feature_table,
        on=["JOURNEY_ID"],
        how="left",
    )

# Journeys without a match in single_stop_rides come back null; report them as 0.
journey_profiles = journey_profiles.withColumn(
    "IS_SINGLE_STOP_RIDE",
    F.when(F.col("IS_SINGLE_STOP_RIDE") == 1, 1).otherwise(0),
)
journey_profiles.write.mode("overwrite").parquet("journey_profiles/")

                                                                                

In [15]:
# Preview the first 20 per-journey single-stop flags (long values truncated).
single_stop_rides.show(n=20, truncate=True)



+--------------------+-------------------+
|          JOURNEY_ID|IS_SINGLE_STOP_RIDE|
+--------------------+-------------------+
|0a6824dd50e6271cc...|                  0|
|91becc0e3034ba484...|                  1|
|da4fb790ed6054f9a...|                  0|
|7ad2c39c6110afd16...|                  0|
|bc10c2cc60fe6e36a...|                  0|
|ab3d7e6d4e875d9a5...|                  0|
|0cc5518fc674ce956...|                  0|
|e3b664ffe3bd81b95...|                  0|
|45eeab6e4c047e29b...|                  0|
|5f479383a69655aab...|                  0|
|c5587dc40535370ef...|                  0|
|591a6f9bcee9c1129...|                  0|
|e33b590d8b56efbb5...|                  0|
|dc85490b89b3282d3...|                  0|
|fbef763e9fd64be41...|                  0|
|c8e24f3b40df825ef...|                  0|
|c5b4e3655415e683b...|                  0|
|a5d037c61f4850bf5...|                  0|
|52afd99c78ab9abf2...|                  0|
|97bfcc31f90a9f0d4...|                  0|
+----------

                                                                                

In [10]:
# List the distinct EVENT_TYPE values present in the rider events.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom to confirm it still works.
journeys.select("EVENT_TYPE").dropDuplicates().show()

+-----------+
| EVENT_TYPE|
+-----------+
|DESTINATION|
|MID_JOURNEY|
|     ORIGIN|
+-----------+



                                                                                

In [16]:
# Spot-check rider events recorded at stop ids 13066 through 13070.
_stop_ids_to_inspect = list(range(13066, 13071))
journeys.where(F.col("STOP_ID").isin(_stop_ids_to_inspect)).show()

+----------+-------------------------+--------+-------+-------+-------+--------+--------+------------+-----+----------+----------+-------+------------------+
|JOURNEY_ID|FARE_CATEGORY_DESCRIPTION|DATETIME|STOP_ID|CARD_ID|LINE_ID|STOP_LAT|STOP_LON|DIRECTION_ID|EVENT|CONFIDENCE|EVENT_TYPE|IS_LOOP|JOURNEY_START_DATE|
+----------+-------------------------+--------+-------+-------+-------+--------+--------+------------+-----+----------+----------+-------+------------------+
+----------+-------------------------+--------+-------+-------+-------+--------+--------+------------+-----+----------+----------+-------+------------------+

