In [1]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("CompareShortTrips") \
    .getOrCreate()

# Set the logging level to ERROR
spark.sparkContext.setLogLevel("ERROR")


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/09 11:36:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
#build next stop id dataframe
from pyspark.sql import Window
from pyspark.sql import functions as F

def haversine(lat1_col, lon1_col, lat2_col, lon2_col):
    """
    Calculate the great circle distance in meters between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lat1 = F.radians(lat1_col)
    lon1 = F.radians(lon1_col)
    lat2 = F.radians(lat2_col)
    lon2 = F.radians(lon2_col)
    
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = F.sin(dlat/2)**2 + F.cos(lat1) * F.cos(lat2) * F.sin(dlon/2)**2
    c = 2 * F.asin(F.sqrt(a)) 
    r = 6371000  # Radius of earth in meters. Use 3956 for miles. Determines return value units.
    return c * r

def get_percentile_columns(column,tag,step):
    """
    This function takes a Spark DataFrame column and returns a list of columns representing
    the percentiles from 0 to 100 in 5 percent increments.
    """
    percentiles = [i for i in range(0, 100+step,step)]
    return [F.percentile_approx(column,p/100).alias(f"PERCENTILE_{p}_{tag}") for p in percentiles]

def prepare_journeys(journeys,stop_times):
    alighting_stop_window = Window.partitionBy("CARD_ID").orderBy(F.col("DATETIME").asc())
    journeys = journeys.join(stop_times.dropDuplicates(),on = ["LINE_ID","STOP_ID"],how = "left")
    journeys = journeys.dropDuplicates(["CARD_ID","JOURNEY_ID","DATETIME","EVENT"])
    journeys = journeys.withColumn(
        "EVENT_NEXT_NEXT",
        F.lead(F.col("EVENT"),2).over(alighting_stop_window)
        ).withColumn(
        "EVENT_TYPE_NEXT_NEXT",
        F.lead(F.col("EVENT_TYPE"),2).over(alighting_stop_window)
        )
    journeys = journeys.withColumn(
        "STOP_ID_NEXT",
        F.lead(F.col("STOP_ID"),1).over(alighting_stop_window)
        ).withColumn(
        "STOP_LAT_NEXT",
        F.lead(F.col("STOP_LAT"),1).over(alighting_stop_window)
        ).withColumn(
        "STOP_LON_NEXT",
        F.lead(F.col("STOP_LON"),1).over(alighting_stop_window)
        ).withColumn(
        "STOP_ID_NEXT_NEXT",
        F.lead(F.col("STOP_ID"),2).over(alighting_stop_window)
        ).withColumn(
        "STOP_LAT_NEXT_NEXT",
        F.lead(F.col("STOP_LAT"),2).over(alighting_stop_window)
        ).withColumn(
        "STOP_LON_NEXT_NEXT",
        F.lead(F.col("STOP_LON"),2).over(alighting_stop_window)
        ).withColumn(
        "DATETIME_NEXT_NEXT",
        F.lead(F.col("DATETIME"),2).over(alighting_stop_window)
        ).withColumn(
            "CONFIDENCE_NEXT",
            F.lead(F.col("CONFIDENCE"),1).over(alighting_stop_window)
        )
    journeys = journeys.withColumn("DISTANCE_TO_NEXT_STOP",haversine(
        F.col("STOP_LAT"),
        F.col("STOP_LON"),
        F.col("STOP_LAT_NEXT"),
        F.col("STOP_LON_NEXT"))
    )
    journeys = journeys.withColumn("DISTANCE_TO_NEXT_NEXT_STOP",haversine(
        F.col("STOP_LAT"),
        F.col("STOP_LON"),
        F.col("STOP_LAT_NEXT_NEXT"),
        F.col("STOP_LON_NEXT_NEXT"))
    )
    journeys = journeys.withColumn("ALIGHTING_DISTANCE_TO_NEXT_STOP",haversine(
        F.col("STOP_LAT_NEXT"),
        F.col("STOP_LON_NEXT"),
        F.col("STOP_LAT_NEXT_NEXT"),
        F.col("STOP_LON_NEXT_NEXT"))
    )
    journeys = journeys.withColumn(
        "TIME_TO_NEXT_ORIGIN",
        F.datediff(F.col("DATETIME_NEXT_NEXT"),
                F.col("DATETIME")
                ))
    return journeys

def analyze_single_leg_trips(journeys):
    single_leg_trips = journeys.filter(F.col("EVENT_TYPE") == "ORIGIN").filter(F.col("EVENT_TYPE_NEXT_NEXT") == "ORIGIN")
    single_leg_trips = single_leg_trips.withColumn(
    "IS_NULL_JOURNEY",
    F.when(
        F.col("STOP_ID")==F.col("STOP_ID_NEXT_NEXT"),
        F.lit("Y")
    ).otherwise(F.lit("N"))).withColumn(
    "IS_SINGLE_STOP_RIDE",
    F.when(
        F.col("STOP_ID_NEXT")==F.col("TRIP_STOP_ID_NEXT"),
        F.lit("Y")
    ).otherwise(F.lit("N"))
    ).withColumn("ALIGHT_AT_NEXT_START_LOCATION",
         F.when(F.col("STOP_ID_NEXT")==F.col("STOP_ID_NEXT_NEXT"),
        F.lit("Y")
    ).otherwise(F.lit("N")))
    total_count = single_leg_trips.select(F.count("*").alias("TOTAL_JOURNEYS"))
    single_leg_trips_analysis = single_leg_trips.groupBy(
        "IS_SINGLE_STOP_RIDE",
        "ALIGHT_AT_NEXT_START_LOCATION",
        ).agg(
    F.count("*").alias("N_JOURNEYS"),
    *[
        3.3*F.percentile_approx("ALIGHTING_DISTANCE_TO_NEXT_STOP",p/100).alias(
        f"PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_{p}").cast("int")
        for p in range(0,120,20)
        ]
    # *[
    #     3.3*F.percentile_approx("DISTANCE_TO_NEXT_STOP",p/100).alias(
    #     f"PERCENTILE_RIDE_DISTANCE_{p}").cast("int")
    #     for p in range(0,120,20)
    #     ]
    
    )
    single_leg_trips_analysis = single_leg_trips_analysis.join(total_count)
    single_leg_trips_analysis = single_leg_trips_analysis.withColumn(
        "PERCENT_JOURNEYS",
        (100*F.col("N_JOURNEYS")/F.col("TOTAL_JOURNEYS")).cast("float")).drop("TOTAL_JOURNEYS")
    return single_leg_trips_analysis,single_leg_trips


In [7]:
stop_times = spark.read.parquet("../data/02_intermediate/stop_times_avl/stop_times")
stop_times = stop_times.dropDuplicates(["TRIP_ID","STOP_SEQUENCE"])
next_stop_window = Window.partitionBy("TRIP_ID").orderBy(F.col("STOP_SEQUENCE").asc())
stop_times = stop_times.withColumn(
    "STOP_ID_NEXT",
    F.lead(F.col("STOP_ID"),1).over(next_stop_window)
    )
stop_times = stop_times.select(
    F.col("ROUTE_ID_OLD").alias("LINE_ID"),
    F.col("STOP_ID"),
    F.col("STOP_ID_NEXT").alias("TRIP_STOP_ID_NEXT"),
    F.col("DIRECTION_ID"),
).dropDuplicates(["LINE_ID","STOP_ID","DIRECTION_ID"])
stop_times = stop_times.withColumn(
    "IS_MAX",
    F.when(F.col("LINE_ID").isin([200,190,290,90,100]),"Y").otherwise("N")
    ).cache()
n_lines_per_stop = stop_times.filter(
    F.col("IS_MAX") == "Y").select("STOP_ID","LINE_ID").groupBy("STOP_ID").agg(
    F.countDistinct(F.col("LINE_ID")).alias("N_MAX_LINES")
    ).cache()
control_journeys = spark.read.parquet("../data/control_run_2/rider_events_partitioned")
control_journeys = prepare_journeys(control_journeys,stop_times)
control_journeys = control_journeys.withColumn(
    "IS_MAX",
    F.when(F.col("LINE_ID").isin([200,190,290,90,100]),"Y").otherwise("N")
    ).cache()
single_leg_control_journeys_analysis,single_leg_control_journeys = analyze_single_leg_trips(control_journeys.filter(F.col("IS_MAX")=="Y"))
fixed_journeys = spark.read.parquet("../data/03_primary/rider_events_partitioned")
fixed_journeys = fixed_journeys.join(control_journeys.select("CARD_ID").distinct(),on = ["CARD_ID"],how = "right")
fixed_journeys = prepare_journeys(fixed_journeys,stop_times)
fixed_journeys = fixed_journeys.withColumn(
    "IS_MAX",
    F.when(F.col("LINE_ID").isin([200,190,290,90,100]),"Y").otherwise("N")
    ).cache()
single_leg_fixed_journeys_analysis,single_leg_fixed_journeys = analyze_single_leg_trips(fixed_journeys.filter(F.col("IS_MAX")=="Y"))


In [8]:
single_leg_control_journeys_analysis.toPandas().sort_values(
    by = ["IS_SINGLE_STOP_RIDE","ALIGHT_AT_NEXT_START_LOCATION"]
    ).T

                                                                                

Unnamed: 0,3,2,1,0
IS_SINGLE_STOP_RIDE,N,N,Y,Y
ALIGHT_AT_NEXT_START_LOCATION,N,Y,N,Y
N_JOURNEYS,85417,20284,83623,964
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.0, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_0 AS INT) * 3.3)",29.7,0.0,29.7,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.2, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_20 AS INT) * 3.3)",191.4,0.0,2277.0,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.4, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_40 AS INT) * 3.3)",1250.7,0.0,5695.8,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.6, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_60 AS INT) * 3.3)",6180.9,0.0,16952.1,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.8, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_80 AS INT) * 3.3)",23007.6,0.0,33815.1,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 1.0, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_100 AS INT) * 3.3)",179335.2,0.0,143391.6,0.0
PERCENT_JOURNEYS,44.888275,10.659632,43.945492,0.5066


In [9]:
single_leg_fixed_journeys_analysis.toPandas().sort_values(
    by = ["IS_SINGLE_STOP_RIDE","ALIGHT_AT_NEXT_START_LOCATION"]
    ).T

                                                                                

Unnamed: 0,3,2,1,0
IS_SINGLE_STOP_RIDE,N,N,Y,Y
ALIGHT_AT_NEXT_START_LOCATION,N,Y,N,Y
N_JOURNEYS,128297,17366,69050,1623
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.0, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_0 AS INT) * 3.3)",29.7,0.0,29.7,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.2, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_20 AS INT) * 3.3)",184.8,0.0,2564.1,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.4, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_40 AS INT) * 3.3)",445.5,0.0,6745.2,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.6, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_60 AS INT) * 3.3)",2993.1,0.0,18374.4,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 0.8, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_80 AS INT) * 3.3)",17760.6,0.0,33950.4,0.0
"(CAST(percentile_approx(ALIGHTING_DISTANCE_TO_NEXT_STOP, 1.0, 10000) AS PERCENTILE_DISTANCE_FROM_ALIGHTING_TO_NEXT_ORIGIN_100 AS INT) * 3.3)",174985.8,0.0,146582.7,0.0
PERCENT_JOURNEYS,59.304508,8.027328,31.917942,0.750222


In [17]:
check_fixed = single_leg_fixed_journeys.filter(
    F.col("IS_SINGLE_STOP_RIDE") == F.lit("Y")
    ).groupBy("LINE_ID").agg(F.count("*").alias("count"),F.mean(F.col("CONFIDENCE_NEXT"))).toPandas()
check_fixed["percentage"] = check_fixed["count"]/check_fixed["count"].sum()*100
view_fixed = check_fixed[check_fixed["percentage"] > 0.01].sort_values(by = ["LINE_ID"],ascending = False)

                                                                                

In [7]:
check_control = single_leg_control_journeys.filter(
    F.col("IS_SINGLE_STOP_RIDE") == F.lit("Y")
    ).groupBy("LINE_ID").agg(F.count("*").alias("count"),F.mean(F.col("CONFIDENCE_NEXT"))).toPandas()
check_control["percentage"] = check_control["count"]/check_control["count"].sum()*100
view_control = check_control[check_control["percentage"] > 0.01].sort_values(by = ["LINE_ID"],ascending = False)

                                                                                

In [8]:
compare_lines = view_control.merge(view_fixed,on = ["LINE_ID"],how = "outer",suffixes = ("_control","_fixed"))
compare_lines[sorted(compare_lines.columns)]

Unnamed: 0,LINE_ID,avg(CONFIDENCE_NEXT)_control,avg(CONFIDENCE_NEXT)_fixed,count_control,count_fixed,percentage_control,percentage_fixed
0,290,0.456426,0.48846,8399,6034,9.929422,8.537914
1,200,0.437808,0.529314,8269,11242,9.775734,15.907065
2,190,0.442411,0.545696,7663,5515,9.059312,7.803546
3,100,0.470434,0.58217,36123,35249,42.705144,49.87619
4,90,0.445752,0.53964,24133,12633,28.530389,17.875285


In [9]:
single_leg_control_journeys.groupBy("IS_SINGLE_STOP_RIDE","IS_MAX").agg(
    F.count("*").alias("count"),
    F.mean(F.col("CONFIDENCE_NEXT")),
    F.stddev(F.col("CONFIDENCE_NEXT")),
    F.mean(F.col("ALIGHTING_DISTANCE_TO_NEXT_STOP")),
    F.stddev(F.col("ALIGHTING_DISTANCE_TO_NEXT_STOP"))).toPandas()

                                                                                

Unnamed: 0,IS_SINGLE_STOP_RIDE,IS_MAX,count,avg(CONFIDENCE_NEXT),stddev_samp(CONFIDENCE_NEXT),avg(ALIGHTING_DISTANCE_TO_NEXT_STOP),stddev_samp(ALIGHTING_DISTANCE_TO_NEXT_STOP)
0,Y,Y,84587,0.456273,0.105544,5458.14698,5949.900136
1,N,Y,105701,0.404082,0.184955,2872.698331,5134.213634


In [10]:
single_leg_fixed_journeys = single_leg_fixed_journeys.withColumn(
    "IS_MAX",
    F.when(F.col("LINE_ID").isin([200,190,290,90,100]),"Y").otherwise("N")
    )
single_leg_fixed_journeys.groupBy("IS_SINGLE_STOP_RIDE","IS_MAX").agg(
    F.count("*").alias("count"),
    F.mean(F.col("CONFIDENCE_NEXT")),
    F.stddev(F.col("CONFIDENCE_NEXT")),
    F.mean(F.col("ALIGHTING_DISTANCE_TO_NEXT_STOP")),
    F.stddev(F.col("ALIGHTING_DISTANCE_TO_NEXT_STOP"))).toPandas()

                                                                                

Unnamed: 0,IS_SINGLE_STOP_RIDE,IS_MAX,count,avg(CONFIDENCE_NEXT),stddev_samp(CONFIDENCE_NEXT),avg(ALIGHTING_DISTANCE_TO_NEXT_STOP),stddev_samp(ALIGHTING_DISTANCE_TO_NEXT_STOP)
0,Y,Y,70673,0.555313,0.258876,5603.369794,5932.335283
1,N,Y,145663,0.457442,0.196698,2451.779079,4605.78044


In [11]:
single_leg_control_journeys.filter(F.col("IS_MAX") == "Y").groupBy("IS_SINGLE_STOP_RIDE").agg(
    F.count("*"),
    *[F.percentile_approx(F.col("CONFIDENCE_NEXT"),p/100) for p in range(0,120,20)]
    ).toPandas().T.sort_values(by = ["IS_SINGLE_STOP_RIDE"],axis = 1)

                                                                                

Unnamed: 0,1,0
IS_SINGLE_STOP_RIDE,N,Y
count(1),105701,84587
"percentile_approx(CONFIDENCE_NEXT, 0.0, 10000)",0.0,0.0
"percentile_approx(CONFIDENCE_NEXT, 0.2, 10000)",0.273197,0.38382
"percentile_approx(CONFIDENCE_NEXT, 0.4, 10000)",0.390432,0.426805
"percentile_approx(CONFIDENCE_NEXT, 0.6, 10000)",0.454424,0.469992
"percentile_approx(CONFIDENCE_NEXT, 0.8, 10000)",0.529993,0.524687
"percentile_approx(CONFIDENCE_NEXT, 1.0, 10000)",1.0,1.0


In [10]:
single_leg_fixed_journeys.filter(F.col("IS_MAX") == "Y").groupBy("IS_SINGLE_STOP_RIDE").agg(F.count("*"),
    *[F.percentile_approx(F.col("CONFIDENCE_NEXT"),p/100) for p in range(0,120,20)]
    ).toPandas().T.sort_values(by = ["IS_SINGLE_STOP_RIDE"],axis = 1)

                                                                                

Unnamed: 0,1,0
IS_SINGLE_STOP_RIDE,N,Y
count(1),145663,70673
"percentile_approx(CONFIDENCE_NEXT, 0.0, 10000)",0.0,0.0
"percentile_approx(CONFIDENCE_NEXT, 0.2, 10000)",0.284639,0.294804
"percentile_approx(CONFIDENCE_NEXT, 0.4, 10000)",0.398213,0.441488
"percentile_approx(CONFIDENCE_NEXT, 0.6, 10000)",0.492754,0.68875
"percentile_approx(CONFIDENCE_NEXT, 0.8, 10000)",0.61539,0.806693
"percentile_approx(CONFIDENCE_NEXT, 1.0, 10000)",1.0,1.0


In [11]:
check_control_n_lines = single_leg_control_journeys.join(n_lines_per_stop,on = ["STOP_ID"]).groupBy("N_MAX_LINES").agg(
    F.count("*").alias("count"),
    *get_percentile_columns(F.col("CONFIDENCE_NEXT"),"CONFIDENCE",20)
    ).toPandas()
check_fixed_n_lines = single_leg_fixed_journeys.join(n_lines_per_stop,on = ["STOP_ID"]).groupBy("N_MAX_LINES").agg(
    F.count("*").alias("count"),
    *get_percentile_columns(F.col("CONFIDENCE_NEXT"),"CONFIDENCE",20)
    ).toPandas()


                                                                                

In [12]:
check_control_n_lines.merge(check_fixed_n_lines,on = "N_MAX_LINES",suffixes=("_CONTROL","_FIXED")).sort_values(by = ["N_MAX_LINES"]).T

Unnamed: 0,0,2,1
N_MAX_LINES,1.0,2.0,3.0
count_CONTROL,82556.0,65393.0,42339.0
PERCENTILE_0_CONFIDENCE_CONTROL,0.0,0.0,0.0
PERCENTILE_20_CONFIDENCE_CONTROL,0.38003,0.318995,0.3422425
PERCENTILE_40_CONFIDENCE_CONTROL,0.443108,0.386155,0.3977404
PERCENTILE_60_CONFIDENCE_CONTROL,0.489913,0.431887,0.4439668
PERCENTILE_80_CONFIDENCE_CONTROL,0.550407,0.496822,0.5138611
PERCENTILE_100_CONFIDENCE_CONTROL,1.0,1.0,1.0
count_FIXED,96296.0,74668.0,45372.0
PERCENTILE_0_CONFIDENCE_FIXED,0.0,0.0,8.247397000000001e-17


In [13]:
single_leg_control_journeys.filter(
    F.col("IS_MAX")=="Y").select("JOURNEY_ID").distinct().count(),single_leg_fixed_journeys.filter(F.col("IS_MAX")=="Y").select("JOURNEY_ID").distinct().count()

                                                                                

(171688, 216336)

In [44]:
check_counts = fixed_journeys.filter(F.col("EVENT").isin(["BOARDED","ALIGHTED"])).groupBy("CARD_ID","DATETIME","JOURNEY_ID").count()
check_counts.filter(F.col("COUNT")>1).show()
fixed_journeys.filter(F.col("JOURNEY_ID").contains("a092bdeffa055f8b0")).show()

+--------------------+-------------------+--------------------+-----+
|             CARD_ID|           DATETIME|          JOURNEY_ID|count|
+--------------------+-------------------+--------------------+-----+
|c724fccd-3182-fa1...|2024-04-09 07:52:47|a092bdeffa055f8b0...|    2|
|d0455611-166e-0e6...|2024-04-14 10:01:10|ea308a398ac2a2573...|    2|
|ec056711-23e8-cae...|2024-04-08 04:18:27|8028515841b15addd...|    2|
|78dfad0b-b622-26b...|2024-04-27 00:11:12|700c80e3a4ec6caf2...|    2|
|8381ba2f-e6ca-d8d...|2024-04-04 07:06:54|0cf0d0acab3be436c...|    2|
|8dd86836-2b21-66f...|2024-04-10 03:15:36|5a525ea4a23e98c43...|    2|
|ac3a8e5e-e865-c61...|2024-04-23 03:49:42|95715b5c9bdf2b372...|    2|
|baf5f42b-a4e8-325...|2024-04-08 11:33:37|e576eddbdfc3292ce...|    2|
|bc722a87-f7b6-c64...|2024-04-06 02:59:12|b69f124a048b54f8f...|    2|
|bdcf45b4-1a3b-9c5...|2024-04-02 05:37:38|a73503b08f7d006aa...|    2|
|dd9ba4e3-15b9-da5...|2024-04-09 11:36:28|9a97485b15e5c59f4...|    2|
|1e04bb13-612f-13e..

In [45]:
single_leg_fixed_journeys.filter(F.col("STOP_ID") == F.col("STOP_ID_NEXT")).show()

+-------+-------+--------------------+-------------------------+-------------------+--------------------+---------+-----------+------------+-------+----------+----------+-------+------------------+-----------------+------------+------+---------------+--------------------+------------+-------------+-------------+-----------------+------------------+------------------+-------------------+------------------+---------------------+--------------------------+-------------------------------+-------------------+---------------+-------------------+-----------------------------+
|LINE_ID|STOP_ID|          JOURNEY_ID|FARE_CATEGORY_DESCRIPTION|           DATETIME|             CARD_ID| STOP_LAT|   STOP_LON|DIRECTION_ID|  EVENT|CONFIDENCE|EVENT_TYPE|IS_LOOP|JOURNEY_START_DATE|TRIP_STOP_ID_NEXT|DIRECTION_ID|IS_MAX|EVENT_NEXT_NEXT|EVENT_TYPE_NEXT_NEXT|STOP_ID_NEXT|STOP_LAT_NEXT|STOP_LON_NEXT|STOP_ID_NEXT_NEXT|STOP_LAT_NEXT_NEXT|STOP_LON_NEXT_NEXT| DATETIME_NEXT_NEXT|   CONFIDENCE_NEXT|DISTANCE_TO_NEXT

In [15]:
fixed_journeys = spark.read.parquet("../data/03_primary/rider_events_partitioned")
fixed_journeys = fixed_journeys.withColumn(
    "IS_MAX",
    F.when(F.col("LINE_ID").isin([200,190,290,90,100]),"Y").otherwise("N")
    ).cache()

In [30]:
window = Window.partitionBy(F.col("JOURNEY_ID")).orderBy(F.col("DATETIME").asc())
fixed_journeys = fixed_journeys.withColumn(
    "EVENT_TYPE_NEXT",
    F.lead(F.col("EVENT_TYPE"),1).over(window)
    ).withColumn(
    "STOP_ID_NEXT",
    F.lead(F.col("STOP_ID"),1).over(window)
    )
single_legged_journeys = fixed_journeys.filter(
    F.col("EVENT_TYPE")=="ORIGIN"
    ).filter(
    F.col("EVENT_TYPE_NEXT")=="DESTINATION"
)
single_legged_journeys = single_legged_journeys.join(stop_times.drop("IS_MAX"),on = ["LINE_ID","STOP_ID","DIRECTION_ID"],how = "left")

single_legged_journeys.filter(F.col("IS_MAX")=="Y").filter(F.col("STOP_ID_NEXT") == F.col("TRIP_STOP_ID_NEXT")).show()

                                                                                

+-------+-------+------------+--------------------+-------------------------+-------------------+--------------------+---------+-----------+-------+----------+----------+-------+------------------+------+---------------+------------+-----------------+
|LINE_ID|STOP_ID|DIRECTION_ID|          JOURNEY_ID|FARE_CATEGORY_DESCRIPTION|           DATETIME|             CARD_ID| STOP_LAT|   STOP_LON|  EVENT|CONFIDENCE|EVENT_TYPE|IS_LOOP|JOURNEY_START_DATE|IS_MAX|EVENT_TYPE_NEXT|STOP_ID_NEXT|TRIP_STOP_ID_NEXT|
+-------+-------+------------+--------------------+-------------------------+-------------------+--------------------+---------+-----------+-------+----------+----------+-------+------------------+------+---------------+------------+-----------------+
|    290|  13729|           0|029b2060a56c08488...|                    Adult|2024-04-07 01:33:23|95b7673d-5322-19b...|45.507942|-122.680858|BOARDED|       1.0|    ORIGIN|  false|        2024-04-07|     Y|    DESTINATION|       10293|           

In [36]:
single_legged_journeys.filter(F.col("IS_MAX")=="Y").show()

                                                                                

+-------+-------+------------+--------------------+-------------------------+-------------------+--------------------+---------+-----------+-------+----------+----------+-------+------------------+------+---------------+------------+-----------------+
|LINE_ID|STOP_ID|DIRECTION_ID|          JOURNEY_ID|FARE_CATEGORY_DESCRIPTION|           DATETIME|             CARD_ID| STOP_LAT|   STOP_LON|  EVENT|CONFIDENCE|EVENT_TYPE|IS_LOOP|JOURNEY_START_DATE|IS_MAX|EVENT_TYPE_NEXT|STOP_ID_NEXT|TRIP_STOP_ID_NEXT|
+-------+-------+------------+--------------------+-------------------------+-------------------+--------------------+---------+-----------+-------+----------+----------+-------+------------------+------+---------------+------------+-----------------+
|    100|   8374|           1|000010ad81e49c188...|          Honored Citizen|2024-04-06 09:56:46|9b028307-959b-fec...|45.530146|-122.654335|BOARDED|       1.0|    ORIGIN|  false|        2024-04-06|     Y|    DESTINATION|        8378|           

In [47]:
check = fixed_journeys.filter(
    F.col("CARD_ID").contains("c724fccd-3182-fa1")
    ).select("DATETIME","EVENT","STOP_ID","LINE_ID").withColumn("DATETIME",F.col("DATETIME").cast("string")).toPandas()
check[check["DATETIME"].str.contains("2024-04-09")].sort_values(by = ["DATETIME"])

                                                                                

Unnamed: 0,DATETIME,EVENT,STOP_ID,LINE_ID
61,2024-04-09 00:21:13,ALIGHTED,3452,75
19,2024-04-09 00:21:29,ALIGHTED,3452,75
81,2024-04-09 00:21:29,ALIGHTED,3452,75
75,2024-04-09 00:21:29,ALIGHTED,3452,75
20,2024-04-09 07:49:13,BOARDED,3453,75
62,2024-04-09 07:49:13,BOARDED,3453,75
82,2024-04-09 07:49:13,BOARDED,3453,75
76,2024-04-09 07:49:13,BOARDED,3453,75
83,2024-04-09 07:52:47,BOARDED,3558,75
21,2024-04-09 07:52:47,BOARDED,3558,75
