In [15]:
# Importing dependencies
import pandas as pd
import numpy as np

# Reading the driver stats parquet file and ordering its rows chronologically
# by event timestamp (oldest event first) in a single chained expression
data = (
    pd.read_parquet("driver_stats/data/driver_stats_with_string.parquet")
    .sort_values("event_timestamp")
)

In [16]:
# Reporting the time span covered by the data: the oldest and the newest event timestamps
event_times = data["event_timestamp"]
print(f"Earliest event timestamp is {event_times.min()}")
print(f"Latest event timestamp is {event_times.max()}")

Earliest event timestamp is 2021-04-12 07:00:00+00:00
Latest event timestamp is 2021-09-15 17:00:00+00:00


In [17]:
# Displaying the 25 leading feature rows (a positional slice from the top of the frame)
data.iloc[:25]

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature
1082,2021-04-12 07:00:00+00:00,1003,0.186658,0.24549,971,2021-09-15 18:01:55.403,test
721,2021-04-12 07:00:00+00:00,1004,0.891017,0.118256,154,2021-09-15 18:01:55.403,test
1443,2021-04-12 07:00:00+00:00,1002,0.775499,0.947109,890,2021-09-15 18:01:55.403,test
360,2021-04-12 07:00:00+00:00,1005,0.138263,0.95552,553,2021-09-15 18:01:55.403,test
1804,2021-04-12 07:00:00+00:00,1001,0.701558,0.195824,566,2021-09-15 18:01:55.403,test
722,2021-08-31 18:00:00+00:00,1003,0.494782,0.000316,634,2021-09-15 18:01:55.403,test
1444,2021-08-31 18:00:00+00:00,1001,0.910018,0.580611,666,2021-09-15 18:01:55.403,test
1083,2021-08-31 18:00:00+00:00,1002,0.381206,0.347303,268,2021-09-15 18:01:55.403,test
361,2021-08-31 18:00:00+00:00,1004,0.527224,0.89334,963,2021-09-15 18:01:55.403,test
0,2021-08-31 18:00:00+00:00,1005,0.362754,0.697629,31,2021-09-15 18:01:55.403,test


In [18]:
# Displaying the 25 trailing feature rows (a positional slice from the bottom of the frame)
data.iloc[-25:]

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature
1438,2021-09-15 13:00:00+00:00,1002,0.187022,0.636303,698,2021-09-15 18:01:55.403,test
1799,2021-09-15 13:00:00+00:00,1001,0.217096,0.16114,800,2021-09-15 18:01:55.403,test
355,2021-09-15 13:00:00+00:00,1005,0.109662,0.948472,863,2021-09-15 18:01:55.403,test
1077,2021-09-15 13:00:00+00:00,1003,0.874185,0.446985,255,2021-09-15 18:01:55.403,test
716,2021-09-15 13:00:00+00:00,1004,0.692943,0.251203,289,2021-09-15 18:01:55.403,test
1078,2021-09-15 14:00:00+00:00,1003,0.580215,0.259824,379,2021-09-15 18:01:55.403,test
717,2021-09-15 14:00:00+00:00,1004,0.285921,0.871101,582,2021-09-15 18:01:55.403,test
1439,2021-09-15 14:00:00+00:00,1002,0.481214,0.767431,55,2021-09-15 18:01:55.403,test
1800,2021-09-15 14:00:00+00:00,1001,0.778216,0.06967,151,2021-09-15 18:01:55.403,test
356,2021-09-15 14:00:00+00:00,1005,0.287409,0.477578,354,2021-09-15 18:01:55.403,test


In [19]:
# Collecting the distinct driver IDs (in order of first appearance) and printing them
unique_ids = data["driver_id"].unique()
print(f"Unique driver IDs are {np.array2string(unique_ids)}")

Unique driver IDs are [1003 1004 1002 1005 1001]


In [20]:
# Building a per-driver, chronologically ordered copy of the data.
# sort_values without inplace=True returns a new DataFrame, so `data` itself is
# neither re-ordered nor given an extra column by this cell.
df2 = data.sort_values(["driver_id", "event_timestamp"])

# Time elapsed since the previous event of the same driver
# (NaT for the first event of each driver, which has no predecessor)
df2["diffs"] = df2.groupby("driver_id")["event_timestamp"].diff()

In [21]:
# Peeking at the first five per-driver time differences
df2["diffs"].iloc[:5]

1804                 NaT
1444   141 days 11:00:00
1445     0 days 01:00:00
1446     0 days 01:00:00
1447     0 days 01:00:00
Name: diffs, dtype: timedelta64[ns]

In [22]:
# Reporting every row position at which the time difference to the previous
# event of the same driver is not exactly one hour.
# The comparison is done once on the whole column instead of materialising each
# row with .iloc inside a Python loop; NaT entries compare as "not equal" and
# are therefore reported too, exactly as in a row-by-row check.
is_not_one_hour = (df2["diffs"] != pd.Timedelta("0 days 01:00:00")).to_numpy()
for i in np.flatnonzero(is_not_one_hour):
    print(f"Timedelta is not one hour for index {i} and its previous index")

Timedelta is not one hour for index 0 and its previous index
Timedelta is not one hour for index 1 and its previous index
Timedelta is not one hour for index 361 and its previous index
Timedelta is not one hour for index 362 and its previous index
Timedelta is not one hour for index 722 and its previous index
Timedelta is not one hour for index 723 and its previous index
Timedelta is not one hour for index 904 and its previous index
Timedelta is not one hour for index 905 and its previous index
Timedelta is not one hour for index 1085 and its previous index
Timedelta is not one hour for index 1086 and its previous index
Timedelta is not one hour for index 1446 and its previous index
Timedelta is not one hour for index 1447 and its previous index


In [23]:
# Row position 0 has no time difference (NaT) because it is the first event of its driver
# Row position 1 differs because the timestamps jump from 2021-04-12 to 2021-08-31
# .iloc makes it explicit that the slice is by row position, not by index label
df2.iloc[:2]

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature,diffs
1804,2021-04-12 07:00:00+00:00,1001,0.701558,0.195824,566,2021-09-15 18:01:55.403,test,NaT
1444,2021-08-31 18:00:00+00:00,1001,0.910018,0.580611,666,2021-09-15 18:01:55.403,test,141 days 11:00:00


In [24]:
# Row position 361 has no time difference (NaT) because the previous row belongs to a different driver ID
# Row position 362 differs because the timestamps jump from 2021-04-12 to 2021-08-31
# .iloc makes it explicit that the slice is by row position, not by index label
df2.iloc[360:363]

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature,diffs
1803,2021-09-15 17:00:00+00:00,1001,0.812357,0.840873,714,2021-09-15 18:01:55.403,test,0 days 01:00:00
1443,2021-04-12 07:00:00+00:00,1002,0.775499,0.947109,890,2021-09-15 18:01:55.403,test,NaT
1083,2021-08-31 18:00:00+00:00,1002,0.381206,0.347303,268,2021-09-15 18:01:55.403,test,141 days 11:00:00


In [25]:
# Row position 722 has no time difference (NaT) because the previous row belongs to a different driver ID
# Row position 723 differs because the timestamps jump from 2021-04-12 to 2021-08-31
# .iloc makes it explicit that the slice is by row position, not by index label
df2.iloc[721:724]

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature,diffs
1442,2021-09-15 17:00:00+00:00,1002,0.379485,0.151377,315,2021-09-15 18:01:55.403,test,0 days 01:00:00
1082,2021-04-12 07:00:00+00:00,1003,0.186658,0.24549,971,2021-09-15 18:01:55.403,test,NaT
722,2021-08-31 18:00:00+00:00,1003,0.494782,0.000316,634,2021-09-15 18:01:55.403,test,141 days 11:00:00


In [26]:
# Row positions 904 and 905 have a zero time difference because they are duplicates of row position 903
# .iloc makes it explicit that the slice is by row position, not by index label
df2.iloc[903:906]

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature,diffs
1806,2021-09-08 06:00:00+00:00,1003,0.084715,0.615489,523,2021-09-15 18:01:55.403,test,0 days 01:00:00
1805,2021-09-08 06:00:00+00:00,1003,0.084715,0.615489,523,2021-09-15 18:01:55.403,test,0 days 00:00:00
902,2021-09-08 06:00:00+00:00,1003,0.084715,0.615489,523,2021-09-15 18:01:55.403,test,0 days 00:00:00


In [27]:
# Row position 1085 has no time difference (NaT) because the previous row belongs to a different driver ID
# Row position 1086 differs because the timestamps jump from 2021-04-12 to 2021-08-31
# .iloc makes it explicit that the slice is by row position, not by index label
df2.iloc[1084:1087]

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature,diffs
1081,2021-09-15 17:00:00+00:00,1003,0.072028,0.032925,41,2021-09-15 18:01:55.403,test,0 days 01:00:00
721,2021-04-12 07:00:00+00:00,1004,0.891017,0.118256,154,2021-09-15 18:01:55.403,test,NaT
361,2021-08-31 18:00:00+00:00,1004,0.527224,0.89334,963,2021-09-15 18:01:55.403,test,141 days 11:00:00


In [28]:
# Row position 1446 has no time difference (NaT) because the previous row belongs to a different driver ID
# Row position 1447 differs because the timestamps jump from 2021-04-12 to 2021-08-31
# Selecting the three rows around the driver boundary by position, keeping all columns
df2.iloc[1445:1448, :]

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature,diffs
720,2021-09-15 17:00:00+00:00,1004,0.541801,0.595369,539,2021-09-15 18:01:55.403,test,0 days 01:00:00
360,2021-04-12 07:00:00+00:00,1005,0.138263,0.95552,553,2021-09-15 18:01:55.403,test,NaT
0,2021-08-31 18:00:00+00:00,1005,0.362754,0.697629,31,2021-09-15 18:01:55.403,test,141 days 11:00:00
