In [14]:
# Importing dependencies
import pandas as pd
import numpy as np

# Loading the driver stats dataset into a DataFrame
data = pd.read_parquet(path="driver_stats_with_string.parquet")

# Sorting the rows by event timestamps, from oldest to newest
data = data.sort_values(by="event_timestamp")

In [3]:
# Inspecting the dataset's first 25 rows
data.head(25)

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature
1082,2021-04-12 07:00:00+00:00,1003,0.186658,0.24549,971,2021-09-15 18:01:55.403,test
721,2021-04-12 07:00:00+00:00,1004,0.891017,0.118256,154,2021-09-15 18:01:55.403,test
1443,2021-04-12 07:00:00+00:00,1002,0.775499,0.947109,890,2021-09-15 18:01:55.403,test
360,2021-04-12 07:00:00+00:00,1005,0.138263,0.95552,553,2021-09-15 18:01:55.403,test
1804,2021-04-12 07:00:00+00:00,1001,0.701558,0.195824,566,2021-09-15 18:01:55.403,test
722,2021-08-31 18:00:00+00:00,1003,0.494782,0.000316,634,2021-09-15 18:01:55.403,test
1444,2021-08-31 18:00:00+00:00,1001,0.910018,0.580611,666,2021-09-15 18:01:55.403,test
1083,2021-08-31 18:00:00+00:00,1002,0.381206,0.347303,268,2021-09-15 18:01:55.403,test
361,2021-08-31 18:00:00+00:00,1004,0.527224,0.89334,963,2021-09-15 18:01:55.403,test
0,2021-08-31 18:00:00+00:00,1005,0.362754,0.697629,31,2021-09-15 18:01:55.403,test


In [17]:
# Dropping the feature rows for 2021-04-12, 2021-08-31, and after 2021-09-15 00:00:00+00:00
data = data.iloc[35:-85]

In [18]:
# Creating an "entity DataFrame" with timestamps, the driver IDs, and the string feature column
entity_df = data[['event_timestamp', 'driver_id', 'created', 'string_feature']]

In [19]:
# Inspecting the entity DF
entity_df

Unnamed: 0,event_timestamp,driver_id,created,string_feature
1450,2021-09-01 00:00:00+00:00,1001,2021-09-15 18:01:55.403,test
367,2021-09-01 00:00:00+00:00,1004,2021-09-15 18:01:55.403,test
728,2021-09-01 00:00:00+00:00,1003,2021-09-15 18:01:55.403,test
6,2021-09-01 00:00:00+00:00,1005,2021-09-15 18:01:55.403,test
1089,2021-09-01 00:00:00+00:00,1002,2021-09-15 18:01:55.403,test
...,...,...,...,...
1064,2021-09-15 00:00:00+00:00,1003,2021-09-15 18:01:55.403,test
342,2021-09-15 00:00:00+00:00,1005,2021-09-15 18:01:55.403,test
1786,2021-09-15 00:00:00+00:00,1001,2021-09-15 18:01:55.403,test
703,2021-09-15 00:00:00+00:00,1004,2021-09-15 18:01:55.403,test


In [30]:
# Creating the first subset of the dataset
data_df1 = data[['conv_rate']]

In [31]:
# Inspecting the first subset
data_df1

Unnamed: 0,conv_rate
722,0.494782
1444,0.910018
1083,0.381206
361,0.527224
0,0.362754
...,...
720,0.541801
359,0.318631
1442,0.379485
1803,0.812357


In [32]:
# Creating the second subset of the dataset
data_df2 = data[['acc_rate', 'avg_daily_trips']]

In [33]:
# Inspecting the second subset
data_df2

Unnamed: 0,acc_rate,avg_daily_trips
722,0.000316,634
1444,0.580611,666
1083,0.347303,268
361,0.893340,963
0,0.697629,31
...,...,...
720,0.595369,539
359,0.527131,189
1442,0.151377,315
1803,0.840873,714


In [34]:
# Adding the entity DF to each subset DF
data_df1 = pd.concat(objs=[data_df1, entity_df], axis=1)
data_df2 = pd.concat(objs=[data_df2, entity_df], axis=1)

In [35]:
# Inspecting the first subset
data_df1

Unnamed: 0,conv_rate,event_timestamp,driver_id,created,string_feature
722,0.494782,2021-08-31 18:00:00+00:00,1003,2021-09-15 18:01:55.403,test
1444,0.910018,2021-08-31 18:00:00+00:00,1001,2021-09-15 18:01:55.403,test
1083,0.381206,2021-08-31 18:00:00+00:00,1002,2021-09-15 18:01:55.403,test
361,0.527224,2021-08-31 18:00:00+00:00,1004,2021-09-15 18:01:55.403,test
0,0.362754,2021-08-31 18:00:00+00:00,1005,2021-09-15 18:01:55.403,test
...,...,...,...,...,...
720,0.541801,2021-09-15 17:00:00+00:00,1004,2021-09-15 18:01:55.403,test
359,0.318631,2021-09-15 17:00:00+00:00,1005,2021-09-15 18:01:55.403,test
1442,0.379485,2021-09-15 17:00:00+00:00,1002,2021-09-15 18:01:55.403,test
1803,0.812357,2021-09-15 17:00:00+00:00,1001,2021-09-15 18:01:55.403,test


In [36]:
# Inspecting the second subset
data_df2

Unnamed: 0,acc_rate,avg_daily_trips,event_timestamp,driver_id,created,string_feature
722,0.000316,634,2021-08-31 18:00:00+00:00,1003,2021-09-15 18:01:55.403,test
1444,0.580611,666,2021-08-31 18:00:00+00:00,1001,2021-09-15 18:01:55.403,test
1083,0.347303,268,2021-08-31 18:00:00+00:00,1002,2021-09-15 18:01:55.403,test
361,0.893340,963,2021-08-31 18:00:00+00:00,1004,2021-09-15 18:01:55.403,test
0,0.697629,31,2021-08-31 18:00:00+00:00,1005,2021-09-15 18:01:55.403,test
...,...,...,...,...,...,...
720,0.595369,539,2021-09-15 17:00:00+00:00,1004,2021-09-15 18:01:55.403,test
359,0.527131,189,2021-09-15 17:00:00+00:00,1005,2021-09-15 18:01:55.403,test
1442,0.151377,315,2021-09-15 17:00:00+00:00,1002,2021-09-15 18:01:55.403,test
1803,0.840873,714,2021-09-15 17:00:00+00:00,1001,2021-09-15 18:01:55.403,test


In [37]:
data_df1.to_parquet("/driver_stats/data/driver_stats_1.parquet")
data_df2.to_parquet("/driver_stats/data/driver_stats_2.parquet")