<div class="alert alert-block alert-info">
Parquet Statistics

<small>
Version 1.3<br>
Created: 09.09.2024  <br>
William Siegle, Daimler Truck AG
</small>
</div>

In [12]:
# Clear all user-defined names from the interactive namespace; -f skips the
# confirmation prompt, so the cells below do not depend on leftover kernel state.
%reset -f

In [13]:
# Specify Data Location
# Folder holding the trip parquet files. It is passed to trip_data_info() and is
# used to build the per-file paths in the cells further down.
parquet_folder = '/home/sieglew/data/processed'

In [14]:
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq 
import pickle

In [15]:
def trip_data_info(relative_path):
    """Summarise the trip parquet files found in a data directory.

    Every ``*.parquet`` file name is split into an id, a vehicle label and a
    trip name, a status report is printed, and the trips are grouped by vehicle.
    The parsing expects names of the form ``[v_]id<ID>V<vehicle>_<trip>.parquet``
    (layout inferred from the parsing logic -- TODO confirm against the export
    that writes these files).

    Parameters
    ----------
    relative_path : str
        Directory containing the parquet files. It is joined onto the current
        working directory; an absolute path is therefore used unchanged.

    Returns
    -------
    files_list : list of str
        Names (not full paths) of all parquet files in the directory, sorted.
    trips : dict of str -> list of str
        Maps each vehicle label (e.g. ``"V17"``) to the trip names found for it.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist.
    ValueError
        If a parquet file name does not contain the ``V`` and ``_`` separators
        the parser relies on.
    """
    from os import listdir, getcwd
    from os.path import isfile, join, exists

    suffix = ".parquet"
    prefix = "v_"
    id_tag = "id"

    # data base directory path:
    data_dir = relative_path
    cwd = getcwd()
    data_path = join(cwd, data_dir)

    if not exists(data_path):
        print("Directory '",data_dir,"' not found!")
        print("Files and directories in '", cwd, "' :")
        print(listdir(cwd))
        # Raise instead of quit(): quit() would terminate the interpreter/kernel
        # from inside a helper function.
        raise FileNotFoundError("Directory not found: " + str(data_path))

    # create list of all parquet files (sorted, because listdir order is arbitrary):
    files_list = sorted(
        f for f in listdir(data_path)
        if isfile(join(data_path, f)) and f.endswith(suffix)
    )

    ids = set()
    vehicles = set()
    trips = {}

    for file_name in files_list:
        # Remove the exact suffix/prefix. str.strip() would treat its argument as
        # a set of characters and could also eat the end of the trip name.
        stem = file_name[:-len(suffix)]
        if stem.startswith(prefix):
            stem = stem[len(prefix):]

        head, underscore, trip = stem.partition("_")
        id_part, v_marker, vehicle_number = head.partition("V")
        if not (underscore and v_marker and trip):
            raise ValueError("Unexpected parquet file name: " + file_name)

        if id_part.startswith(id_tag):
            id_part = id_part[len(id_tag):]
        ids.add(id_part)

        vehicle = "V" + vehicle_number
        vehicles.add(vehicle)
        trips.setdefault(vehicle, []).append(trip)

    trip_counts = {vehicle: len(names) for vehicle, names in trips.items()}

    # Output results:
    print("Volts Database Status:")
    print("-"*50)
    print("Directory:",data_path)
    print("Files:",len(files_list),"parquet files found.")
    print("Unique id values: ", ids)
    print("Total number of vehicles: ", len(vehicles))
    print("Total number of complete trips: ", sum(trip_counts.values()))
    print("-"*50)
    print("Trips per vehicle:")
    for vehicle, count in trip_counts.items():
        print("     ",vehicle,": ",count, "complete trips")
    print("-"*50)

    return files_list, trips

In [16]:
# Scan the data folder: prints the status report shown below and returns the
# parquet file names together with the trip names grouped by vehicle.
all_files, trip_by_vehicle = trip_data_info(parquet_folder)

Volts Database Status:
--------------------------------------------------
Directory: /home/sieglew/data/processed
Files: 3197 parquet files found.
Unique id values:  {'983'}
Total number of vehicles:  15
Total number of complete trips:  3197
--------------------------------------------------
Trips per vehicle:
      V17 :  276 complete trips
      V14 :  578 complete trips
      V19 :  158 complete trips
      V13 :  310 complete trips
      V18 :  303 complete trips
      V15 :  199 complete trips
      V101 :  222 complete trips
      V1 :  183 complete trips
      V4 :  186 complete trips
      V12 :  262 complete trips
      V16 :  333 complete trips
      V11 :  59 complete trips
      V2 :  57 complete trips
      V10 :  68 complete trips
      V102 :  3 complete trips
--------------------------------------------------


In [17]:
# Resolve the data directory against the current working directory
# (an absolute `parquet_folder` is used unchanged by os.path.join).
data_dir = parquet_folder
cwd = os.getcwd()
data_path = os.path.join(cwd, data_dir)

if not os.path.exists(data_path):
    # Fail here with a clear message; previously `data_path` was simply left
    # undefined and the loop below died with a NameError.
    raise FileNotFoundError("Data directory not found: " + str(data_path))

# Number of rows of every trip, taken from the parquet file metadata so the
# column data itself does not have to be loaded.
trip_size = [
    pq.read_metadata(os.path.join(data_path, f)).num_rows
    for f in all_files
]

# Optional sanity check on the column count (kept disabled):
#for f in all_files:
#    if pq.read_metadata(os.path.join(data_path,f)).num_columns != 114:
#        print(f)

# One row per file, indexed by file name. Naming the column at construction
# also works when `all_files` is empty.
trips_sizes = pd.DataFrame({'trip_size': trip_size}, index=all_files)


In [18]:
# Read through all trips and extract the SOC difference between the first and
# the last non-NaN sample of each trip.
soc_column = "hv_bat_soc_cval_bms1"

trip_soc = []
for f in all_files:
    soc = pd.read_parquet(os.path.join(parquet_folder, f), engine='fastparquet', columns=[soc_column])
    # Work positionally on the non-NaN samples. first_valid_index() and
    # last_valid_index() return index *labels* (or None when every value is
    # NaN), so passing them to .iloc is only correct for a default RangeIndex.
    valid_soc = soc[soc_column].dropna()
    if valid_soc.empty:
        # No usable SOC sample: record NaN so the completeness check that
        # follows can report this file instead of the loop crashing here.
        trip_soc.append(np.nan)
    else:
        trip_soc.append(valid_soc.iloc[-1] - valid_soc.iloc[0])

# One row per file: SOC difference plus the trip size (aligned on the file name).
all_trips_soc = pd.DataFrame({"soc_diff": trip_soc}, index=all_files)
all_trips_soc["trip_size"] = trips_sizes["trip_size"]

# Keep the size table ordered from the shortest to the longest trip.
trips_sizes = trips_sizes.sort_values(by=['trip_size'])

In [19]:
# Check if the SOC list is complete: collect every file whose row contains a
# missing value (SOC difference or trip size).
incomplete_trips = all_trips_soc[all_trips_soc.isnull().any(axis=1)]

if not incomplete_trips.empty:
    print('SOC calculation failed for these files:')
    # Print explicitly: a bare expression inside an `if` block is not rendered
    # as cell output, so the affected files were never shown before.
    print(incomplete_trips)

In [20]:
# Persist the summary objects (file list, SOC table, size table, trips per
# vehicle) as one list so they can be reloaded together later.
volts_summary = [all_files, all_trips_soc, trips_sizes, trip_by_vehicle]
with open('Volts.pickle', 'wb') as pickle_file:
    pickle.dump(
        volts_summary,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:

# NOTE(review): this cell has no execution count and writes the same four summary
# objects as the 'Volts.pickle' cell, but to a file named after a single trip
# (V19_trip43). It looks like a copy/paste leftover -- confirm the intended
# payload and target path before running it.
with open('/home/sieglew/data/Trips_processed/V19_trip43.pickle', 'wb') as handle:
    pickle.dump([all_files, all_trips_soc, trips_sizes, trip_by_vehicle], handle, protocol=pickle.HIGHEST_PROTOCOL)