<div class="alert alert-block alert-info">
Parquet Statistics

<small>
Version 1.3<br>
Created: 09.09.2024  <br>
William Siegle, Daimler Truck AG
</small>
</div>

In [2]:
# Clear all user-defined names from the kernel namespace without asking for confirmation (-f).
%reset -f

In [3]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pickle
from datetime import datetime
from pathlib import Path
from scipy.signal import savgol_filter
from scipy.stats import zscore, median_abs_deviation
import pyarrow.parquet as pq 

from IPython.core.magic import register_cell_magic
@register_cell_magic
def skip(line, cell):   # cells can be skipped by using '%%skip' in the first line
    """Cell magic that discards the cell body: a cell starting with ``%%skip`` is not executed.

    Both arguments (the magic's argument line and the cell source) are ignored.
    """
    return

In [10]:
# ------------ LOCATE REPOSITORY/DATASTORAGE IN CURRENT SYSTEM ENVIRONMENT  --------------
# Purpose: make the repository root importable and resolve the data directory.
#  - ROOT is the parent of the working directory while IS_NOTEBOOK is True,
#    otherwise the working directory itself.
#  - ROOT is appended to sys.path so that the project-local `data` package can be imported.
#  - DATA_PATH is whatever data.get_data_path() returns; the two prints list the
#    entries matched by '*/' below DATA_PATH and ROOT.
#  - NOTE(review): `global` at module level has no effect, and every re-run of this
#    cell appends ROOT to sys.path again.
import sys, os; from pathlib import Path                                                #|
global ROOT, DATA_PATH, IS_NOTEBOOK; IS_NOTEBOOK = True                                 #|
ROOT = Path('..').resolve() if IS_NOTEBOOK else Path('.').resolve()                      #|
sys.path.append(os.path.abspath(ROOT))                                                  #|
from data import get_data_path  # paths set in "data/__init__.py"                       #|
DATA_PATH = get_data_path()                                                             #|
print(f"{'-'*60}\n{DATA_PATH}:\t{', '.join([_.name for _ in DATA_PATH.glob('*/')])}") #|
print(f"{ROOT}:\t{', '.join([_.name for _ in ROOT.glob('*/')])}")   	                #|
# ----------------------------------------------------------------------------------------

------------------------------------------------------------
C:\Users\SIEGLEW\OneDrive - Daimler Truck\MA\Code\MA-Data:	processed, trips_processed_final, trips_processed_pickles, trips_processed_resampled, y_true
C:\Users\SIEGLEW\OneDrive - Daimler Truck\MA\Code\MA-eR-PINN:	.git, archive, data, project, ref, src, test


In [12]:
def trip_data_info(relative_path):
    """Summarise the trip parquet files found in a data directory.

    File names are expected to follow ``v_id<ID>V<vehicle>_<trip>.parquet``
    (e.g. ``v_id983V14_trip300_2.parquet``). The function prints a short status
    report (number of files, ids, vehicles, trips per vehicle) and returns the
    raw file list together with the trips grouped by vehicle.

    Parameters
    ----------
    relative_path : str or os.PathLike
        Directory to scan. It is joined onto the current working directory, so
        an absolute path is used as is.

    Returns
    -------
    files_list : list of str
        Names of all ``.parquet`` files in the directory, in ``os.listdir`` order.
    trips : dict of str -> list of str
        Maps each vehicle label (e.g. ``"V14"``) to the trip labels found for it
        (e.g. ``"trip300_2"``).

    Raises
    ------
    FileNotFoundError
        If the directory does not exist.
    ValueError
        If a parquet file name does not follow the expected naming scheme.
    """
    from os import listdir, getcwd
    from os.path import isfile, join, exists

    prefix, suffix = "v_", ".parquet"

    # data base directory path:
    data_dir = relative_path
    cwd = getcwd()
    data_path = join(cwd, data_dir)

    if not exists(data_path):
        print("Directory '",data_dir,"' not found!")
        print("Files and directories in '", cwd, "' :") 
        print(listdir(cwd))
        # Raise instead of calling quit(): quit() ends the whole session rather
        # than reporting an error the caller can see and handle.
        raise FileNotFoundError(f"Directory '{data_dir}' not found (working directory: '{cwd}')")

    # create list of all parquet files:
    files_list = [f for f in listdir(data_path) if (isfile(join(data_path, f)) and f.endswith(suffix))]

    id_num_list, V_list = ([], [])
    trips = {}

    for f in files_list:
        # Remove the exact prefix/suffix. str.strip() treats its argument as a
        # *set of characters* and would also eat trailing letters of the trip
        # label (e.g. "trip5_part" -> "trip5").
        stem = f[:-len(suffix)]
        if stem.startswith(prefix):
            stem = stem[len(prefix):]

        x = stem.split("_", 1)          # ["id<ID>V<vehicle>", "<trip>"]
        head = x[0].split("V")          # ["id<ID>", "<vehicle>", ...]
        if len(x) < 2 or len(head) < 2:
            raise ValueError(f"Unexpected parquet file name (expected 'v_id<ID>V<vehicle>_<trip>.parquet'): '{f}'")

        id_num = head[0]
        if id_num.startswith("id"):
            id_num = id_num[len("id"):]
        id_num_list.append(id_num)

        V = "V" + head[1]
        V_list.append(V)

        trips.setdefault(V, []).append(x[1])

    vehicles = set(V_list)
    ids = set(id_num_list)

    trip_counts = {V: len(vehicle_trips) for V, vehicle_trips in trips.items()}

    # Output results:
    print("Volts Database Status:")
    print("-"*50)
    print("Directory:",data_path)
    print("Files:",len(files_list),"parquet files found.")
    print("Unique id values: ", ids)
    print("Total number of vehicles: ", len(vehicles))
    print("Total number of complete trips: ", sum(trip_counts.values()))
    print("-"*50)
    print("Trips per vehicle:")
    for V in trip_counts.keys():
        print("     ",V,": ",trip_counts[V], "complete trips")
    print("-"*50)
    
    return files_list, trips

In [20]:
# Scan the resampled-trip folder below DATA_PATH; trip_data_info prints a status report and
# returns the parquet file names plus the trip labels grouped by vehicle.
parquet_folder = Path(DATA_PATH, 'trips_processed_resampled')
all_files, trip_by_vehicle = trip_data_info(parquet_folder)

Volts Database Status:
--------------------------------------------------
Directory: C:\Users\SIEGLEW\OneDrive - Daimler Truck\MA\Code\MA-Data\trips_processed_resampled
Files: 2951 parquet files found.
Unique id values:  {'983'}
Total number of vehicles:  15
Total number of complete trips:  2951
--------------------------------------------------
Trips per vehicle:
      V101 :  213 complete trips
      V102 :  3 complete trips
      V10 :  2 complete trips
      V11 :  52 complete trips
      V12 :  261 complete trips
      V13 :  302 complete trips
      V14 :  575 complete trips
      V15 :  198 complete trips
      V16 :  304 complete trips
      V17 :  262 complete trips
      V18 :  301 complete trips
      V19 :  146 complete trips
      V1 :  141 complete trips
      V2 :  12 complete trips
      V4 :  179 complete trips
--------------------------------------------------


In [21]:
# Collect the number of rows of every trip file from the parquet metadata.
data_dir = parquet_folder
cwd = os.getcwd()
data_path = os.path.join(cwd, data_dir)   # an absolute parquet_folder is used as is

if not os.path.exists(data_path):
    # Fail here: otherwise `data_path` would be undefined (fresh kernel) or silently
    # keep a stale value from an earlier run of this cell.
    raise FileNotFoundError(f"Parquet folder not found: {data_path}")

# Get the number of rows per trip file
trip_size = [pq.read_metadata(os.path.join(data_path, f)).num_rows for f in all_files]

# Optional sanity check kept from the original cell (the expected column count of 114
# is taken from that commented-out check — TODO confirm it is still valid):
#   for f in all_files:
#       if pq.read_metadata(os.path.join(data_path, f)).num_columns != 114:
#           print(f)

# One row per file (index = file name); building the frame with its column name
# directly also works for an empty file list.
trips_sizes = pd.DataFrame({'trip_size': trip_size}, index=all_files)


In [19]:
# Trips ordered from most to fewest rows.
# NOTE(review): this cell carries execution count In [19], although `trips_sizes` is built in the
# In [21] cell above, and the identical In [22] cell right below shows different numbers — the
# saved output here looks stale. Re-run it or remove this duplicate cell.
trips_sizes.sort_values(by='trip_size', ascending=False)

Unnamed: 0,trip_size
v_id983V16_trip216.parquet,513235
v_id983V14_trip300_2.parquet,484381
v_id983V14_trip364_2.parquet,464288
v_id983V14_trip363_2.parquet,462661
v_id983V14_trip358_2.parquet,440238
...,...
v_id983V101_trip74.parquet,1288
v_id983V17_trip241.parquet,1281
v_id983V101_trip19.parquet,1277
v_id983V4_trip128.parquet,1238


In [22]:
# Display the trips ordered by row count, largest first. sort_values returns a sorted copy,
# so `trips_sizes` itself keeps its order.
trips_sizes.sort_values(by='trip_size', ascending=False)

Unnamed: 0,trip_size
v_id983V14_trip300_2.parquet,71114
v_id983V14_trip364_2.parquet,53322
v_id983V16_trip216.parquet,52616
v_id983V14_trip322.parquet,51167
v_id983V14_trip363_2.parquet,50486
...,...
v_id983V13_trip167.parquet,137
v_id983V101_trip19.parquet,135
v_id983V101_trip74.parquet,131
v_id983V101_trip166.parquet,131


In [18]:
# Read through all trips and extract the SOC difference
# (last valid SOC value minus first valid SOC value of each trip).
soc_column = "hv_bat_soc_cval_bms1"
trip_soc = []
for f in all_files:
    # `parquet_folder` is a pathlib.Path, so the file path is built with the Path "/" operator;
    # string concatenation (Path + str) raises a TypeError.
    soc = pd.read_parquet(Path(parquet_folder) / f, engine='fastparquet', columns=[soc_column])[soc_column].dropna()
    # Considering first and last non-NaN value only! Selecting them by position after dropna()
    # works for any index type. A trip without a single valid SOC sample yields NaN, so the
    # completeness check in the next cell can report it instead of this loop crashing.
    d_soc = soc.iloc[-1] - soc.iloc[0] if len(soc) else np.nan
    trip_soc.append(d_soc)

# One row per trip file: SOC difference plus the trip length (aligned on the file-name index).
all_trips_soc = pd.DataFrame({"soc_diff": trip_soc}, index=all_files)
all_trips_soc["trip_size"] = trips_sizes["trip_size"]

trips_sizes = trips_sizes.sort_values(by=['trip_size'])

In [19]:
# check if SOC-List is complete:
failed_soc = all_trips_soc[all_trips_soc.isnull().any(axis=1)]   # rows with at least one missing value
if not failed_soc.empty:
    print('SOC calculation failed for theses files:')
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the affected rows are printed explicitly.
    print(failed_soc.to_string())

In [20]:
# Persist the summary objects as one pickled list, in this order:
#   [all_files, all_trips_soc, trips_sizes, trip_by_vehicle]
# The file name is relative, so 'Volts.pickle' is written to the current working directory
# and overwritten if it already exists.
with open('Volts.pickle', 'wb') as handle:
    pickle.dump([all_files, all_trips_soc, trips_sizes, trip_by_vehicle], handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:

# Writes the same four summary objects as the 'Volts.pickle' cell to a hard-coded absolute path.
# NOTE(review): this cell has no execution count and uses a POSIX home-directory path, while the
# saved outputs of this notebook show Windows paths — it will not work unchanged on that machine.
# NOTE(review): the target name 'V19_trip43.pickle' looks like a per-trip file, but the payload is
# the whole-database summary; opening with 'wb' overwrites an existing file — confirm the intended
# destination before running.
with open('/home/sieglew/data/Trips_processed/V19_trip43.pickle', 'wb') as handle:
    pickle.dump([all_files, all_trips_soc, trips_sizes, trip_by_vehicle], handle, protocol=pickle.HIGHEST_PROTOCOL)