In [1]:
from mappymatch import package_root

In [47]:
import pandas as pd

In [61]:
def preprocess_vehicle_data(vehicle_file, gps_file):
    # Specify data types if known, otherwise set low_memory=False
    vehicle_data = pd.read_csv(vehicle_file, header=0, usecols=[0, 2, 3, 8, 12, 15, 16, 18, 19, 20, 21, 22, 23, 24], parse_dates=[[8, 9, 10, 11, 12, 13]], low_memory=False)
    vehicle_data.drop(vehicle_data.index[0], inplace=True)
    cols = ['Time_abs', 'Time', 'AmbAirTemp', 'BarPressure', 'EngineFuel', 'inclination', 'VehicleSpeed', 'VehicleWeight', 'FileIndex']
    vehicle_data.columns = cols

    gps_data = pd.read_csv(gps_file, low_memory=False)
    gps_data.drop(gps_data.index[0], inplace=True)
    gps_data = gps_data.astype(float)

    merged_data = pd.concat([vehicle_data, gps_data], axis=1)
    
    # Downsample to 1 Hz by taking every 10th row
    merged_data = merged_data.iloc[::10, :]
    
    min_latitude, max_latitude = -90, 90
    min_longitude, max_longitude = -180, 180
    valid_gps = merged_data[(merged_data['gps_Latitude'].between(min_latitude, max_latitude)) & 
                            (merged_data['gps_Longitude'].between(min_longitude, max_longitude))]

    # Convert FileIndex to boolean (True if 1, False otherwise)
    
    valid_gps.loc[:, 'Time_abs'] = valid_gps['Time_abs'].apply(lambda x: x if '.' in x else x + '.0')
    valid_gps.loc[:, 'Time_abs'] = pd.to_datetime(valid_gps['Time_abs'], format='%Y %m %d %H %M %S.%f', errors='coerce')
    valid_gps.loc[:, ['Time', 'AmbAirTemp', 'BarPressure', 'EngineFuel', 'inclination', 'VehicleSpeed', 'VehicleWeight','FileIndex']] = valid_gps[['Time', 'AmbAirTemp', 'BarPressure', 'EngineFuel', 'inclination', 'VehicleSpeed', 'VehicleWeight','FileIndex']].astype("float")
    valid_gps['FileIndex'] = valid_gps['FileIndex'] == 1
    #     valid_gps.loc[:, 'VehicleWeight'] = valid_gps['VehicleWeight'].apply(lambda x: 12500 if x < 12500 else (36500 if x > 36500 else x))
    

    return valid_gps


In [69]:
def convert_to_trip_based(df, time_gap_threshold=2.0):
    df['time_diff'] = df['Time_abs'].diff().dt.total_seconds()

    # Identify start of a new trip
    df['trip_start'] = df['FileIndex'] | (df['time_diff'] > time_gap_threshold)
    df['trip_id'] = df['trip_start'].cumsum()

    # Aggregate data for each trip
    trip_data = df.groupby('trip_id').agg({
        'Time_abs': ['first', 'last', lambda x: (x.max() - x.min()).total_seconds()],
        'gps_Latitude': lambda x: list(x),
        'gps_Longitude': lambda x: list(x),
        'gps_Altitude': list,
        'VehicleSpeed': list,
        'VehicleWeight': 'first',
        'EngineFuel': lambda x: sum(x) / 3600,  # Converting fuel rate from l/h to total liters
        'AmbAirTemp': 'first'
    })

    # Renaming columns
    trip_data.columns = ['trip_start_time', 'trip_end_time', 'travel_time', 'latitudes', 'longitudes', 'altitude_profile', 'velocity_profile', 'weight', 'total_fuel', 'ambTemperature']

    # Combine latitude and longitude into trajectory
    trip_data['trajectory'] = trip_data.apply(lambda row: list(zip(row['latitudes'], row['longitudes'])), axis=1)

    # Drop the separate latitude and longitude columns
    trip_data.drop(['latitudes', 'longitudes'], axis=1, inplace=True)

    return trip_data

In [70]:
# Example usage
vehicle_file = "data/exampleOBD/TL5-231_2021W6.csv"
gps_file = "data/exampleOBD/TL5-231_2021W6_gps.csv"

# Preprocess and downsample data
processed_data = preprocess_vehicle_data(vehicle_file, gps_file)

# Convert to trip-based data
trip_based_data = convert_to_trip_based(processed_data)
print(trip_based_data.head())


  vehicle_data = pd.read_csv(vehicle_file, header=0, usecols=[0, 2, 3, 8, 12, 15, 16, 18, 19, 20, 21, 22, 23, 24], parse_dates=[[8, 9, 10, 11, 12, 13]], low_memory=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_gps['FileIndex'] = valid_gps['FileIndex'] == 1


                    trip_start_time               trip_end_time  travel_time  \
trip_id                                                                        
0               2021-02-01 02:27:49         2021-02-01 04:02:59       5710.0   
1               2021-02-01 04:03:00         2021-02-01 04:03:46         46.0   
2        2021-02-01 04:14:48.800000  2021-02-01 10:46:00.800000      23472.0   
3        2021-02-01 12:00:00.200000  2021-02-01 14:15:34.200000       8134.0   
4        2021-02-01 14:24:23.400000  2021-02-01 15:15:51.400000       3088.0   

                                          altitude_profile  \
trip_id                                                      
0        [530.5, 532.4, 532.4, 532.1, 532.0, 532.0, 532...   
1        [379.5, 379.8, 380.9, 381.3, 381.4, 381.4, 381...   
2        [376.0, 376.0, 376.1, 376.1, 376.1, 376.1, 376...   
3        [448.2, 446.9, 444.6, 443.7, 443.1, 443.3, 443...   
4        [332.2, 332.0, 331.8, 331.8, 331.8, 331.7, 331...   

    

In [71]:
processed_data

Unnamed: 0,Time_abs,Time,AmbAirTemp,BarPressure,EngineFuel,inclination,VehicleSpeed,VehicleWeight,FileIndex,gps_Altitude,gps_Latitude,gps_Longitude,time_diff,trip_start,trip_id
13101,2021-02-01 02:27:49,1310.0,-0.09375,98.0,3.65,-2.205,3.0742,8000.0,False,530.5,44.980090,-93.218245,,False,0
13111,2021-02-01 02:27:50,1311.0,-0.09375,98.0,2.95,-1.851,2.957,8000.0,False,532.4,44.979935,-93.218257,1.0,False,0
13121,2021-02-01 02:27:51,1312.0,-0.09375,98.0,2.8,-1.99,0.0,8000.0,False,532.4,44.979863,-93.218250,1.0,False,0
13131,2021-02-01 02:27:52,1313.0,-0.09375,98.0,2.9,-1.254,0.0,8000.0,False,532.1,44.979846,-93.218225,1.0,False,0
13141,2021-02-01 02:27:53,1314.0,-0.09375,98.0,5.35,-0.954,0.0,8000.0,False,532.0,44.979833,-93.218214,1.0,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1360681,2021-02-05 09:39:11.900000,136070.0,-11.594,96.0,11.65,1.66,0.0,8000.0,False,288.4,44.982068,-93.219102,1.0,False,29
1360691,2021-02-05 09:39:12.900000,136070.0,-11.625,96.0,8.85,1.662,0.0,8000.0,False,288.2,44.982068,-93.219104,1.0,False,29
1360701,2021-02-05 09:39:13.900000,136070.0,-11.625,96.0,11.2,1.68,0.0,8000.0,False,288.0,44.982067,-93.219103,1.0,False,29
1360711,2021-02-05 09:39:14.900000,136070.0,-11.625,96.0,11.7,1.676,0.0,8000.0,False,287.7,44.982068,-93.219104,1.0,False,29


In [74]:
def process_and_save_trip_data(vehicle_file, gps_file, output_csv_file):
    # Preprocess and downsample data
    processed_data = preprocess_vehicle_data(vehicle_file, gps_file)

    # Convert to trip-based data
    trip_based_data = convert_to_trip_based(processed_data)

    # Save trip_data to a CSV file
    trip_based_data.to_csv(output_csv_file, index=False)

    print(f"Trip data saved to {output_csv_file}")

In [75]:
process_and_save_trip_data('data/exampleOBD/TL5-231_2021W6.csv', 'data/exampleOBD/TL5-231_2021W6_gps.csv', 'results/output_trip_data.csv')

  vehicle_data = pd.read_csv(vehicle_file, header=0, usecols=[0, 2, 3, 8, 12, 15, 16, 18, 19, 20, 21, 22, 23, 24], parse_dates=[[8, 9, 10, 11, 12, 13]], low_memory=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_gps['FileIndex'] = valid_gps['FileIndex'] == 1


Trip data saved to results/output_trip_data.csv


In [76]:
import ast

def read_trip_data(csv_file):
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Convert string representations of lists back to lists
    df['trajectory'] = df['trajectory'].apply(ast.literal_eval)
    df['velocity_profile'] = df['velocity_profile'].apply(ast.literal_eval)
    df['altitude_profile'] = df['altitude_profile'].apply(ast.literal_eval)

    return df

# Example usage
trip_data_read = read_trip_data('results/output_trip_data.csv')
print(trip_data_read.head())

              trip_start_time               trip_end_time  travel_time  \
0         2021-02-01 02:27:49         2021-02-01 04:02:59       5710.0   
1         2021-02-01 04:03:00         2021-02-01 04:03:46         46.0   
2  2021-02-01 04:14:48.800000  2021-02-01 10:46:00.800000      23472.0   
3  2021-02-01 12:00:00.200000  2021-02-01 14:15:34.200000       8134.0   
4  2021-02-01 14:24:23.400000  2021-02-01 15:15:51.400000       3088.0   

                                    altitude_profile  \
0  [530.5, 532.4, 532.4, 532.1, 532.0, 532.0, 532...   
1  [379.5, 379.8, 380.9, 381.3, 381.4, 381.4, 381...   
2  [376.0, 376.0, 376.1, 376.1, 376.1, 376.1, 376...   
3  [448.2, 446.9, 444.6, 443.7, 443.1, 443.3, 443...   
4  [332.2, 332.0, 331.8, 331.8, 331.8, 331.7, 331...   

                                    velocity_profile   weight  total_fuel  \
0  [3.0742, 2.957, 0.0, 0.0, 0.0, 1.875, 3.0195, ...   8000.0   52.933806   
1  [34.574, 39.828, 44.84, 48.57, 48.609, 48.73, ...  20000.0   

In [None]:
import os
import re

def process_all_files_in_directory(base_dir):
    # Construct the path for the output folder
    output_base_dir = os.path.join(os.path.dirname(base_dir), 'tripsData (Murphy)')

    # Create the output folder if it does not exist
    if not os.path.exists(output_base_dir):
        os.makedirs(output_base_dir)

    # Walk through all files and subdirectories in the base directory
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            # Check if the file is a vehicle data file
            if re.match(r'TL\d+-\d+_\d+W\d+\.csv', file) and not 'gps' in file:
                vehicle_file = os.path.join(root, file)
                gps_file = vehicle_file.replace('.csv', '_gps.csv')

                # Check if the corresponding GPS file exists
                if os.path.exists(gps_file):
                    # Construct the output file path in the output folder
                    output_file_name = os.path.basename(vehicle_file).replace('.csv', '_trip_data.csv')
                    output_csv_file = os.path.join(output_base_dir, output_file_name)

                    process_and_save_trip_data(vehicle_file, gps_file, output_csv_file)
                    print(f"Processed {vehicle_file} and {gps_file}, saved to {output_csv_file}")
                else:
                    print(f"GPS file not found for {vehicle_file}")

# Example usage
base_directory = "./Baseline Data (Murphy)"
process_all_files_in_directory(base_directory)