# Dataset Generation using emobpy for Training Data

In [None]:
# Initialize seed
from emobpy.tools import set_seed
set_seed()
from emobpy import Mobility, DataBase, Consumption, HeatInsulation, BEVspecs, Availability, Charging
import random
import os
import pandas as pd
import numpy as np
import re

## Init

In [None]:
# Global simulation settings used by the cells below.
config_folder = 'config_files'         # configuration folder handed to emobpy
num_users = 1000                       # Number of users (EVs)
simulation_hours = 365 * 24            # 8760 hours per profile
time_step = 15 / 60                    # 0.25 h, i.e. 15-minute steps
output_final_csv = "val_ev_data.csv"   # file name of the merged dataset

In [None]:
# Vehicle catalogue: one (brand, model, year) tuple per entry. A random entry is
# drawn for every simulated user and handed to BEVspecs().model(...).
# NOTE(review): the order of the entries matters for reproducibility if the
# random generator is seeded -- presumably set_seed() does that; confirm.
ev_models = [
    ("Audi", "e-tron 55 quattro", 2019),
    ("Audi", "e-tron 55 quattro", 2020),
    ("Audi", "e-tron 55 quattro Premium Plus", 2019),
    ("Audi", "e-tron 55 quattro Prestige", 2019),
    ("Audi", "e-tron Sportback 50 quattro", 2020),
    ("Audi", "e-tron Sportback 55 quattro", 2020),
    
    ("BMW", "i3 22 kWh", 2014),
    ("BMW", "i3 22 kWh", 2015),
    ("BMW", "i3 22 kWh", 2016),
    ("BMW", "i3 33 kWh", 2017),
    ("BMW", "i3 33 kWh", 2018),
    ("BMW", "i3 42 kWh", 2019),
    ("BMW", "i3s 33 kWh", 2018),
    ("BMW", "i3s 42 kWh", 2019),
    ("BMW", "i3s Edition RoadStyle 42 kWh", 2020),
    
    ("Chevrolet", "Bolt EV", 2017),
    ("Chevrolet", "Bolt EV", 2018),
    ("Chevrolet", "Bolt EV", 2019),
    ("Chevrolet", "Bolt EV", 2020),
    ("Chevrolet", "Spark EV", 2014),
    ("Chevrolet", "Spark EV", 2015),
    ("Chevrolet", "Spark EV", 2016),
    
    ("FIAT", "500e", 2013),
    ("FIAT", "500e", 2016),
    ("FIAT", "500e", 2019),
    
    ("Hyundai", "IONIQ Electric 28 kWh", 2019),
    ("Hyundai", "IONIQ Electric 38.3 kWh", 2020),
    ("Hyundai", "KONA Electric 64 kWh", 2019),
    
    ("Jaguar", "I-Pace", 2019),
    ("Jaguar", "I-Pace S EV400 AWD Automatic", 2020),
    
    ("KIA", "Soul EV", 2015),
    ("KIA", "Soul EV", 2016),
    ("KIA", "Soul EV", 2017),
    ("KIA", "Soul EV", 2018),
    ("KIA", "Soul EV", 2019),
    ("KIA", "Soul EV 39 kWh", 2020),
    ("KIA", "Soul EV 64 kWh", 2020),
    ("KIA", "e-Niro 39 kWh", 2019),
    ("KIA", "e-Niro 4", 2020),
    ("KIA", "e-Niro 64 kWh", 2019),
    
    ("MINI", "Cooper SE Level I", 2020),
    ("MINI", "Cooper SE Level II", 2020),
    ("MINI", "Cooper SE Level III", 2020),
    
    ("Nissan", "Leaf S", 2013),
    ("Nissan", "Leaf S", 2014),
    ("Nissan", "Leaf S", 2015),
    ("Nissan", "Leaf S", 2016),
    ("Nissan", "Leaf S", 2017),
    ("Nissan", "Leaf S", 2018),
    ("Nissan", "Leaf S", 2019),
    ("Nissan", "Leaf S Plus", 2019),
    ("Nissan", "Leaf SL", 2011),
    ("Nissan", "Leaf SL", 2012),
    ("Nissan", "Leaf SL", 2013),
    ("Nissan", "Leaf SL", 2014),
    ("Nissan", "Leaf SL", 2015),
    ("Nissan", "Leaf SL", 2016),
    ("Nissan", "Leaf SL", 2017),
    ("Nissan", "Leaf SL", 2018),
    ("Nissan", "Leaf SL", 2019),
    ("Nissan", "Leaf SL Plus", 2019),
    ("Nissan", "Leaf SV", 2011),
    ("Nissan", "Leaf SV", 2012),
    ("Nissan", "Leaf SV", 2013),
    ("Nissan", "Leaf SV", 2014),
    ("Nissan", "Leaf SV", 2015),
    ("Nissan", "Leaf SV", 2016),
    ("Nissan", "Leaf SV", 2017),
    ("Nissan", "Leaf SV", 2018),
    ("Nissan", "Leaf SV", 2019),
    ("Nissan", "Leaf SV Plus", 2019),
    
    ("Renault", "Zoe Q210", 2013),
    ("Renault", "Zoe Q90", 2017),
    ("Renault", "Zoe Q90", 2019),
    
    ("Tesla", "Model 3 Long Range AWD", 2018),
    ("Tesla", "Model 3 Long Range AWD", 2019),
    ("Tesla", "Model 3 Long Range AWD", 2020),
    ("Tesla", "Model 3 Long Range RWD", 2017),
    ("Tesla", "Model 3 Long Range RWD", 2018),
    ("Tesla", "Model 3 Long Range RWD", 2019),
    ("Tesla", "Model 3 Mid Range RWD", 2018),
    ("Tesla", "Model 3 Mid Range RWD", 2019),
    ("Tesla", "Model 3 Performance AWD", 2018),
    ("Tesla", "Model 3 Performance AWD", 2019),
    ("Tesla", "Model 3 Performance AWD", 2020),
    ("Tesla", "Model 3 Standard Range Plus RWD", 2019),
    ("Tesla", "Model 3 Standard Range Plus RWD", 2020),
    
    ("Volkswagen", "e-Golf", 2014),
    ("Volkswagen", "e-Golf Limited Edition", 2015),
    ("Volkswagen", "e-Golf Limited Edition", 2017),
    ("Volkswagen", "e-Golf SE", 2016),
    ("Volkswagen", "e-Golf SE", 2017),
    ("Volkswagen", "e-Golf SE", 2019),
    ("Volkswagen", "e-Golf SE", 2020),
    ("Volkswagen", "e-Golf SEL Premium", 2015),
    ("Volkswagen", "e-Golf SEL Premium", 2016),
    ("Volkswagen", "e-Golf SEL Premium", 2017),
    ("Volkswagen", "ID.3", 2020),
]

# Charging-infrastructure scenario handed to Availability.set_scenario().
#
# prob_charging_point: for every location / trip purpose, the probability of
#   each charging-station type being found there.
#     - home: a (personal) charger is always found (assumption).
#     - workplace: a charging station is always available (assumption).
#     - driving: fast charging gets a very low probability so that it only
#       happens on very long trips (assumption).
# capacity_charging_point: nominal power rating of each station type in kW;
#   'none' is a dummy station with zero power.
station_distribution = dict(
    prob_charging_point=dict(
        errands=dict(public=0.5, none=0.5),
        escort=dict(public=0.5, none=0.5),
        leisure=dict(public=0.5, none=0.5),
        shopping=dict(public=0.5, none=0.5),
        home=dict(public=0.0, home=1.0, none=0.0),
        workplace=dict(public=0.1, workplace=0.9, none=0.0),
        driving=dict(none=0.99, fast75=0.005, fast150=0.005),
    ),
    capacity_charging_point=dict(
        public=22,
        home=3.7,
        workplace=11,
        none=0,
        fast75=75,
        fast150=150,
    ),
)

In [None]:
# Candidate mobility profiles. One is drawn at random for every user and used
# both as the Mobility category and as the rule key.
rules = ["user_defined","commuter"]

In [None]:
# Create the working folders up front (no error if they already exist):
# emobpy profiles, per-user time series and per-user session tables.
for folder in ("db", "users_csv", "final_csv"):
    os.makedirs(folder, exist_ok=True)

# emobpy database object pointing at the 'db' folder.
DB = DataBase('db')

## Profile generation

In [None]:
# Simulate every user end to end with emobpy:
#   mobility -> consumption -> grid availability -> charging.
# Each stage is saved into the 'db' folder, and the charging time series of
# each user is exported to users_csv/EV_<i>_charging.csv.
first_user_index = 174  # user ids are numbered EV_174, EV_175, ...

# NOTE(review): DB.update() followed by DB.loadfiles() runs after every stage.
# If loadfiles() re-reads every stored profile, the cost grows with the number
# of users already simulated -- confirm against the emobpy DataBase docs.
for i in range(first_user_index, num_users + first_user_index):

    # Mobility
    # Use the configured folder instead of repeating the literal path.
    m = Mobility(config_folder=config_folder)
    rule = random.choice(rules)
    m.set_params(
        name_prefix=f"EV_{i}",
        total_hours=simulation_hours,
        time_step_in_hrs=time_step,  # 15 minutes
        category=rule,
        reference_date="01/01/2019",
    )
    m.set_stats(
        stat_ntrip_path="TripsPerDay.csv",
        stat_dest_path="DepartureDestinationTrip.csv",
        stat_km_duration_path="DistanceDurationTrip.csv",
    )
    m.set_rules(rule_key=rule)
    m.run()
    m.save_profile(folder="db")

    DB.update()
    DB.loadfiles()

    # Consumption
    mname = m.name
    HI = HeatInsulation(True)
    BEVS = BEVspecs()
    EV_chosen = BEVS.model(random.choice(ev_models))  # random vehicle for this user
    c = Consumption(mname, EV_chosen)
    c.load_setting_mobility(DB)
    # NOTE(review): weather_year (2016) differs from the year of the mobility
    # reference_date (2019) -- confirm this is intended.
    c.run(
        heat_insulation=HI,
        weather_country='DE',
        weather_year=2016,
        passenger_mass=75,                   # kg
        passenger_sensible_heat=70,          # W
        passenger_nr=1.5,                    # Passengers per vehicle including driver
        air_cabin_heat_transfer_coef=20,     # W/(m2K). Interior walls
        air_flow=0.02,                       # m3/s. Ventilation
        driving_cycle_type='WLTC',           # Two options "WLTC" or "EPA"
        road_type=0,                         # For rolling resistance, Zero represents a new road.
        road_slope=0,
    )
    c.save_profile('db')

    DB.update()
    DB.loadfiles()

    # Grid availability
    cname = c.name
    ga = Availability(cname, DB)
    ga.set_scenario(station_distribution)
    ga.run()
    ga.save_profile('db')

    DB.update()
    DB.loadfiles()

    # Grid electricity demand
    aname = ga.name
    ged = Charging(aname)
    ged.load_scenario(DB)
    ged.set_sub_scenario('immediate')
    ged.run()
    ged.save_profile('db')

    # Per-user charging time series consumed by the dataset-creation step.
    ged.timeseries.to_csv(f"users_csv/EV_{i}_charging.csv")
    DB.update()
    DB.loadfiles()

## Dataset creation

In [None]:
# Turn every per-user charging time series (users_csv/*.csv) into a table with
# one row per charging session and write it to
# final_csv/<user>_charging_sessions.csv.
#
# A session is a maximal run of consecutive rows with charging_cap > 0.
# Columns added per session:
#   connected_duration  hours between the first and the last row of the session
#   total_energy_*      sum of charge_* x elapsed hours over the session
#   charging_duration   hours spent on rows with charge_battery > 1e-12
#   charging_cap/place  maximum charging_cap / first charging_point of the session
#   arrival/departure_SoC  actual_soc at the plug-in / plug-out row
#   HBS, CBS, DBS       hours, summed 'consumption' and summed 'distance'
#                       between the previous plug-out and this plug-in
user_file_pattern = re.compile(r'(EV_\d+)__?charging\.csv')

files = os.listdir('users_csv/')
files = [f for f in files if f.endswith('.csv')]

for file in files:
    match = user_file_pattern.search(file)
    if match is None:
        # The user id names the output file; skip unrelated CSVs now instead
        # of failing on match.group(1) after the whole file was processed.
        continue

    file_path1 = os.path.join('users_csv/', file)
    df1 = pd.read_csv(file_path1, parse_dates=["date"])

    driving_df = df1[df1['state'] == 'driving']
    charging_ts = df1.copy()

    # A session starts on a row with charging capacity that follows a row
    # without any; the first row of the file counts as following "no capacity".
    is_connected = charging_ts['charging_cap'] > 0
    prev_cap = charging_ts['charging_cap'].shift(1).fillna(0)
    charging_ts['session_id'] = (is_connected & (prev_cap == 0)).cumsum()

    charging_sessions = charging_ts[is_connected].groupby(['session_id']).agg(
        plug_in_datetime=('date', 'first'),
        plug_out_datetime=('date', 'last')
    ).reset_index()
    charging_sessions['connected_duration'] = (
        charging_sessions['plug_out_datetime'] - charging_sessions['plug_in_datetime']
    ).dt.total_seconds() / 3600

    # Ensure data is sorted by date
    charging_ts = charging_ts.sort_values(by=['session_id', 'date'])

    # Compute time difference in hours between consecutive timestamps.
    # NOTE(review): the first row of every session_id gets time_diff == 0, so
    # the plug-in step contributes no energy below -- confirm this is intended.
    charging_ts['time_diff'] = charging_ts.groupby('session_id')['date'].diff().dt.total_seconds().fillna(0) / 3600

    # Compute energy charged (Power x Time)
    charging_ts['energy_battery'] = charging_ts['charge_battery'] * charging_ts['time_diff']
    charging_ts['energy_grid'] = charging_ts['charge_grid'] * charging_ts['time_diff']

    # Sum energy charged per session
    energy_charged = charging_ts.groupby('session_id').agg(
        total_energy_battery=('energy_battery', 'sum'),
        total_energy_grid=('energy_grid', 'sum')
    ).reset_index()

    # Merge back to charging_sessions
    charging_sessions = charging_sessions.merge(energy_charged, on='session_id', how='left')

    charging_durations = []
    charging_caps = []
    places = []
    arrival_SoCs = []
    departure_SoCs = []

    for _, row in charging_sessions.iterrows():
        plug_in = row['plug_in_datetime']
        plug_out = row['plug_out_datetime']

        in_session = (charging_ts['date'] >= plug_in) & (charging_ts['date'] <= plug_out)
        session_data = charging_ts[in_session].sort_values(by='date')  # Ensure correct order

        # Charging duration: sum of time differences where charge_battery > 0.
        # Kept as a separate Series so nothing is written into a filtered slice.
        step_hours = session_data['date'].diff().dt.total_seconds().fillna(0) / 3600
        charging_durations.append(step_hours[session_data['charge_battery'] > 1e-12].sum())

        # Maximum charging capacity during the session
        charging_caps.append(session_data['charging_cap'].max())
        places.append(session_data['charging_point'].unique()[0])

        # actual_soc at plug-in (arrival SoC)
        arrival_SoC = charging_ts[charging_ts['date'] == plug_in]['actual_soc'].values
        arrival_SoCs.append(arrival_SoC[0] if len(arrival_SoC) > 0 else None)

        # actual_soc at plug-out (departure SoC)
        departure_SoC = charging_ts[charging_ts['date'] == plug_out]['actual_soc'].values
        departure_SoCs.append(departure_SoC[0] if len(departure_SoC) > 0 else None)

    # Add computed values to the dataset
    charging_sessions['charging_duration'] = charging_durations
    charging_sessions['charging_cap'] = charging_caps
    charging_sessions['place'] = places
    charging_sessions['arrival_SoC'] = arrival_SoCs
    charging_sessions['departure_SoC'] = departure_SoCs

    charging_sessions = charging_sessions.sort_values(by='plug_in_datetime')
    driving_df = driving_df.sort_values(by='date')

    # HBS: hours between the previous plug-out and this plug-in. The first
    # session has no predecessor and receives the user's median instead.
    # Assign the filled column back: fillna(inplace=True) on a selected column
    # is chained assignment and does not update the frame under Copy-on-Write.
    charging_sessions['prev_plug_out'] = charging_sessions['plug_out_datetime'].shift(1)
    charging_sessions['HBS'] = (charging_sessions['plug_in_datetime'] - charging_sessions['prev_plug_out']).dt.total_seconds() / 3600
    charging_sessions['HBS'] = charging_sessions['HBS'].fillna(charging_sessions['HBS'].median())

    driving_consumption = []
    distance_tot = []
    # Iterate over charging sessions
    for _, row in charging_sessions.iterrows():
        if pd.notna(row['prev_plug_out']):  # Skip first session (no previous session)
            # Select driving data between previous plug-out and current plug-in
            mask = (driving_df['date'] > row['prev_plug_out']) & (driving_df['date'] < row['plug_in_datetime'])
            total_consumption = driving_df.loc[mask, 'consumption'].sum()
            total_distance = driving_df.loc[mask, 'distance'].sum()
        else:
            total_consumption = np.nan
            total_distance = np.nan

        driving_consumption.append(total_consumption)
        distance_tot.append(total_distance)

    # CBS / DBS: consumption / distance driven since the previous session;
    # the first session receives the user's median.
    charging_sessions['CBS'] = driving_consumption
    charging_sessions['CBS'] = charging_sessions['CBS'].fillna(charging_sessions['CBS'].median())

    charging_sessions['DBS'] = distance_tot
    charging_sessions['DBS'] = charging_sessions['DBS'].fillna(charging_sessions['DBS'].median())

    # Snap numerically negligible energies to exactly zero.
    cols_to_replace = ['total_energy_battery', 'total_energy_grid']
    charging_sessions[cols_to_replace] = charging_sessions[cols_to_replace].mask(charging_sessions[cols_to_replace].abs() < 1e-10, 0)

    # The DataFrame index is written as an unnamed first column; kept so the
    # layout of the intermediate files does not change.
    charging_sessions.to_csv(f'final_csv/{match.group(1)}_charging_sessions.csv')

## Group into one dataset

In [None]:
# Names of all CSV files in final_csv/.
files = [name for name in os.listdir('final_csv/') if name.endswith('.csv')]

In [None]:
# Stack all per-user session tables into one DataFrame, tagging every row with
# the user id taken from the file name (a string such as "EV_174").
df_list = []

for file in files:
    match = re.search(r'(EV_\d+)__?charging_sessions\.csv', file)
    if not match:
        # Files that do not follow the naming scheme are ignored.
        continue

    user_id = match.group(1)
    df = pd.read_csv('final_csv/' + file)
    df['user_id'] = user_id
    df_list.append(df)

final_df = pd.concat(df_list, ignore_index=True)


Plug-in and plug-out times are expressed at hour-and-minute resolution, as fractional hours (hour + minute / 60).

In [None]:
# Derive calendar features and "next session" targets for the merged dataset.
final_df['plug_in_datetime'] = pd.to_datetime(final_df['plug_in_datetime'])
final_df['plug_out_datetime'] = pd.to_datetime(final_df['plug_out_datetime'])

# Day type of the plug-in day: Saturday, Sunday or Weekday.
final_df['day_type'] = final_df['plug_in_datetime'].dt.dayofweek.map(
    lambda x: 'Saturday' if x == 5 else ('Sunday' if x == 6 else 'Weekday')
)

# Time of day as fractional hours (hour + minute / 60).
final_df['plug_in_time'] = final_df['plug_in_datetime'].dt.hour + final_df['plug_in_datetime'].dt.minute / 60
final_df['plug_out_time'] = final_df['plug_out_datetime'].dt.hour + final_df['plug_out_datetime'].dt.minute / 60

# Chronological order per user is required by the shift(-1) calls below; one
# sort is enough because nothing reorders the frame afterwards.
final_df = final_df.sort_values(by=['user_id', 'plug_in_datetime'])
by_user = final_df.groupby('user_id')

# next_*: value of the following session of the same user. The last session of
# every user has no successor and is filled with the dataset-wide median (or
# most frequent place). The filled column is assigned back because
# fillna(inplace=True) on a selected column is chained assignment and does not
# update the frame under Copy-on-Write.
final_df['next_CBS'] = by_user['CBS'].shift(-1)
final_df['next_DBS'] = by_user['DBS'].shift(-1)
final_df['next_DBS'] = final_df['next_DBS'].fillna(final_df['next_DBS'].median())
final_df['next_CBS'] = final_df['next_CBS'].fillna(final_df['next_CBS'].median())

final_df['next_dest'] = by_user['place'].shift(-1)
most_common_dest = final_df['next_dest'].mode()
if not most_common_dest.empty:
    # mode() is empty when no session has a successor; leave NaN in that case
    # instead of failing on the lookup of its first element.
    final_df['next_dest'] = final_df['next_dest'].fillna(most_common_dest.iloc[0])

In [None]:
# Summary of the columns, dtypes and non-null counts of the merged dataset.
final_df.info()

In [None]:
# Descriptive statistics of the merged dataset.
final_df.describe()

In [None]:
# Number of distinct users present in the merged dataset.
final_df['user_id'].nunique()

In [None]:
# Write the merged dataset to output_final_csv without the DataFrame index.
final_df.to_csv(output_final_csv, index=False)