# Feature Engineering - 0629_08-10-35 (consider all features)

### Overview
This notebook serves a simple utility purpose to take a single data log with all network signals considered as features and does the initial preprocessing to cleaned & raw so it can be used by the LSTM modeling notebook. 

In this case we take the initial coverted raw csv file from the blf conversion contained in /can_data/*.csv and perform LSTM preprocessing and store to the  /can_data/lstm_processed folder. The file name can be changed to perform this lstm preprocessing on any other raw network data csv file by simply changing the filename below. 

LSTM Preprocessing 
- Load data frame log
- Set time as index and downsample to 1s
- Drop bad signals
- Clean column names to just include signal name
- Backfill NaN data
- Elimnate duplicate columns

In [5]:
# Import all the necessary libraries to be used by this script

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pathlib import Path

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import confusion_matrix, roc_curve, auc, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Resolve folder paths before reading data
RAW_CSV_DIR = Path("can_data")
PREPROCESSED_CSV_DIR = Path("can_data/lstm_preprocessed") 

In [6]:
# Load cleaned  data from 0629_35 - 1.5 hours, 1.5 cycles, IBS signals only

path = RAW_CSV_DIR / "12V Management  Discharge and Charge Cycle 2025-06-29_08-10-35.csv"
path_cleaned_raw = PREPROCESSED_CSV_DIR / "12V Management  Discharge and Charge Cycle 2025-06-29_08-10-35_cleaned_raw.csv"

# Most efficient: parse the datetime column on read and set as the index
df = pd.read_csv(
    path,
    parse_dates=["time_utc_iso"],    # let pandas parse ISO8601 (handles 'Z' -> UTC)
)

# Ensure timezone-aware UTC (if the parsed datetimes are naive)
if df["time_utc_iso"].dt.tz is None:
    # If strings had a trailing 'Z', pandas usually sets UTC automatically; otherwise, localize:
    df["time_utc_iso"] = df["time_utc_iso"].dt.tz_localize("UTC")

# Set as index and sort
df = df.set_index("time_utc_iso").sort_index()

df = df.drop(columns=["time_et_iso", "excel_utc"], errors="ignore") # drop not needed time columns

print(df.index[:5])
print(df.index.dtype)   # should show datetime64[ns, UTC]
print(df.shape)


  df = pd.read_csv(


DatetimeIndex(['2025-06-29 12:10:35+00:00', '2025-06-29 12:10:36+00:00',
               '2025-06-29 12:10:37+00:00', '2025-06-29 12:10:38+00:00',
               '2025-06-29 12:10:39+00:00'],
              dtype='datetime64[ns, UTC]', name='time_utc_iso', freq=None)
datetime64[ns, UTC]
(4734, 3078)


In [7]:
df.head()

Unnamed: 0_level_0,VCU_0x214.VCU_214_234_CheckSum,VCU_0x214.VCU_RemWakeUpEndFlg,VCU_0x214.VCU_BrkLampCtrlSts,VCU_0x214.VCU_RdyLamp,VCU_0x214.VCU_214_234_AliveCounter,VCU_0x214.VCU_GearSigVld,VCU_0x214.VCU_ShiftMisoper,VCU_0x214.VCU_StgyGearSig,VCU_0x214.VCU_APSPercVld,VCU_0x214.VCU_ParkRdy,...,Diag_OBC_Resp.OBC_Byte7_Resp,Diag_OBC_Resp.OBC_Byte8_Resp,Diag_PVIU_Resp.PVIU_TPCI_Resp,Diag_PVIU_Resp.PVIU_Byte2_Resp,Diag_PVIU_Resp.PVIU_Byte3_Resp,Diag_PVIU_Resp.PVIU_Byte4_Resp,Diag_PVIU_Resp.PVIU_Byte5_Resp,Diag_PVIU_Resp.PVIU_Byte6_Resp,Diag_PVIU_Resp.PVIU_Byte7_Resp,Diag_PVIU_Resp.PVIU_Byte8_Resp
time_utc_iso,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-06-29 12:10:35+00:00,176,remote_control_not_ended,No_request,Ready,0,Valid,Normal,gear_P,Valid,Not_ready,...,,,,,,,,,,
2025-06-29 12:10:36+00:00,216,remote_control_not_ended,No_request,Ready,10,Valid,Normal,gear_P,Valid,Not_ready,...,,,,,,,,,,
2025-06-29 12:10:37+00:00,132,remote_control_not_ended,No_request,Ready,5,Valid,Normal,gear_P,Valid,Not_ready,...,,,,,,,,,,
2025-06-29 12:10:38+00:00,176,remote_control_not_ended,No_request,Ready,0,Valid,Normal,gear_P,Valid,Not_ready,...,,,,,,,,,,
2025-06-29 12:10:39+00:00,216,remote_control_not_ended,No_request,Ready,10,Valid,Normal,gear_P,Valid,Not_ready,...,,,,,,,,,,


In [8]:
# Downsample if not already 1 second

# 1) Numeric columns → median per second
df_num = df.resample("1S").median(numeric_only=True)

# 2) The rest (non-numeric) → take the last value seen each second, then forward-fill gaps
non_num_cols = df.columns.difference(df_num.columns)
df_non = df[non_num_cols].resample("1S").last().ffill()

# 3) Recombine and keep as your working df
df = pd.concat([df_num, df_non], axis=1).sort_index()

print(df.index[:5])
print(df.shape)


  df_num = df.resample("1S").median(numeric_only=True)
  df_non = df[non_num_cols].resample("1S").last().ffill()


DatetimeIndex(['2025-06-29 12:10:35+00:00', '2025-06-29 12:10:36+00:00',
               '2025-06-29 12:10:37+00:00', '2025-06-29 12:10:38+00:00',
               '2025-06-29 12:10:39+00:00'],
              dtype='datetime64[ns, UTC]', name='time_utc_iso', freq='s')
(4734, 3078)


In [9]:
# Drop bad signals

# drop constant cols
df = df.loc[:, df.nunique() > 1]

# drop mostly empty cols
missing_thresh = 0.5
df = df.loc[:, df.isnull().mean() < missing_thresh]

# drop checksums and alivecounters
df = df.drop(columns=df.filter(like="CheckSum").columns)
df = df.drop(columns=df.filter(like="AliveCounter").columns)
df = df.drop(columns=df.filter(like="BMS_CellVolt").columns)

print(df.shape)

(4734, 408)


In [10]:
# Shorten column names to remove the CAN message and only keep the signal name
new_columns = []
for col in df.columns:
    if '.' in col:
        name = col.split('.', 1)[1]
    else:
        name = col
    new_columns.append(name)

df.columns = new_columns 
print(df.columns)

Index(['MCU_F_CrtSpd', 'MCU_F_CrtTq', 'MCU_F_MaxPwrGennTq', 'MCU_F_MaxElecTq',
       'MCU_F_MOT_T', 'MCU_F_IGBTT', 'MCU_R_CrtSpd', 'MCU_R_CrtTq',
       'MCU_R_MaxPwrGennTq', 'MCU_R_MaxElecTq',
       ...
       'VCU_VehMod', 'VCU_LVbattCtrlSigSTGFlt', 'VCU_HVBActivateDeactivateRq',
       'VCU_Set_BMS_Mod', 'IBS_CurrentAutorange', 'VSP_HornWngSts',
       'VSP_WngMod', 'VSP_UsrData1_Bit6_DIAGN_AWAKE', 'WTC_B_HVPwrSplySts',
       'WTC_H_HVPwrSplySts'],
      dtype='object', length=408)


In [11]:
# Handle NaNs 
df = df.interpolate(method="linear").ffill().bfill()

print(df.shape)

  df = df.interpolate(method="linear").ffill().bfill()


(4734, 408)


In [12]:
# Eliminate duplicate columns
dups = df.columns[df.columns.duplicated()]
print(f"Duplicate columns: {len(dups)}")
if len(dups):
    print(dups.unique()[:20])  # peek

Duplicate columns: 10
Index(['GW_CtrlBitVector_Bit0_RMR', 'GW_UsrData1_Bit5_IGNITION_AWAKE',
       'PKC_IMMO_AuthSts', 'PKC_KeySts', 'PKC_RemLockCmd_Key', 'TBOX_4GSigIND',
       'VCU_RdyLamp', 'VCU_VehSt', 'VCU_ChrgSysOperCmd', 'VCU_VehChrgDchgMod'],
      dtype='object')


In [13]:
# Save a raw version for later for SOH1 calculation
df_raw = df.copy()

### Ready to save preprocessed dataframe
- Load data frame log
- Set time as index and downsample to 1s
- Drop bad signals
- Clean column names to just include signal name
- Backfill NaN data
- Elimnate duplicate columns

In [14]:
# Save raw preprocessed data before converting non-numeric (one-hot encoding), scaling, & soh1 calculation
# Save cleaned data file for next steps
df.to_csv(path_cleaned_raw)

In [15]:
df.shape

(4734, 408)