In [None]:
#If kagglehub is not installed, install kagglehub.
# Otherwise, you can download original data on https://www.kaggle.com/datasets/patrickfleith/nasa-battery-dataset.

!pip install kagglehub

In [None]:
# Download the NASA battery dataset via kagglehub.
# Alternatively, download the original data manually from https://www.kaggle.com/datasets/patrickfleith/nasa-battery-dataset.

import kagglehub
import os

# Point the KAGGLEHUB_CACHE environment variable at ./NASA_dataset (resolved to
# an absolute path from the current working directory) before requesting the
# download, then report where the dataset files ended up.
cache_dir = os.path.abspath("./NASA_dataset")
os.environ["KAGGLEHUB_CACHE"] = cache_dir

path = kagglehub.dataset_download("patrickfleith/nasa-battery-dataset")
print("Path to dataset files:", path)

In [10]:
## data_splitting

from scipy.stats import kurtosis, skew, entropy
import numpy as np
import pandas as pd
import re
import os
import glob

def compute_shannon_entropy(signal, bins=100):
    """Return the base-2 entropy of a signal's histogram.

    Args:
        signal: Array-like of numeric samples.
        bins: Number of histogram bins (passed straight to ``np.histogram``).

    Returns:
        The value of ``scipy.stats.entropy`` (base 2) evaluated on the
        density-histogram heights, after bins with zero height are discarded.
    """
    densities = np.histogram(signal, bins=bins, density=True)[0]
    # Empty bins carry no probability mass; drop them before the entropy call.
    occupied = densities[densities > 0]
    return entropy(occupied, base=2)

def extract_file_number(filename):
    """Return the first run of digits in ``filename`` as an int.

    Raises:
        IndexError: if ``filename`` contains no digits at all.
    """
    digit_runs = re.findall(r'\d+', filename)
    leading_run = digit_runs[0]
    return int(leading_run)

def extract_battery_num(battery_id):
    """Return the last two characters of ``battery_id`` (e.g. 'B0047' -> '47').

    Inputs shorter than two characters are returned unchanged.
    """
    trailing_pair = slice(-2, None)
    return battery_id[trailing_pair]

# Build one summary CSV per battery from its 'charge' measurement files.
# For every charge file, statistical features are computed on a voltage window
# near the end of charge and paired with the capacity recorded for the next
# discharge file of the same battery.
print("Data splitting starts...")

# directory setting
dataset_root = os.path.join(path, "cleaned_dataset")
data_dir = os.path.join(dataset_root, "data")
output_dir = "../data/NASA data"
metadata = pd.read_csv(os.path.join(dataset_root, "metadata.csv"))
data_type = 'charge'  # 'discharge' or 'charge'

# Column order of the summary CSVs; also guarantees the columns exist when a
# battery produces no usable rows.
features = [
    'voltage mean', 'voltage std', 'voltage kurtosis', 'voltage skewness',
    'CC Q', 'CC charge time', 'voltage slope', 'voltage entropy',
    'current mean', 'current std', 'current kurtosis', 'current skewness',
    'CV Q', 'CV charge time', 'current slope', 'current entropy',
    'capacity'
]

batches = ['Dataset_05_06_07_18', 'Dataset_25_26_27_28', 'Dataset_29_30_31_32', 'Dataset_33_34_36', 'Dataset_38_39_40',
           'Dataset_41_42_43_44', 'Dataset_45_46_47_48', 'Dataset_49_50_51_52', 'Dataset_53_54_55_56']

signal_cols = ['Voltage_measured', 'Current_measured', 'Time']

# np.trapz is deprecated in favour of np.trapezoid on NumPy 2.x; use whichever
# this NumPy version provides.
trapezoid = getattr(np, "trapezoid", None) or np.trapz

for battery_id in metadata['battery_id'].unique():
    # Find the batch whose name lists this battery's two-character number.
    # Tokens are compared whole (split on '_') instead of by substring.
    battery_num = extract_battery_num(battery_id)
    batch = next((b for b in batches if battery_num in b.split('_')[1:]), None)
    if batch is None:
        continue

    battery_meta = metadata[metadata['battery_id'] == battery_id]
    battery_df = battery_meta[battery_meta['type'] == data_type]

    # Discharge rows of this battery with a non-null Capacity, ordered by file
    # number. Built once per battery because it does not depend on the file
    # being processed.
    discharge_rows = battery_meta[battery_meta['type'] == 'discharge'].copy()
    discharge_rows['file_num'] = discharge_rows['filename'].apply(extract_file_number)
    discharge_rows = discharge_rows.sort_values('file_num')
    discharge_rows = discharge_rows[~discharge_rows['Capacity'].isna()]

    result = []

    for _, row in battery_df.iterrows():
        filepath = os.path.join(data_dir, row['filename'])
        charge_num = extract_file_number(row['filename'])

        # Label: Capacity of the first discharge file numbered after this file.
        valid_future = discharge_rows[discharge_rows['file_num'] > charge_num]
        capacity = valid_future.iloc[0]['Capacity'] if not valid_future.empty else np.nan

        # Reset per file so the error handler never prints a stale or unbound frame.
        df = None
        try:
            df = pd.read_csv(filepath)

            # Keep samples strictly within +/-0.2 of V_end, where V_end is the
            # 95th percentile of the measured voltage (peak values are unstable).
            v_end = df['Voltage_measured'].quantile(0.95)
            in_window = (df['Voltage_measured'] > v_end - 0.2) & (df['Voltage_measured'] < v_end + 0.2)
            # .copy() so the fill below writes to an independent frame, not a slice view.
            df = df[in_window].copy()
            df[signal_cols] = df[signal_cols].ffill().bfill()

            if df[signal_cols].isnull().any().any():
                print(f"{filepath} skipped: still contains NaN after filling")
                continue

            v = df['Voltage_measured']
            c = df['Current_measured']
            t = df['Time']

            # Samples within 0.005 of the window's maximum voltage are treated
            # as the CV part; everything else as the CC part.
            cv_mask = v > v.max() - 0.005
            cc_mask = ~cv_mask
            t_cc, t_cv = t[cc_mask], t[cv_mask]

            row_result = {
                'voltage mean': v.mean(),
                'voltage std': v.std(),
                'voltage kurtosis': kurtosis(v),
                'voltage skewness': skew(v),
                'CC Q': trapezoid(c[cc_mask], t_cc) if cc_mask.any() else np.nan,
                'CC charge time': t_cc.iloc[-1] - t_cc.iloc[0] if cc_mask.sum() > 1 else np.nan,
                'voltage slope': np.gradient(v, t).mean(),
                'voltage entropy': compute_shannon_entropy(v),
                'current mean': c.mean(),
                'current std': c.std(),
                'current kurtosis': kurtosis(c),
                'current skewness': skew(c),
                'CV Q': trapezoid(c[cv_mask], t_cv) if cv_mask.any() else np.nan,
                'CV charge time': t_cv.iloc[-1] - t_cv.iloc[0] if cv_mask.sum() > 1 else np.nan,
                'current slope': np.gradient(c, t).mean(),
                'current entropy': compute_shannon_entropy(c),
                'capacity': capacity
            }

            result.append(row_result)

        except Exception as e:
            # Best effort: report the failing file and move on to the next one.
            print(f"Error in {filepath}: {e}")
            if df is not None:
                print(df)

    batch_dir = os.path.join(output_dir, batch)
    out_path = os.path.join(batch_dir, f"{battery_id}_{data_type}_summary.csv")
    os.makedirs(batch_dir, exist_ok=True)

    # NOTE(review): the last processed file is always dropped (behaviour kept
    # from the original) - presumably because it has no later discharge to
    # take a capacity label from; confirm.
    # Passing columns=features keeps the frame well-formed even when no rows remain.
    result_df = pd.DataFrame(result[:-1], columns=features)
    # Drop rows whose capacity is the literal string '[]'
    # (presumably a placeholder for a missing value in metadata.csv - confirm).
    result_df = result_df[result_df['capacity'] != '[]']

    result_df.to_csv(out_path, index=False)

    print(f"{battery_id}_{data_type}_summary.csv saved in {batch}" )

print("Data split all done.")

Data splitting starts...
B0047_charge_summary.csv saved in Dataset_45_46_47_48
B0045_charge_summary.csv saved in Dataset_45_46_47_48
B0048_charge_summary.csv saved in Dataset_45_46_47_48
B0046_charge_summary.csv saved in Dataset_45_46_47_48
B0043_charge_summary.csv saved in Dataset_41_42_43_44
B0032_charge_summary.csv saved in Dataset_29_30_31_32
B0039_charge_summary.csv saved in Dataset_38_39_40
B0040_charge_summary.csv saved in Dataset_38_39_40
B0029_charge_summary.csv saved in Dataset_29_30_31_32
B0028_charge_summary.csv saved in Dataset_25_26_27_28
B0042_charge_summary.csv saved in Dataset_41_42_43_44
B0034_charge_summary.csv saved in Dataset_33_34_36
B0038_charge_summary.csv saved in Dataset_38_39_40
B0033_charge_summary.csv saved in Dataset_33_34_36
B0030_charge_summary.csv saved in Dataset_29_30_31_32
B0041_charge_summary.csv saved in Dataset_41_42_43_44
B0027_charge_summary.csv saved in Dataset_25_26_27_28
B0044_charge_summary.csv saved in Dataset_41_42_43_44
B0036_charge_summa

In [None]:
## NASA discharge data processing, including the dQ/dV index

# Build one summary CSV per battery from its 'discharge' measurement files.
# Same statistical features as the charge summary, computed on the full file,
# plus a 'dqdv' index (99th percentile of the clipped dQ/dV estimate).
# Relies on names defined in earlier cells: path, os, np, pd, kurtosis, skew,
# compute_shannon_entropy, extract_file_number and extract_battery_num.
print("Data splitting starts...")

# directory setting
dataset_root = os.path.join(path, "cleaned_dataset")
data_dir = os.path.join(dataset_root, "data")
output_dir = "../data/NASA_dqdv data"
metadata = pd.read_csv(os.path.join(dataset_root, "metadata.csv"))
data_type = 'discharge'  # 'discharge' or 'charge'

# Column order of the summary CSVs; also guarantees the columns exist when a
# battery produces no usable rows. (A missing comma previously merged the last
# two names into 'dqdvcapacity'.)
features = [
    'voltage mean', 'voltage std', 'voltage kurtosis', 'voltage skewness',
    'CC Q', 'CC charge time', 'voltage slope', 'voltage entropy',
    'current mean', 'current std', 'current kurtosis', 'current skewness',
    'CV Q', 'CV charge time', 'current slope', 'current entropy', 'dqdv',
    'capacity'
]

batches = ['Dataset_05_06_07_18', 'Dataset_25_26_27_28', 'Dataset_29_30_31_32', 'Dataset_33_34_36', 'Dataset_38_39_40',
           'Dataset_41_42_43_44', 'Dataset_45_46_47_48', 'Dataset_49_50_51_52', 'Dataset_53_54_55_56']

signal_cols = ['Voltage_measured', 'Current_measured', 'Time']

# np.trapz is deprecated in favour of np.trapezoid on NumPy 2.x; use whichever
# this NumPy version provides.
trapezoid = getattr(np, "trapezoid", None) or np.trapz

for battery_id in metadata['battery_id'].unique():
    # Find the batch whose name lists this battery's two-character number.
    # Tokens are compared whole (split on '_') instead of by substring.
    battery_num = extract_battery_num(battery_id)
    batch = next((b for b in batches if battery_num in b.split('_')[1:]), None)
    if batch is None:
        continue

    battery_meta = metadata[metadata['battery_id'] == battery_id]
    battery_df = battery_meta[battery_meta['type'] == data_type]

    # Discharge rows of this battery with a non-null Capacity, ordered by file
    # number. Built once per battery because it does not depend on the file
    # being processed.
    discharge_rows = battery_meta[battery_meta['type'] == 'discharge'].copy()
    discharge_rows['file_num'] = discharge_rows['filename'].apply(extract_file_number)
    discharge_rows = discharge_rows.sort_values('file_num')
    discharge_rows = discharge_rows[~discharge_rows['Capacity'].isna()]

    result = []

    for _, row in battery_df.iterrows():
        filepath = os.path.join(data_dir, row['filename'])
        file_num = extract_file_number(row['filename'])

        # NOTE(review): this labels each discharge file with the Capacity of the
        # NEXT discharge file (file_num strictly greater), not its own -
        # behaviour kept from the original; confirm it is intended.
        valid_future = discharge_rows[discharge_rows['file_num'] > file_num]
        capacity = valid_future.iloc[0]['Capacity'] if not valid_future.empty else np.nan

        # Reset per file so the error handler never prints a stale or unbound frame.
        df = None
        try:
            df = pd.read_csv(filepath)
            # Unlike the charge summary, no voltage window is applied here:
            # the whole file is used.
            df[signal_cols] = df[signal_cols].ffill().bfill()

            if df[signal_cols].isnull().any().any():
                print(f"{filepath} skipped: still contains NaN after filling")
                continue

            v = df['Voltage_measured']
            c = df['Current_measured']
            t = df['Time']

            # dQ/dV estimate per consecutive sample pair: current / (dV/dt),
            # using the current at the first sample of each pair.
            # Pairs with dV/dt == 0 and non-positive (or NaN) ratios are set to 0.
            v_arr, c_arr, t_arr = v.to_numpy(), c.to_numpy(), t.to_numpy()
            with np.errstate(divide='ignore', invalid='ignore'):
                dvdt = np.diff(v_arr) / np.diff(t_arr)
                dqdv = np.where(dvdt != 0, c_arr[:-1] / dvdt, 0.0)
            dqdv = np.where(dqdv > 0, dqdv, 0.0)

            # 99th percentile rather than the maximum, so single spikes do not dominate.
            dqdv_value = np.quantile(dqdv, 0.99)

            # Samples within 0.005 of the maximum voltage are treated as the
            # CV part; everything else as the CC part.
            cv_mask = v > v.max() - 0.005
            cc_mask = ~cv_mask
            t_cc, t_cv = t[cc_mask], t[cv_mask]

            row_result = {
                'voltage mean': v.mean(),
                'voltage std': v.std(),
                'voltage kurtosis': kurtosis(v),
                'voltage skewness': skew(v),
                'CC Q': trapezoid(c[cc_mask], t_cc) if cc_mask.any() else np.nan,
                'CC charge time': t_cc.iloc[-1] - t_cc.iloc[0] if cc_mask.sum() > 1 else np.nan,
                'voltage slope': np.gradient(v, t).mean(),
                'voltage entropy': compute_shannon_entropy(v),
                'current mean': c.mean(),
                'current std': c.std(),
                'current kurtosis': kurtosis(c),
                'current skewness': skew(c),
                'CV Q': trapezoid(c[cv_mask], t_cv) if cv_mask.any() else np.nan,
                'CV charge time': t_cv.iloc[-1] - t_cv.iloc[0] if cv_mask.sum() > 1 else np.nan,
                'current slope': np.gradient(c, t).mean(),
                'current entropy': compute_shannon_entropy(c),
                'dqdv': dqdv_value,
                'capacity': capacity
            }

            result.append(row_result)

        except Exception as e:
            # Best effort: report the failing file and move on to the next one.
            print(f"Error in {filepath}: {e}")
            if df is not None:
                print(df)

    batch_dir = os.path.join(output_dir, batch)
    out_path = os.path.join(batch_dir, f"{battery_id}_{data_type}_summary.csv")
    os.makedirs(batch_dir, exist_ok=True)

    # NOTE(review): the last processed file is always dropped (behaviour kept
    # from the original) - presumably because it has no later discharge to
    # take a capacity label from; confirm.
    # Passing columns=features keeps the frame well-formed even when no rows remain.
    result_df = pd.DataFrame(result[:-1], columns=features)
    # Drop rows whose capacity is the literal string '[]'
    # (presumably a placeholder for a missing value in metadata.csv - confirm).
    result_df = result_df[result_df['capacity'] != '[]']

    result_df.to_csv(out_path, index=False)

    print(f"{battery_id}_{data_type}_summary.csv saved in {batch}" )

print("Data split all done.")

Data splitting starts...
B0047_discharge_summary.csv saved in Dataset_45_46_47_48
B0045_discharge_summary.csv saved in Dataset_45_46_47_48
B0048_discharge_summary.csv saved in Dataset_45_46_47_48
B0046_discharge_summary.csv saved in Dataset_45_46_47_48
B0043_discharge_summary.csv saved in Dataset_41_42_43_44
B0032_discharge_summary.csv saved in Dataset_29_30_31_32
B0039_discharge_summary.csv saved in Dataset_38_39_40
B0040_discharge_summary.csv saved in Dataset_38_39_40
B0029_discharge_summary.csv saved in Dataset_29_30_31_32
B0028_discharge_summary.csv saved in Dataset_25_26_27_28
B0042_discharge_summary.csv saved in Dataset_41_42_43_44
B0034_discharge_summary.csv saved in Dataset_33_34_36
B0038_discharge_summary.csv saved in Dataset_38_39_40
B0033_discharge_summary.csv saved in Dataset_33_34_36
B0030_discharge_summary.csv saved in Dataset_29_30_31_32
B0041_discharge_summary.csv saved in Dataset_41_42_43_44
B0027_discharge_summary.csv saved in Dataset_25_26_27_28
B0044_discharge_summa

In [12]:
# Remove the downloaded original dataset directory (./NASA_dataset).
# shutil.rmtree works on every platform, unlike a shell `rm -rf`;
# ignore_errors=True keeps the "no error if it is already gone" behaviour.
import shutil

shutil.rmtree("NASA_dataset", ignore_errors=True)