### 1. 기본 설정

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')

- 폰트 깨짐 방지

In [2]:
import platform
if platform.system() == 'Windows':
    plt.rc('font', family='Malgun Gothic') 
else:
    
    plt.rc('font', family='NanumGothic')

# 마이너스 기호방지
plt.rcParams['axes.unicode_minus'] = False

In [3]:
# 메타데이터 불러오기
metadata_path = '../raw 원본데이터/metadata.csv'
metadata = pd.read_csv(metadata_path)

In [4]:
# data 불러오기
data_path = '../raw 원본데이터/data'
file_list = os.listdir(data_path)

print(f"파일개수: {len(file_list)}개 ")
print(f"파일명(예시용): {file_list[:3]}") 

파일개수: 7565개 
파일명(예시용): ['00001.csv', '00002.csv', '00003.csv']


### 2. 데이터 전처리 

##### 2-1 공통 전처리

[2-1-1] metadata에서 battery_id가 [5, 6, 7, 18]인 데이터만 추출.

In [5]:
# # 1. 타겟 배터리 선별 (5, 6, 7, 18번)
target_data = metadata[metadata['battery_id'].isin(['B0005','B0006', 'B0007', 'B0018'])] 
target_data.reset_index(inplace=True,drop='True')

[2-1-2] 데이터 타입 변환 (문자열 -> 실수형)

In [6]:
target_data['Capacity'] = target_data['Capacity'].astype('float')
target_data['Re'] = target_data['Re'].astype('float')
target_data['Rct'] = target_data['Rct'].astype('float')

In [7]:
def convert_to_dt(string):
    # 1. 대괄호 제거 후 공백으로 분리하여 숫자 리스트로 변환
    nums = [float(n) for n in string.strip('[]').split()]
    # 2. 연, 월, 일, 시, 분, 초 추출
    # pd.Timestamp를 이용해 바로 변환 (초 단위에 소수점이 있어도 잘 처리됨)
    return pd.Timestamp(year=int(nums[0]), month=int(nums[1]), day=int(nums[2]),
                        hour=int(nums[3]), minute=int(nums[4]), second=int(nums[5]),
                        microsecond=int((nums[5] % 1) * 1e6))

# 적용
target_data['start_time'] = target_data['start_time'].apply(convert_to_dt)
target_data['start_time'] = pd.to_datetime(target_data['start_time'])

[2-2-3] Charge, Discharge, Impedance 데이터 분리

In [8]:
charge_data = target_data[target_data['type'] == 'charge'].reset_index(drop='True')
discharge_data = target_data[target_data['type'] == 'discharge'].reset_index(drop='True')
impedance_data = target_data[target_data['type'] == 'impedance'].reset_index(drop='True')

#### 2-2 Discharge 데이터 전처리

[1] 배터리 별 데이터 분리

In [9]:
discharge_b5 = discharge_data[discharge_data['battery_id']=='B0005'].reset_index(drop=True)
discharge_b6 = discharge_data[discharge_data['battery_id']=='B0006'].reset_index(drop=True)
discharge_b7 = discharge_data[discharge_data['battery_id']=='B0007'].reset_index(drop=True)
discharge_b18 = discharge_data[discharge_data['battery_id']=='B0018'].reset_index(drop=True)

[2]
Time_diff : 이전 discharge 데이터와 start_time 차이<br>
Capacity_diff : 이전 discharge 데이터와 capacity 차이<br>
Discharge_cnt : Discharge가 수행된 횟수

In [10]:
discharge_b5['time_diff'] = discharge_b5.groupby('battery_id')['start_time'].diff()
discharge_b5['Capacity_diff'] = discharge_b5.groupby('battery_id')['Capacity'].diff()
discharge_b5['discharge_cnt'] = discharge_b5.index+1

discharge_b6['time_diff'] = discharge_b6.groupby('battery_id')['start_time'].diff()
discharge_b6['Capacity_diff'] = discharge_b6.groupby('battery_id')['Capacity'].diff()
discharge_b6['discharge_cnt'] = discharge_b6.index+1

discharge_b7['time_diff'] = discharge_b7.groupby('battery_id')['start_time'].diff()
discharge_b7['Capacity_diff'] = discharge_b7.groupby('battery_id')['Capacity'].diff()
discharge_b7['discharge_cnt'] = discharge_b7.index+1

discharge_b18['time_diff'] = discharge_b18.groupby('battery_id')['start_time'].diff()
discharge_b18['Capacity_diff'] = discharge_b18.groupby('battery_id')['Capacity'].diff()
discharge_b18['discharge_cnt'] = discharge_b18.index+1

[3] Discharge 데이터 병합

In [11]:
discharge = pd.concat([discharge_b5,discharge_b6,discharge_b7,discharge_b18], axis=0)
discharge.reset_index(inplace=True, drop=True)

[4] 배터리 별 방전 간 간격 계산

In [12]:
discharge = discharge.sort_values(['battery_id', 'start_time'])
discharge['next_discharge_time'] = discharge.groupby('battery_id')['start_time'].shift(1)
discharge['time_to_next_discharge_hrs'] = ( discharge['start_time'] - discharge['next_discharge_time']).dt.total_seconds() / 3600

#### 2-3 Impdance 데이터 전처리 


[1] 배터리 별 데이터 분리

In [13]:
impedance_b5 = impedance_data[impedance_data['battery_id']=='B0005'].reset_index(drop=True)
impedance_b6 = impedance_data[impedance_data['battery_id']=='B0006'].reset_index(drop=True)
impedance_b7 = impedance_data[impedance_data['battery_id']=='B0007'].reset_index(drop=True)
impedance_b18 = impedance_data[impedance_data['battery_id']=='B0018'].reset_index(drop=True)

Time_diff : 이전 impedance 측정 데이터와 start_time 차이<br>
Re_diff : 이전 impedance 측정 데이터와 Re 차이<br>
Rct_diff : 이전 impedance 측정 데이터와 Rct 차이

In [14]:
impedance_b5['time_diff'] = impedance_b5.groupby('battery_id')['start_time'].diff()
impedance_b5['Re_diff'] = impedance_b5.groupby('battery_id')['Re'].diff()
impedance_b5['Rct_diff'] = impedance_b5.groupby('battery_id')['Rct'].diff()

impedance_b6['time_diff'] = impedance_b6.groupby('battery_id')['start_time'].diff()
impedance_b6['Re_diff'] = impedance_b6.groupby('battery_id')['Re'].diff()
impedance_b6['Rct_diff'] = impedance_b6.groupby('battery_id')['Rct'].diff()

impedance_b7['time_diff'] = impedance_b7.groupby('battery_id')['start_time'].diff()
impedance_b7['Re_diff'] = impedance_b7.groupby('battery_id')['Re'].diff()
impedance_b7['Rct_diff'] = impedance_b7.groupby('battery_id')['Rct'].diff()

impedance_b18['time_diff'] = impedance_b18.groupby('battery_id')['start_time'].diff()
impedance_b18['Re_diff'] = impedance_b18.groupby('battery_id')['Re'].diff()
impedance_b18['Rct_diff'] = impedance_b18.groupby('battery_id')['Rct'].diff()

Impedance 데이터 병합

In [15]:
impedance = pd.concat([impedance_b5, impedance_b6, impedance_b7, impedance_b18], axis=0)
impedance.reset_index(inplace=True, drop=True)

#### Charge 데이터 전처리 

[1] 배터리 별 데이터 분리

In [16]:
charge_b5 = charge_data[impedance_data['battery_id']=='B0005'].reset_index(drop=True)
charge_b6 = charge_data[impedance_data['battery_id']=='B0006'].reset_index(drop=True)
charge_b7 = charge_data[impedance_data['battery_id']=='B0007'].reset_index(drop=True)
charge_b18 = charge_data[impedance_data['battery_id']=='B0018'].reset_index(drop=True)

[2] Time_diff : 이전 charge 측정 데이터와 start_time 차이

In [17]:
charge_b5['time_diff'] = charge_b5.groupby('battery_id')['start_time'].diff()
charge_b6['time_diff'] = charge_b6.groupby('battery_id')['start_time'].diff()
charge_b7['time_diff'] = charge_b7.groupby('battery_id')['start_time'].diff()
charge_b18['time_diff'] = charge_b18.groupby('battery_id')['start_time'].diff()

[3] Charge 데이터 병합

In [18]:
charge = pd.concat([charge_b5, charge_b6, charge_b7, charge_b18], axis=0)
charge.reset_index(inplace=True, drop=True)

#### 2-4 Charge, Discharge, Impedance 데이터 확인 


[1] charge

In [19]:
charge.head()

Unnamed: 0,type,start_time,ambient_temperature,battery_id,test_id,uid,filename,Capacity,Re,Rct,time_diff
0,charge,2008-05-13 10:47:01.828000,24,B0005,380,5501,05501.csv,,,,NaT
1,charge,2008-05-13 15:43:41.295999,24,B0005,384,5505,05505.csv,,,,0 days 04:56:39.467999
2,charge,2008-05-13 20:27:29.109000,24,B0005,388,5509,05509.csv,,,,0 days 04:43:47.813001
3,charge,2008-05-14 01:22:34.765000,24,B0005,392,5513,05513.csv,,,,0 days 04:55:05.656000
4,charge,2008-05-14 06:18:57.625000,24,B0005,396,5517,05517.csv,,,,0 days 04:56:22.860000


[2] discharge

In [20]:
discharge.head()

Unnamed: 0,type,start_time,ambient_temperature,battery_id,test_id,uid,filename,Capacity,Re,Rct,time_diff,Capacity_diff,discharge_cnt,next_discharge_time,time_to_next_discharge_hrs
0,discharge,2008-04-02 15:25:41.593000,24,B0005,1,5122,05122.csv,1.856487,,,NaT,,1,NaT,
1,discharge,2008-04-02 19:43:48.405999,24,B0005,3,5124,05124.csv,1.846327,,,0 days 04:18:06.812999,-0.01016,2,2008-04-02 15:25:41.593000,4.301892
2,discharge,2008-04-03 00:01:06.687000,24,B0005,5,5126,05126.csv,1.835349,,,0 days 04:17:18.281001,-0.010978,3,2008-04-02 19:43:48.405999,4.288411
3,discharge,2008-04-03 04:16:37.375000,24,B0005,7,5128,05128.csv,1.835263,,,0 days 04:15:30.688000,-8.7e-05,4,2008-04-03 00:01:06.687000,4.258524
4,discharge,2008-04-03 08:33:25.702999,24,B0005,9,5130,05130.csv,1.834646,,,0 days 04:16:48.327999,-0.000617,5,2008-04-03 04:16:37.375000,4.280091


[3] impedance

In [21]:
impedance.head()

Unnamed: 0,type,start_time,ambient_temperature,battery_id,test_id,uid,filename,Capacity,Re,Rct,time_diff,Re_diff,Rct_diff
0,impedance,2008-04-18 20:55:29.859000,24,B0005,40,5161,05161.csv,,0.044669,0.069456,NaT,,
1,impedance,2008-04-18 22:39:16.312000,24,B0005,42,5163,05163.csv,,0.046687,0.076275,0 days 01:43:46.453000,0.002018,0.006818
2,impedance,2008-04-19 02:14:27.015000,24,B0005,44,5165,05165.csv,,0.044843,0.067972,0 days 03:35:10.703000,-0.001844,-0.008303
3,impedance,2008-04-19 03:57:24.187000,24,B0005,46,5167,05167.csv,,0.046195,0.074534,0 days 01:42:57.172000,0.001351,0.006562
4,impedance,2008-04-19 07:32:33.655999,24,B0005,48,5169,05169.csv,,0.045101,0.068528,0 days 03:35:09.468999,-0.001094,-0.006006


#### 2-5 데이터 병합

[1] 1차 데이터 병합

In [22]:
df = pd.concat([charge, discharge, impedance]).sort_values(['battery_id', 'start_time']).reset_index(drop=True)

[2] battery_id별로 그룹화하여 shift 적용
1. 뒤에 impedacne 갚을 앞의 빈칸으로 가져오기 (shift -1)
2. 바로 앞에 있는 impedance 값을 뒤에 빈칸으로 가져오기 (shift 1)


In [23]:
df['start_time'] = pd.to_datetime(df['start_time'])

# 2. Shift를 이용해 Re, Rct 값을 앞뒤 빈칸에 채우기 (사전 작업)
grouped = df.groupby('battery_id')
df['Re'] = df['Re'].fillna(grouped['Re'].shift(-1)) # 뒤에 있는 값을 앞으로
df['Rct'] = df['Rct'].fillna(grouped['Rct'].shift(-1))
df['Re'] = df['Re'].fillna(grouped['Re'].shift(1))  # 앞에 있는 값을 뒤로
df['Rct'] = df['Rct'].fillna(grouped['Rct'].shift(1))

[3] 결측치 확인

In [24]:
df.isna().sum()

type                             0
start_time                       0
ambient_temperature              0
battery_id                       0
test_id                          0
uid                              0
filename                         0
Capacity                      1531
Re                             334
Rct                            334
time_diff                       14
Capacity_diff                 1535
discharge_cnt                 1531
next_discharge_time           1535
time_to_next_discharge_hrs    1535
Re_diff                       1284
Rct_diff                      1284
dtype: int64

[4] 1차 병합 데이터 처리

In [25]:
# 3. 매칭을 위한 데이터프레임 분리
charge = df[df['type'] == 'charge']
discharge = df[df['type'] == 'discharge']

combined_results = []

# 4. 배터리 ID별로 루프 실행
for b_id in discharge['battery_id'].unique():
    b_dis = discharge[discharge['battery_id'] == b_id]
    b_cha = charge[charge['battery_id'] == b_id]
    
    for _, d_row in b_dis.iterrows():
        d_time = d_row['start_time']
        
        # [충전 매칭] 방전 직전의 가장 최근 충전 찾기
        prev_charges = b_cha[b_cha['start_time'] < d_time]
        if prev_charges.empty: continue
            
        last_c = prev_charges.iloc[-1]
        c_time = last_c['start_time']
        gap_hrs = (d_time - c_time).total_seconds() / 3600
        
        # 충전-방전 간격이 24시간 이내인 정상 사이클만 유지
        if gap_hrs > 24: continue
            
        # [수정된 부분] 임피던스 평균을 구하지 않고, shift로 채워진 현재 행의 값을 바로 사용
        re_val = d_row['Re']
        rct_val = d_row['Rct']
            
        # 결과 리스트에 한 줄로 정리
        combined_results.append({
            'battery_id': b_id,
            'discharge_cnt': int(d_row['discharge_cnt']),
            'test_id': d_row['test_id'],
            'ambient_temperature': d_row['ambient_temperature'],
            'capacity': d_row['Capacity'],
            'capacity_diff': d_row.get('Capacity_diff', np.nan),
            'time_to_next_discharge_hrs': d_row['time_to_next_discharge_hrs'],
            're': re_val,   # 현재 시점의 Re 값
            'rct': rct_val, # 현재 시점의 Rct 값
            'charge_time': c_time,
            'discharge_time': d_time,
            'charge_filename': last_c['filename'],
            'discharge_filename': d_row['filename'],
        })

[5] 2차 데이터 프레임 생성하기

In [26]:
final_df = pd.DataFrame(combined_results)

In [27]:
final_df.head()

Unnamed: 0,battery_id,discharge_cnt,test_id,ambient_temperature,capacity,capacity_diff,time_to_next_discharge_hrs,re,rct,charge_time,discharge_time,charge_filename,discharge_filename
0,B0005,1,1,24,1.856487,,,,,2008-04-02 13:08:17.920999,2008-04-02 15:25:41.593000,05121.csv,05122.csv
1,B0005,2,3,24,1.846327,-0.01016,4.301892,,,2008-04-02 16:37:51.984000,2008-04-02 19:43:48.405999,05123.csv,05124.csv
2,B0005,3,5,24,1.835349,-0.010978,4.288411,,,2008-04-02 20:55:40.811999,2008-04-03 00:01:06.687000,05125.csv,05126.csv
3,B0005,4,7,24,1.835263,-8.7e-05,4.258524,,,2008-04-03 01:12:38.670999,2008-04-03 04:16:37.375000,05127.csv,05128.csv
4,B0005,5,9,24,1.834646,-0.000617,4.280091,,,2008-04-03 05:27:49.125000,2008-04-03 08:33:25.702999,05129.csv,05130.csv


### 3. 피처 엔지니어링

- 충전 파생변수 생성
- CC/CV 시간 추출 
- 최고 온도 및 에너지 추출 

In [28]:
base_path = './cleaned_dataset/data' # 데이터 파일 경로

def extract_all_battery_metrics(row):
    """
    충전(Charge) 파일을 읽어 상세 지표(CC/CV 시간, 용량, 에너지, 평균/최고/최소 온도)를 추출
    """
    res = {
        'charge_cc_time': np.nan, 
        'charge_cv_time': np.nan,
        'charge_total_time' : np.nan,
        'charge_cc_capacity': np.nan, 
        'charge_cv_capacity': np.nan,
        'charge_energy': np.nan,    # [추가] 충전 에너지
        'charge_avg_temp': np.nan,  # 충전 시 평균 온도
        'charge_max_temp': np.nan,  # 충전 시 최고 온도
        'charge_min_temp': np.nan   # 충전 시 최저 온도
    }
    
    cha_file = os.path.join(base_path, str(row['charge_filename']))
    if os.path.exists(cha_file):
        try:
            c_df = pd.read_csv(cha_file)
            v_max = c_df['Voltage_measured'].max()
            
            # CC/CV 전환점 판단
            cv_start_idx = c_df.index[c_df['Voltage_measured'] >= v_max * 0.999][0]
            
            # 구간 나누기
            cc_part = c_df.iloc[:cv_start_idx + 1]
            cv_part = c_df.iloc[cv_start_idx:]
            
            # 시간 및 용량 계산
            res['charge_cc_time'] = c_df['Time'].iloc[cv_start_idx] - c_df['Time'].iloc[0]
            res['charge_cv_time'] = c_df['Time'].iloc[-1] - c_df['Time'].iloc[cv_start_idx]
            res['charge_total_time'] = c_df['Time'].iloc[-1] - c_df['Time'].iloc[0]
            res['charge_cc_capacity'] = np.trapz(np.abs(cc_part['Current_measured']), cc_part['Time']) / 3600
            res['charge_cv_capacity'] = np.trapz(np.abs(cv_part['Current_measured']), cv_part['Time']) / 3600
            
            # [추가] 충전 총 에너지 계산 (Wh = V * I 적분 / 3600)
            power = np.abs(c_df['Voltage_measured'] * c_df['Current_measured'])
            res['charge_energy'] = np.trapz(power, c_df['Time']) / 3600
            
            # 온도 지표 추출 (충전 파일 기준)
            res['charge_avg_temp'] = c_df['Temperature_measured'].mean()
            res['charge_max_temp'] = c_df['Temperature_measured'].max()
            res['charge_min_temp'] = c_df['Temperature_measured'].min()
        except Exception: pass
            
    return pd.Series(res)

[1] 파생 변수 데이터 병합하기

In [29]:
charge_features = final_df.apply(extract_all_battery_metrics, axis=1)
charge_features_df = pd.concat([final_df, charge_features], axis=1)
charge_features_df['charge_capacity'] = charge_features_df['charge_cc_capacity'] + charge_features_df['charge_cv_capacity']

In [30]:
charge_features_df.head()

Unnamed: 0,battery_id,discharge_cnt,test_id,ambient_temperature,capacity,capacity_diff,time_to_next_discharge_hrs,re,rct,charge_time,...,charge_cc_time,charge_cv_time,charge_total_time,charge_cc_capacity,charge_cv_capacity,charge_energy,charge_avg_temp,charge_max_temp,charge_min_temp,charge_capacity
0,B0005,1,1,24,1.856487,,,,,2008-04-02 13:08:17.920999,...,,,,,,,,,,
1,B0005,2,3,24,1.846327,-0.01016,4.301892,,,2008-04-02 16:37:51.984000,...,,,,,,,,,,
2,B0005,3,5,24,1.835349,-0.010978,4.288411,,,2008-04-02 20:55:40.811999,...,,,,,,,,,,
3,B0005,4,7,24,1.835263,-8.7e-05,4.258524,,,2008-04-03 01:12:38.670999,...,,,,,,,,,,
4,B0005,5,9,24,1.834646,-0.000617,4.280091,,,2008-04-03 05:27:49.125000,...,,,,,,,,,,


[2] 방전 파생 변수 생성

In [31]:
def extract_discharge_metrics(row):
    """
    방전(Discharge) 파일을 읽어 상세 지표(시간, 최고/최저 온도, 에너지, 실제용량)를 추출
    """
    res = {
        'discharge_energy': np.nan,
        'discharge_capacity': np.nan,
        'discharge_total_time': np.nan,
        'discharge_avg_temp': np.nan,  # 충전 시 평균 온도
        'discharge_max_temp': np.nan,   # 방전 시 최고 온도
        'discharge_min_temp': np.nan,   # 방전 시 최저 온도
    }
    
    dis_file = os.path.join(base_path, str(row['discharge_filename']))
    if os.path.exists(dis_file):
        try:
            d_df = pd.read_csv(dis_file)
            
            # 방전 시간
            res['discharge_total_time'] = d_df['Time'].iloc[-1] - d_df['Time'].iloc[0]
            
            # 온도 지표 추출 (방전 파일 기준)
            res['discharge_avg_temp'] = d_df['Temperature_measured'].mean()
            res['discharge_max_temp'] = d_df['Temperature_measured'].max()
            res['discharge_min_temp'] = d_df['Temperature_measured'].min()
            
            # 에너지 및 용량 계산
            power = np.abs(d_df['Voltage_measured'] * d_df['Current_measured'])
            res['discharge_energy'] = np.trapz(power, d_df['Time']) / 3600
            res['discharge_capacity'] = np.trapz(np.abs(d_df['Current_measured']), d_df['Time']) / 3600
        except Exception: pass
            
    return pd.Series(res)

[3] 파생 변수 데이터 병합 및 중복 컬럼 제거

In [32]:
discharge_features = final_df.apply(extract_discharge_metrics, axis=1)
derived_features_df = pd.concat([charge_features_df, discharge_features], axis=1)

In [33]:
derived_features_df.head()

Unnamed: 0,battery_id,discharge_cnt,test_id,ambient_temperature,capacity,capacity_diff,time_to_next_discharge_hrs,re,rct,charge_time,...,charge_avg_temp,charge_max_temp,charge_min_temp,charge_capacity,discharge_energy,discharge_capacity,discharge_total_time,discharge_avg_temp,discharge_max_temp,discharge_min_temp
0,B0005,1,1,24,1.856487,,,,,2008-04-02 13:08:17.920999,...,,,,,,,,,,
1,B0005,2,3,24,1.846327,-0.01016,4.301892,,,2008-04-02 16:37:51.984000,...,,,,,,,,,,
2,B0005,3,5,24,1.835349,-0.010978,4.288411,,,2008-04-02 20:55:40.811999,...,,,,,,,,,,
3,B0005,4,7,24,1.835263,-8.7e-05,4.258524,,,2008-04-03 01:12:38.670999,...,,,,,,,,,,
4,B0005,5,9,24,1.834646,-0.000617,4.280091,,,2008-04-03 05:27:49.125000,...,,,,,,,,,,


In [34]:
derived_features_df.columns

Index(['battery_id', 'discharge_cnt', 'test_id', 'ambient_temperature',
       'capacity', 'capacity_diff', 'time_to_next_discharge_hrs', 're', 'rct',
       'charge_time', 'discharge_time', 'charge_filename',
       'discharge_filename', 'charge_cc_time', 'charge_cv_time',
       'charge_total_time', 'charge_cc_capacity', 'charge_cv_capacity',
       'charge_energy', 'charge_avg_temp', 'charge_max_temp',
       'charge_min_temp', 'charge_capacity', 'discharge_energy',
       'discharge_capacity', 'discharge_total_time', 'discharge_avg_temp',
       'discharge_max_temp', 'discharge_min_temp'],
      dtype='object')

In [35]:
derived_features_df['R'] = derived_features_df['re'] + derived_features_df['rct'] # R 컬럼 추가
eol_mapping = derived_features_df.groupby('battery_id')['discharge_cnt'].max().to_dict() # EOL 값 찾기
derived_features_df['RUL'] = derived_features_df.apply(lambda row: eol_mapping[row['battery_id']] - row['discharge_cnt']+1, axis=1) # RUL 컬럼 추가

In [36]:
eol_80_info = {}
for b_id in derived_features_df['battery_id'].unique():
    # 해당 배터리 데이터만 추출
    b_df = derived_features_df[derived_features_df['battery_id'] == b_id].sort_values('discharge_cnt')
    
    # 초기 용량 (첫 번째 사이클의 용량)
    initial_cap = b_df['capacity'].iloc[0]
    # 80% 임계값 설정
    threshold_80 = initial_cap * 0.8
    
    # 용량이 처음으로 80% 이하가 되는 지점(EOL) 찾기
    eol_df = b_df[b_df['capacity'] <= threshold_80]
    
    if not eol_df.empty:
        eol_cycle = eol_df['discharge_cnt'].iloc[0]
    else:
        # 만약 데이터 끝까지 80% 아래로 떨어지지 않는 경우 마지막 사이클을 EOL로 가정
        eol_cycle = b_df['discharge_cnt'].max()
        
    eol_80_info[b_id] = eol_cycle

# 2. RUL_80 컬럼 생성 (EOL 사이클 - 현재 사이클)
# 이미 수명이 다한 경우(음수)는 0으로 처리
derived_features_df['RUL_80'] = derived_features_df.apply(
    lambda x: eol_80_info[x['battery_id']] - x['discharge_cnt'], axis=1
)

In [37]:
derived_features_df = derived_features_df.drop(columns=['charge_time', 'discharge_time'])

[4] 저항값 보간

In [38]:
derived_features_df.columns

Index(['battery_id', 'discharge_cnt', 'test_id', 'ambient_temperature',
       'capacity', 'capacity_diff', 'time_to_next_discharge_hrs', 're', 'rct',
       'charge_filename', 'discharge_filename', 'charge_cc_time',
       'charge_cv_time', 'charge_total_time', 'charge_cc_capacity',
       'charge_cv_capacity', 'charge_energy', 'charge_avg_temp',
       'charge_max_temp', 'charge_min_temp', 'charge_capacity',
       'discharge_energy', 'discharge_capacity', 'discharge_total_time',
       'discharge_avg_temp', 'discharge_max_temp', 'discharge_min_temp', 'R',
       'RUL', 'RUL_80'],
      dtype='object')

In [39]:
# (초기 용량)
derived_features_df['initial_capacity'] = derived_features_df.groupby('battery_id')['capacity'].transform('first')

# (용량 감소율)
derived_features_df['capacity_fade_rate'] = (derived_features_df['initial_capacity'] - derived_features_df['capacity']) / derived_features_df['initial_capacity']

# (방전 중 온도 변화량)
derived_features_df['discharge_temp_range'] = derived_features_df['discharge_max_temp'] - derived_features_df['discharge_min_temp']

# (충전 중 온도 변화량)
derived_features_df['charge_temp_range'] = derived_features_df['charge_max_temp'] - derived_features_df['charge_min_temp']

In [40]:
from sklearn.ensemble import RandomForestRegressor

# 1. [단계 1] 배터리(test_id)별 선형 보간 (Linear Interpolation)
# 같은 배터리 내에서 앞뒤 데이터가 있는 중간 결측치를 가장 정확하게 채웁니다.
def battery_linear_interpolation(group):
    # 사이클 순서대로 정렬
    group = group.sort_values('discharge_cnt')
    # 선형 보간 수행 (앞뒤 데이터 활용)
    group['R'] = group['R'].interpolate(method='linear', limit_direction='both')
    return group

print(f"단계 1 시작 전 결측치: {derived_features_df['R'].isnull().sum()}개")
derived_features_df = derived_features_df.groupby('test_id', group_keys=False).apply(battery_linear_interpolation)
print(f"단계 1 완료 후 결측치: {derived_features_df['R'].isnull().sum()}개")

# 2. [단계 2] 머신러닝 기반 잔여 결측치 보간
# 선형 보간으로도 안 채워진 부분(데이터가 통째로 없는 구간 등)을 처리합니다.
target_r = 'R'

# 과적합을 피하기 위해 test_id는 제외하고, 물리적 특성인 initial_capacity와 discharge_cnt를 넣습니다.
other_features = [
    'initial_capacity',      # 배터리 개별 체급(물리적 특성)
    'discharge_avg_temp', 
    'charge_avg_temp',
    'discharge_temp_range',
    'charge_temp_range'
]

# R값이 있는 데이터 (ML 학습용)
train_ml = derived_features_df.dropna(subset=[target_r] + other_features)
X_train = train_ml[other_features]
y_train = train_ml[target_r]

X_train.shape, y_train.shape

단계 1 시작 전 결측치: 163개
단계 1 완료 후 결측치: 130개


((0, 5), (0,))

In [41]:
from sklearn.ensemble import RandomForestRegressor

# 1. [단계 1] 배터리(test_id)별 선형 보간 (Linear Interpolation)
# 같은 배터리 내에서 앞뒤 데이터가 있는 중간 결측치를 가장 정확하게 채웁니다.
def battery_linear_interpolation(group):
    # 사이클 순서대로 정렬
    group = group.sort_values('discharge_cnt')
    # 선형 보간 수행 (앞뒤 데이터 활용)
    group['R'] = group['R'].interpolate(method='linear', limit_direction='both')
    return group

print(f"단계 1 시작 전 결측치: {derived_features_df['R'].isnull().sum()}개")
derived_features_df = derived_features_df.groupby('test_id', group_keys=False).apply(battery_linear_interpolation)
print(f"단계 1 완료 후 결측치: {derived_features_df['R'].isnull().sum()}개")

# 2. [단계 2] 머신러닝 기반 잔여 결측치 보간
# 선형 보간으로도 안 채워진 부분(데이터가 통째로 없는 구간 등)을 처리합니다.
target_r = 'R'

# 과적합을 피하기 위해 test_id는 제외하고, 물리적 특성인 initial_capacity와 discharge_cnt를 넣습니다.
other_features = [
    'initial_capacity',      # 배터리 개별 체급(물리적 특성)
    'discharge_avg_temp', 
    'charge_avg_temp',
    'discharge_temp_range',
    'charge_temp_range'
]

# R값이 있는 데이터 (ML 학습용)
train_ml = derived_features_df.dropna(subset=[target_r] + other_features)
X_train = train_ml[other_features]
y_train = train_ml[target_r]

X_train.shape, y_train.shape

# 여전히 R값이 없는 데이터 (ML 예측 대상)
missing_mask = derived_features_df[target_r].isnull()
X_missing = derived_features_df.loc[missing_mask, other_features]

print(f'학습 데이터 크기: {X_train.shape}')
print(f'테스트 데이터 크기: {X_missing.shape}')

if missing_mask.any():
    print(f"단계 2: ML 보간 시작 ({missing_mask.sum()}개)")
    
    # 모델 정의 (튜터 조언대로 test_id 없이 학습)
    r_model = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    r_model.fit(X_train, y_train)
     
    # 예측 및 채우기 
    predicted_values = r_model.predict(X_missing)
    derived_features_df.loc[missing_mask, target_r] = predicted_values
    print("단계 2 완료: 모든 결측치가 처리되었습니다.")
else:
    print("단계 2 불필요: 모든 결측치가 선형 보간으로 해결되었습니다.")

# 3. 결과 확인 (최종 결측치 체크)
print(f"최종 'R' 결측치 개수: {derived_features_df['R'].isnull().sum()}개")

단계 1 시작 전 결측치: 130개
단계 1 완료 후 결측치: 130개
학습 데이터 크기: (0, 5)
테스트 데이터 크기: (130, 5)
단계 2: ML 보간 시작 (130개)


ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required by RandomForestRegressor.

[5] 전처리 파일 저장

In [None]:
derived_features_df.to_csv('../raw 원본데이터/전처리 완료 폴더/processing_df.csv', index=True)
print('--전처리 완료--')