## **0. Library**

In [1]:
import os
import sys
import warnings
import pandas as pd
import numpy as np
from random import randrange
from datetime import date, time, datetime
import math
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

In [2]:
# Uncomment to lift pandas' row / column display limits while exploring.
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

# Plot font settings: use the 'AppleGothic' family and disable the Unicode
# minus glyph for negative tick labels.
# NOTE(review): 'AppleGothic' is presumably only installed on macOS -- confirm
# which font should be used when the notebook runs on another platform.
plt.rcParams.update({
    'font.family': 'AppleGothic',
    'axes.unicode_minus': False,
})

# Silence FutureWarning first, then every other warning category as well.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

## **1. Data**

In [3]:
train_path = './Dataset/data/raw_data/train/'
test_path = './Dataset/data/raw_data/test/'


def _load_labeled_csvs(dir_path, with_fault=False):
    """Read every CSV file in ``dir_path`` and stack them into one DataFrame.

    Labels are derived from each file name:

    * ``chg``   -- 'dchg' (discharging) when the name contains 'dchg',
                   otherwise 'chg' (charging).
    * ``fault`` -- only when ``with_fault`` is True: 'OK' (good unit) when the
                   name contains 'OK', otherwise 'NG' (defective unit).

    Parameters
    ----------
    dir_path : str
        Directory holding the raw CSV files.
    with_fault : bool, default False
        Whether to add the ``fault`` label column.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise. Each file keeps its own row index
        (no re-indexing). An empty DataFrame is returned when the directory
        contains no readable file.
    """
    frames = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across machines and runs.
    for file in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, file)
        # Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and
        # sub-directories, which read_csv cannot parse as data files.
        if file.startswith('.') or not os.path.isfile(file_path):
            continue

        df = pd.read_csv(file_path)
        df['chg'] = 'dchg' if 'dchg' in file else 'chg'
        if with_fault:
            df['fault'] = 'OK' if 'OK' in file else 'NG'
        frames.append(df)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop, which
    # re-copies all previously loaded rows on every iteration.
    return pd.concat(frames)


train_df = _load_labeled_csvs(train_path)
test_df = _load_labeled_csvs(test_path, with_fault=True)

In [4]:
train_df.shape, test_df.shape

((482639, 232), (23392, 233))

In [5]:
train_df.head(3)

Unnamed: 0,Date,Time,SerialNumber,Voltage,Current,RSOCmin,RSOCmax,RSOCavg,USOCmin,USOCmax,...,M12T02,M13T01,M13T02,M14T01,M14T02,M15T01,M15T02,M16T01,M16T02,chg
0,2020-08-27,16:58:25,1013.0,712.6,0.0,89.02,89.71,89.56,99.0,100.0,...,33.9,34.0,34.2,33.8,33.9,34.1,34.2,33.5,33.6,dchg
1,2020-08-27,16:58:26,1013.0,712.6,0.0,89.02,89.71,89.56,99.0,100.0,...,33.9,34.0,34.2,33.8,33.9,34.1,34.2,33.5,33.6,dchg
2,2020-08-27,16:58:27,1013.0,712.6,0.0,89.02,89.71,89.56,99.0,100.0,...,33.9,34.0,34.2,33.7,33.9,34.1,34.2,33.5,33.6,dchg


In [6]:
test_df.head(3)

Unnamed: 0,Date,Time,SerialNumber,Voltage,Current,RSOCmin,RSOCmax,RSOCavg,USOCmin,USOCmax,...,M13T01,M13T02,M14T01,M14T02,M15T01,M15T02,M16T01,M16T02,chg,fault
0,2021-09-01,10:18:23,578,0,0,0,0,0,0,0,...,24.4,24.6,24.8,24.7,24.4,24.4,24.8,24.8,chg,NG
1,2021-09-01,10:18:24,578,0,0,0,0,0,0,0,...,24.4,24.6,24.8,24.7,24.4,24.4,24.8,24.8,chg,NG
2,2021-09-01,10:18:25,578,0,0,0,0,0,0,0,...,24.4,24.6,24.8,24.7,24.5,24.4,24.8,24.8,chg,NG


## **2. Preprocessing**

#### **2.1 Data Types**

In [7]:
train_df.dtypes

Date             object
Time             object
SerialNumber    float64
Voltage         float64
Current         float64
                 ...   
M15T01          float64
M15T02          float64
M16T01          float64
M16T02          float64
chg              object
Length: 232, dtype: object

In [8]:
test_df.dtypes

Date             object
Time             object
SerialNumber      int64
Voltage           int64
Current           int64
                 ...   
M15T02          float64
M16T01          float64
M16T02          float64
chg              object
fault            object
Length: 233, dtype: object

In [9]:
# Convert the textual Date / Time columns of both frames to datetime64.
#
# The Time strings carry no date (e.g. "16:58:25"), so the flexible parser
# fills in the date of the day the notebook is executed. That makes the Time
# column differ from run to run. To keep it reproducible, only the parsed
# time-of-day is kept and re-attached to a fixed anchor date; the column
# remains datetime64[ns] and missing values stay NaT.
_TIME_ANCHOR = pd.Timestamp('1900-01-01')

for frame in (train_df, test_df):
    frame['Date'] = pd.to_datetime(frame['Date'])

    parsed_time = pd.to_datetime(frame['Time'])
    # Subtracting the midnight-normalized value leaves just the time-of-day.
    time_of_day = parsed_time - parsed_time.dt.normalize()
    frame['Time'] = _TIME_ANCHOR + time_of_day

In [10]:
train_df.dtypes

Date            datetime64[ns]
Time            datetime64[ns]
SerialNumber           float64
Voltage                float64
Current                float64
                     ...      
M15T01                 float64
M15T02                 float64
M16T01                 float64
M16T02                 float64
chg                     object
Length: 232, dtype: object

In [11]:
test_df.dtypes

Date            datetime64[ns]
Time            datetime64[ns]
SerialNumber             int64
Voltage                  int64
Current                  int64
                     ...      
M15T02                 float64
M16T01                 float64
M16T02                 float64
chg                     object
fault                   object
Length: 233, dtype: object

#### **2.2 Missing Values**

- `train_df`에는 컬럼별로 1690~1695개의 결측치가 있음
    - 그러나 결측치가 있는 행은 전체 데이터의 약 0.35%(1,695행)에 불과하므로 제거

In [12]:
# Largest per-column count of missing values, expressed as a share of all rows.
print(train_df.isna().sum().max() / len(train_df))

0.003511941637538616


In [13]:
train_df.isnull().sum()

Date            1691
Time            1691
SerialNumber    1691
Voltage         1690
Current         1691
                ... 
M15T01          1695
M15T02          1695
M16T01          1695
M16T02          1695
chg                0
Length: 232, dtype: int64

In [14]:
# Remove every training row that holds at least one missing value (in place).
train_df.dropna(axis='index', inplace=True)
# Number of test columns that contain any missing value.
print(test_df.isna().any().sum())

0


- `test_df`에는 결측치가 없음

In [15]:
sum(test_df.isnull().sum() > 0)

0

#### **2.3 Duplicates**

- `train_df`와 `test_df` 모두 중복된 행이 존재하지 않음

In [16]:
# Number of fully duplicated rows, first for the train frame, then for the test frame.
for frame in (train_df, test_df):
    print(frame.duplicated().sum())

0
0


#### **2.4 Outliers**

- IQR을 이용하여 이상치 제거: 수치형 컬럼 중 하나라도 `Q1 - c·IQR`보다 작거나 `Q3 + c·IQR`보다 큰 값을 가진 행을 제거 (여기서는 `c = 4`)

In [17]:
def remove_outliers(df, c):
    """Drop rows that contain an IQR outlier in any numeric column.

    For each numeric column the first and third quartiles (Q1, Q3) and the
    inter-quartile range IQR = Q3 - Q1 are computed. A value counts as an
    outlier when it is strictly below ``Q1 - c * IQR`` or strictly above
    ``Q3 + c * IQR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are ignored for the check but kept
        in the result.
    c : float
        Multiplier applied to the IQR to set the fence width.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` without an outlier in any numeric column, with all
        original columns and index labels.

    Notes
    -----
    Comparisons against NaN are False, so a missing value never marks its row
    as an outlier. When a column's IQR is 0, every value different from its
    quartiles is treated as an outlier regardless of ``c``.
    """
    numeric = df.select_dtypes(include='number')

    lower_quartile = numeric.quantile(0.25)
    upper_quartile = numeric.quantile(0.75)
    spread = upper_quartile - lower_quartile

    # Per-column fences beyond which a value is considered an outlier.
    lower_fence = lower_quartile - c * spread
    upper_fence = upper_quartile + c * spread

    below = numeric.lt(lower_fence)
    above = numeric.gt(upper_fence)
    # A row is flagged as soon as one of its numeric cells is out of bounds.
    row_has_outlier = (below | above).any(axis=1)

    return df[~row_has_outlier]

In [18]:
# Report the training-set shape before ('전') and after ('후') dropping rows
# that fall outside the 4 x IQR fences in any numeric column.
# Re-running this cell filters the already-filtered frame again, because the
# result is assigned back to train_df.
shape_before = train_df.shape
print('전:', shape_before)

train_df = remove_outliers(train_df, c=4)
print('후:', train_df.shape)

전: (480944, 232)
후: (436295, 232)
