## 清理訓練數據集

In [1]:
import os

import pandas
# Raw GitHub URL of the training CSV
data_url = 'https://raw.githubusercontent.com/ywchiu/HKPC/master/data/train.csv'
df_train = pandas.read_csv(filepath_or_buffer=data_url)

In [2]:
# Time-to-failure: cycles remaining until the value stored in last_cycle
df_train['ttf'] = df_train['last_cycle'] - df_train['cycle']

# Binary label: 1 when the engine fails within 30 cycles, otherwise 0.
# Vectorized comparison instead of a row-by-row Python lambda.
df_train['label_bnc'] = (df_train['ttf'] <= 30).astype('int64')

# Multi-class label: 2 when failure is within 15 cycles, 1 when it is more
# than 15 but at most 30 cycles away, 0 otherwise. The two indicators are
# nested (ttf <= 15 implies ttf <= 30), so their sum yields exactly 0/1/2.
df_train['label_mcc'] = (
    (df_train['ttf'] <= 30).astype('int64')
    + (df_train['ttf'] <= 30 / 2).astype('int64')
)

In [3]:
# Summary statistics of the cycle column in the training data
df_train['cycle'].describe()

count    20631.000000
mean       108.807862
std         68.880990
min          1.000000
25%         52.000000
50%        104.000000
75%        156.000000
max        362.000000
Name: cycle, dtype: float64

In [4]:
# Check, row by row, whether the cycle column has missing values
df_train['cycle'].isna()

0        False
1        False
2        False
3        False
4        False
         ...  
20626    False
20627    False
20628    False
20629    False
20630    False
Name: cycle, Length: 20631, dtype: bool

In [5]:
# Total number of missing values in the cycle column
df_train['cycle'].isna().sum()

0

In [6]:
# Number of missing values in each column of the DataFrame
df_train.isna().sum()

id            0
cycle         0
setting1      0
setting2      0
setting3      0
s1            0
s2            0
s3            0
s4            0
s5            0
s6            0
s7            0
s8            0
s9            0
s10           0
s11           0
s12           0
s13           0
s14           0
s15           0
s16           0
s17           0
s18           0
s19           0
s20           0
s21           0
last_cycle    0
ttf           0
label_bnc     0
label_mcc     0
dtype: int64

In [7]:
# 5-row rolling mean of sensor s12 over the whole frame (not grouped by id);
# the first 4 rows are NaN because the window is not yet full.
df_train['s12'].rolling(window = 5).mean()

0            NaN
1            NaN
2            NaN
3            NaN
4        522.282
          ...   
20626    519.890
20627    519.708
20628    519.688
20629    519.712
20630    519.630
Name: s12, Length: 20631, dtype: float64

In [8]:
# 5-row rolling standard deviation of sensor s12 over the whole frame
# (not grouped by id); the first 4 rows are NaN because the window is not yet full.
df_train['s12'].rolling(window = 5).std()

0             NaN
1             NaN
2             NaN
3             NaN
4        0.432574
           ...   
20626    0.460000
20627    0.242322
20628    0.201544
20629    0.187670
20630    0.263154
Name: s12, Length: 20631, dtype: float64

In [9]:
def add_features(df_in, rolling_win_size):
    """Append per-engine rolling mean and standard deviation of every sensor.

    Rows are grouped by the ``id`` column, so a rolling window never mixes
    readings that belong to different ids.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain an ``id`` column and the sensor columns ``s1`` .. ``s21``.
    rolling_win_size : int
        Number of consecutive rows in each rolling window. Windows at the
        start of a group are allowed to be shorter (``min_periods=1``).

    Returns
    -------
    pandas.DataFrame
        All columns of ``df_in`` followed by ``av1`` .. ``av21`` (rolling
        mean) and ``sd1`` .. ``sd21`` (rolling standard deviation). Groups
        appear in order of first occurrence of their ``id`` and rows keep
        their original index labels. An empty DataFrame is returned when
        ``df_in`` has no rows.
    """
    sensor_ids = range(1, 22)
    sensor_cols = ['s' + str(i) for i in sensor_ids]
    sensor_av_cols = ['av' + str(i) for i in sensor_ids]
    sensor_sd_cols = ['sd' + str(i) for i in sensor_ids]

    # Collect the per-engine frames and concatenate once at the end;
    # concatenating inside the loop re-copies the accumulated frame each time.
    frames = []

    # sort=False keeps groups in order of first appearance of each id.
    for _, df_engine in df_in.groupby('id', sort=False):
        windows = df_engine[sensor_cols].rolling(rolling_win_size, min_periods=1)

        # rolling mean
        av = windows.mean()
        av.columns = sensor_av_cols

        # rolling std; NaN results (e.g. a one-row window) are replaced with 0
        sd = windows.std().fillna(0)
        sd.columns = sensor_sd_cols

        frames.append(pandas.concat([df_engine, av, sd], axis=1))

    if not frames:
        return pandas.DataFrame()

    return pandas.concat(frames)

In [10]:
# Add rolling mean/std sensor features with a window of 5 rows per id
df_train_ex = add_features(df_in=df_train, rolling_win_size=5)

In [11]:
# Preview the first rows of the frame with the added rolling features
df_train_ex.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,sd12,sd13,sd14,sd15,sd16,sd17,sd18,sd19,sd20,sd21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,0.438406,0.035355,5.041671,0.008697,0.0,0.0,0.0,0.0,0.042426,0.003253
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,0.404475,0.026458,3.71745,0.00764,0.0,1.154701,0.0,0.0,0.055076,0.044573
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,0.49595,0.029439,3.050906,0.028117,0.0,1.0,0.0,0.0,0.076322,0.037977
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,0.432574,0.025884,2.651326,0.025953,0.0,1.095445,0.0,0.0,0.073621,0.033498


In [12]:
# Create the output folder first so a fresh run does not fail on a missing
# directory, then write the frame without the row index.
os.makedirs('data', exist_ok=True)
df_train_ex.to_csv('data/pdm_train.csv', index=False)

## 清理測試數據集

In [13]:
import pandas
# Raw GitHub URL of the test CSV
data_url = 'https://raw.githubusercontent.com/ywchiu/HKPC/master/data/test.csv'
df_test = pandas.read_csv(filepath_or_buffer=data_url)

In [14]:
# 建立 Time-To-Failure 標籤
df_test['ttf'] = df_test['last_cycle'] - df_test['cycle']

# 建立引擎是否在30 個循環內故障的標籤
df_test['label_bnc'] = df_test['ttf'].apply(lambda x: 1 if x <= 30 else 0)

# 建立引擎是否在15 ~30 個循環內故障或15 個循環內故障的標籤
df_test['label_mcc'] = df_test['ttf'].apply(lambda x: 2 if x <= 30/2 else 1 if x <= 30 else 0)

## 小任務

請仿照準備訓練數據集的方式檢視數據

- 使用 describe 檢視測試數據集 cycle 的敘述性統計
- 使用 isna 檢視測試數據集的 cycle 缺失值 (Missing Value) 的總數
- 檢查 df_test DataFrame 各欄位的缺失值數
- 使用 add_features函數 幫 df_test 增加區間5 的平均值與標準差，並將結果存到 df_test_ex 中
- 請將 df_test_ex 存至 pdm_test.csv 中

In [1]:
# 使用 describe 檢視測試數據集 cycle 的敘述性統計

In [2]:
# 使用 isna 檢視測試數據集的 cycle 缺失值 (Missing Value) 的總數

In [3]:
# 檢查 df_test DataFrame 各欄位的缺失值數

In [19]:
# 使用 add_features函數 幫 df_test 增加區間5 的平均值與標準差，並將結果存到 df_test_ex 中

In [21]:
# 請將 df_test_ex 存至 pdm_test.csv 中