In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

# from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance

from sklearn.metrics import mean_squared_error

In [3]:
## 獲取當前工作路徑
current_directory = os.getcwd()

## 步驟
1. 將所有特徵一起丟到模型中訓練
2. 刪掉比較不重要的特徵，再次訓練模型，並進行目標時間段的預測


### **1. 將所有特徵一起丟到模型中訓練**

In [76]:
def train(oven_id, p):

    print('oven_id:', oven_id)
    ## 載入資料
    input_path = os.path.join(current_directory, "data/anomaly_with_power_cooler/", f"{oven_id}.csv")
    df_data = pd.read_csv(input_path)
    df_data = df_data.loc[:, ~df_data.columns.str.startswith('Unnamed:')]
    df_data.drop(["oven_id", "lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

    # 時間型態轉變
    df_data['date'] = pd.to_datetime(df_data['date'])

    # 創建一個新列，表示數據集類型（訓練集或測試集）
    if p == 1:
        df_data['dataset_type'] = df_data['date'].apply(lambda x: '訓練集' if x <= datetime(2022, 7, 31) else '測試集')
    else:
        df_data['dataset_type'] = df_data['date'].apply(lambda x: '訓練集' if x <= datetime(2022, 12, 31) else '測試集')
    # 根據數據集類型拆分數據
    train_data = df_data[df_data['dataset_type'] == '訓練集']
    test_data = df_data[df_data['dataset_type'] == '測試集']

    ## 切割訓練集與測試集
    df_data_train = train_data.drop(labels=['dataset_type', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)
    df_data_test = test_data.drop(labels=['dataset_type', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)

    X_train = df_data_train.values
    y_train = train_data['anomaly_total_number'].values

    X_test = df_data_test.values
    y_test = test_data['anomaly_total_number'].values


    ## XGBoost

    # 建立XGBClassifier模型
    xgboostModel = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100)

    # # 創建KFold交叉驗證對象，指定k的值（這裡為5）
    # kfold = KFold(n_splits=5, shuffle=True, random_state=57)

    # 創建一個空列表來存儲每個折疊的性能評估
    mse_scores = []

    # 訓練XGBoost模型
    xgboostModel.fit(X_train, y_train)

    # 在驗證集上進行預測
    y_pred = xgboostModel.predict(X_test)
    y_pred = [round(i) for i in y_pred]  # 四捨五入

    # 計算均方根誤差（RMSE）作為性能指標
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

    # 輸出平均RMSE和標準差
    print(f"Mean RMSE: {np.mean(np.sqrt(mse_scores)):.2f}")

    # 特徵重要重程度
    feature_names = df_data_train.columns
    feature_importances = xgboostModel.feature_importances_

    print(f'\n{"特徵名稱：":35} {"重要程度: ":10}')
    for feature_name, importance in zip(feature_names, feature_importances):
        print(f'{feature_name:35} {importance:10}')

    print('----------------------------------------------------')

    # 合成test&prediction
    test_data.reset_index(inplace=True)
    df_temp = pd.DataFrame({'Answer': y_test, 'Prediction': y_pred, "Gap": y_test-y_pred})
    df_concat = pd.concat([test_data[['date', 'layer_id', 'anomaly_accumulation_hour']], df_temp], axis=1)
    return df_concat

In [77]:
oven_id_lst_p1 = ['1B0', '1C0', '1D0', '1E0', '1G0']
oven_id_lst_p2 = ['2B0', '2C0', '2D0', '2E0', '2G0']

In [78]:
df_dic = {}

In [79]:
for oven_id in oven_id_lst_p1[:-1]:
    df_dic[oven_id] = train(oven_id, 1)

for oven_id in oven_id_lst_p2[:-2]:
    df_dic[oven_id] = train(oven_id, 2)

# 拿八月答案跟八月的預測做比較

result_dic = {"爐": [],
              "8月總故障燈管數": [],
              "8月預測加總結果": [],
              "預測誤差": []}

for key in df_dic:
    result_dic["爐"].append(key)
    ans_sum = df_dic[key]['Answer'].sum()
    pre_sum = df_dic[key]['Prediction'].sum()
    gap_sum = df_dic[key]['Gap'].sum()

    result_dic["8月總故障燈管數"].append(ans_sum)
    result_dic["8月預測加總結果"].append(pre_sum)
    result_dic["預測誤差"].append(gap_sum)

result_df_previous = pd.DataFrame(result_dic)
result_df_previous

oven_id: 1B0
Mean RMSE: 1.00

特徵名稱：                               重要程度:     
layer_id                            0.2670462131500244
anomaly_accumulation_hour           0.08356472849845886
water_volumn                        0.35383233428001404
Temperature_A                       0.13029742240905762
Temperature_B                       0.03656674548983574
power_count                         0.12869250774383545
----------------------------------------------------
oven_id: 1C0
Mean RMSE: 0.67

特徵名稱：                               重要程度:     
layer_id                            0.08937305957078934
anomaly_accumulation_hour           0.10693299770355225
water_volumn                        0.2826194167137146
Temperature_A                       0.06072842702269554
Temperature_B                       0.37577804923057556
power_count                         0.08456801623106003
----------------------------------------------------
oven_id: 1D0
Mean RMSE: 0.91

特徵名稱：                               重要程度

Mean RMSE: 0.53

特徵名稱：                               重要程度:     
layer_id                            0.11034394800662994
anomaly_accumulation_hour           0.19826868176460266
water_volumn                               0.0
Temperature_A                       0.07785608619451523
Temperature_B                       0.3276831805706024
power_count                         0.28584805130958557
----------------------------------------------------
oven_id: 2B0
Mean RMSE: 0.83

特徵名稱：                               重要程度:     
layer_id                            0.07962945103645325
anomaly_accumulation_hour           0.12546700239181519
water_volumn                               0.0
Temperature_A                       0.2794518768787384
Temperature_B                       0.33940181136131287
power_count                         0.1760498285293579
----------------------------------------------------
oven_id: 2C0
Mean RMSE: 0.60

特徵名稱：                               重要程度:     
layer_id                 

Unnamed: 0,爐,8月總故障燈管數,8月預測加總結果,預測誤差
0,1B0,64,55,9
1,1C0,41,37,4
2,1D0,47,50,-3
3,1E0,42,40,2
4,2B0,90,78,12
5,2C0,79,81,-2
6,2D0,80,92,-12


### **2. 刪掉比較不重要的特徵，訓練模型，進行目標時間段的預測**

利用上方的特徵重要程度篩選各爐模型要使用的特徵  

---

ps. 因為1G0、2G0數據不足，因此我們根據敘述統計的箱型圖找出分布和1G0、2G0較相似的爐(1E0)，將三個爐的數據合在一起訓練一個模型

### 1B0 模型

In [80]:
oven_id = '1B0'
print('oven_id:', oven_id)

input_path = os.path.join(current_directory, 'data/anomaly_with_power_cooler', f'{oven_id}.csv')
train_data = pd.read_csv(input_path)

train_data = train_data.loc[:, ~train_data.columns.str.startswith('Unnamed:')]
train_data.drop(["oven_id", "lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df_B = pd.read_csv(input_path_ninth)
df_B = df_B.loc[:, ~df_B.columns.str.startswith('Unnamed:')]

# 要包含在新DataFrame中的column
selected_columns = ['layer_id', 'accumulation_hour', 'water_volum', 'power', 'B_temperature']

# 創建新的DataFrame
test_data = df_B[selected_columns]

test_data.rename(columns={
    'accumulation_hour': 'anomaly_accumulation_hour',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

df_data_train = train_data.drop(labels=['Temperature_A', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)
df_data_test = test_data


X_train = df_data_train.values
y_train = train_data['anomaly_total_number'].values

X_test = df_data_test.values

## XGBoost
xgboostModel_B = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
xgboostModel_B.fit(X_train, y_train)

# 預測
y_pred = xgboostModel_B.predict(X_test)

print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 1B0
有 88 支異常燈管


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.rename(columns={


### 1C0 模型

In [81]:
oven_id = '1C0'
print('oven_id:', oven_id)

## 載入資料
input_path = os.path.join(current_directory, 'data/anomaly_with_power_cooler', f'{oven_id}.csv')
train_data = pd.read_csv(input_path)

train_data = train_data.loc[:, ~train_data.columns.str.startswith('Unnamed:')]
train_data.drop(["oven_id", "lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)

df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

# 要包含在新DataFrame中的column
selected_columns = ['layer_id', 'accumulation_hour', 'water_volum', 'power', 'A_temperature', 'B_temperature']

# 創建新的DataFrame
test_data = df[selected_columns]

test_data.rename(columns={
    'accumulation_hour': 'anomalyaccumulation_hour',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

## 切割訓練集與測試集

df_data_train = train_data.drop(labels=['date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)
df_data_test = test_data


X_train = df_data_train.values
y_train = train_data['anomaly_total_number'].values

X_test = df_data_test.values

## XGBoost
xgboostModel_C = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
xgboostModel_C.fit(X_train, y_train)

# 在驗證集上進行預測
y_pred = xgboostModel_C.predict(X_test)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 1C0
有 63 支異常燈管


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.rename(columns={


### 1D0 模型

In [82]:
oven_id = '1D0'
print('oven_id:', oven_id)

## 載入資料
input_path = os.path.join(current_directory, 'data/anomaly_with_power_cooler', f'{oven_id}.csv')
train_data = pd.read_csv(input_path)

train_data = train_data.loc[:, ~train_data.columns.str.startswith('Unnamed:')]
train_data.drop(["oven_id", "lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)

df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

# 要包含在新DataFrame中的column
selected_columns = ['accumulation_hour', 'power', 'B_temperature']

# 創建新的DataFrame
test_data = df[selected_columns]

test_data.rename(columns={
    'accumulation_hour': 'anomaly_accumulation_hour',
    'B_temperature': 'Temperature_B'
}, inplace=True)

## 切割訓練集與測試集

df_data_train = train_data.drop(labels=['layer_id', 'water_volumn', 'Temperature_A', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)
df_data_test = test_data


X_train = df_data_train.values
y_train = train_data['anomaly_total_number'].values

X_test = df_data_test.values

## XGBoost

xgboostModel_D = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
xgboostModel_D.fit(X_train, y_train)

# 預測
y_pred = xgboostModel_D.predict(X_test)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 1D0
有 75 支異常燈管


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.rename(columns={


### 1E0 模型

In [83]:
oven_id = '1E0'
print('oven_id:', oven_id)

## 載入資料
input_path = os.path.join(current_directory, 'data/anomaly_with_power_cooler', f'{oven_id}.csv')
train_data = pd.read_csv(input_path)

train_data = train_data.loc[:, ~train_data.columns.str.startswith('Unnamed:')]
train_data.drop(["oven_id", "lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)

df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

# 要包含在新DataFrame中的column
selected_columns = ['layer_id', 'accumulation_hour', 'power', 'B_temperature']

# 創建新的DataFrame
test_data = df[selected_columns]

test_data.rename(columns={
    'accumulation_hour': 'anomalyaccumulation_hour',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

df_data_train = train_data.drop(labels=['water_volumn', 'Temperature_A', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)
df_data_test = test_data

X_train = df_data_train.values
y_train = train_data['anomaly_total_number'].values

X_test = df_data_test.values

## XGBoost

xgboostModel_E = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)

# 訓練
xgboostModel_E.fit(X_train, y_train)

# 預測
y_pred = xgboostModel_E.predict(X_test)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 1E0
有 69 支異常燈管


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.rename(columns={


### 2B0 模型

In [84]:
oven_id = '2B0'
print('oven_id:', oven_id)

## 載入資料
input_path = os.path.join(current_directory, 'data/anomaly_with_power_cooler', f'{oven_id}.csv')
train_data = pd.read_csv(input_path)

train_data = train_data.loc[:, ~train_data.columns.str.startswith('Unnamed:')]
train_data.drop(["oven_id", "lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

# 要包含在新DataFrame中的column
selected_columns = ['layer_id', 'accumulation_hour', 'power', 'B_temperature']

# 創建新的DataFrame
test_data = df[selected_columns]

test_data.rename(columns={
    'accumulation_hour': 'anomaly_accumulation_hour',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

df_data_train = train_data.drop(labels=['water_volumn', 'Temperature_A', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)
df_data_test = test_data

X_train = df_data_train.values
y_train = train_data['anomaly_total_number'].values

X_test = df_data_test.values

## XGBoost

xgboostModel_2B = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
xgboostModel_2B.fit(X_train, y_train)

# 預測
y_pred = xgboostModel_2B.predict(X_test)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 2B0
有 79 支異常燈管


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.rename(columns={


### 2C0 模型

In [85]:
oven_id = '2C0'
print('oven_id:', oven_id)

## 載入資料
input_path = os.path.join(current_directory, 'data/anomaly_with_power_cooler', f'{oven_id}.csv')
train_data = pd.read_csv(input_path)

train_data = train_data.loc[:, ~train_data.columns.str.startswith('Unnamed:')]
train_data.drop(["oven_id", "lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

# 要包含在新DataFrame中的column
selected_columns = ['layer_id', 'accumulation_hour', 'water_volum', 'power', 'A_temperature', 'B_temperature']

# 創建新的DataFrame
test_data = df[selected_columns]

test_data.rename(columns={
    'accumulation_hour': 'anomaly_accumulation_hourr',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

df_data_train = train_data.drop(labels=['date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)
df_data_test = test_data

X_train = df_data_train.values
y_train = train_data['anomaly_total_number'].values

X_test = df_data_test.values

## XGBoost

xgboostModel_2C = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
xgboostModel_2C.fit(X_train, y_train)

# 預測
y_pred = xgboostModel_2C.predict(X_test)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 2C0
有 66 支異常燈管


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.rename(columns={


### 2D0 模型

In [86]:
oven_id = '2D0'
print('oven_id:', oven_id)

## 載入資料
input_path = os.path.join(current_directory, 'data/anomaly_with_power_cooler', f'{oven_id}.csv')
train_data = pd.read_csv(input_path)

train_data = train_data.loc[:, ~train_data.columns.str.startswith('Unnamed:')]
train_data.drop(["oven_id", "lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

# 要包含在新DataFrame中的column
selected_columns = ['layer_id', 'water_volum', 'power', 'A_temperature', 'B_temperature']

# 創建新的DataFrame
test_data = df[selected_columns]

test_data.rename(columns={
    'accumulation_hour': 'anomaly_accumulation_hour',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

df_data_train = train_data.drop(labels=['anomaly_accumulation_hour', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)
df_data_test = test_data

X_train = df_data_train.values
y_train = train_data['anomaly_total_number'].values

X_test = df_data_test.values

## XGBoost
xgboostModel_2D = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
xgboostModel_2D.fit(X_train, y_train)

# 預測
y_pred = xgboostModel_2D.predict(X_test)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 2D0
有 79 支異常燈管


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.rename(columns={


### 2E0

In [87]:
oven_id = '2E0'
print('oven_id:', oven_id)

## 載入資料
input_path = os.path.join(current_directory, 'data/anomaly_with_power_cooler', f'{oven_id}.csv')
train_data = pd.read_csv(input_path)

train_data = train_data.loc[:, ~train_data.columns.str.startswith('Unnamed:')]
train_data.drop(["oven_id", "lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

# 要包含在新DataFrame中的column
selected_columns = ['accumulation_hour', 'power', 'A_temperature', 'B_temperature']

# 創建新的DataFrame
test_data = df[selected_columns]

test_data.rename(columns={
    'accumulation_hour': 'anomaly_accumulation_hour',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

df_data_train = train_data.drop(labels=['water_volumn', 'anomaly_accumulation_hour', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'],axis=1)
df_data_test = test_data

X_train = df_data_train.values
y_train = train_data['anomaly_total_number'].values

X_test = df_data_test.values

## XGBoost
xgboostModel_2E = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
xgboostModel_2E.fit(X_train, y_train)

# 預測
y_pred = xgboostModel_2E.predict(X_test)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 2E0
有 61 支異常燈管


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.rename(columns={


### 1E0, 1G0, 2G0 模型

In [5]:
concat = ["1G0", "2G0", "1E0"]

In [6]:
# 將三個爐的資料合起來
df_lst = []
for oven_id in concat:
    input_path = os.path.join(current_directory, 'data/anomaly_with_power_cooler', f'{oven_id}.csv')
    df = pd.read_csv(input_path)
    df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
    df.drop(["lamp_id", 'lamp_special_rartio'], axis=1, inplace=True)

    if oven_id == '1E0' or oven_id == '2E0':
        df['oven_id'] = oven_id

    df_lst.append(df)

df_data = pd.concat(df_lst)

# 名目資料編碼
df_data = pd.get_dummies(df_data, columns=['oven_id'])

In [7]:
# 時間型態轉變
df_data['date'] = pd.to_datetime(df_data['date'])

# 創建一個新列，表示數據集類型（訓練集或測試集）
df_data['dataset_type'] = df_data['date'].apply(lambda x: '訓練集' if x <= datetime(2022, 7, 31) else '測試集')

# 根據數據集類型拆分數據
train_data = df_data[df_data['dataset_type'] == '訓練集']
test_data = df_data[df_data['dataset_type'] == '測試集']

## 切割訓練集與測試集
df_data_train = train_data.drop(labels=['dataset_type', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'], axis=1)
df_data_test = test_data.drop(labels=['dataset_type', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'], axis=1)

X_train = df_data_train.values
y_train = train_data['anomaly_total_number'].values

X_test = df_data_test.values
y_test = test_data['anomaly_total_number'].values


## XGBoost
xgboostModel = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=15)
xgboostModel.fit(X_train, y_train)

# 在驗證集上進行預測
y_pred = xgboostModel.predict(X_test)

# 計算均方根誤差（RMSE）作為性能指標
mse = mean_squared_error(y_test, y_pred)

# 輸出平均RMSE和標準差
print(f"Mean RMSE: {np.mean(np.sqrt(mse)):.2f}")

# 特徵重要重程度
feature_names = df_data_train.columns
feature_importances = xgboostModel.feature_importances_

print(f'\n{"特徵名稱：":35} {"重要程度: ":10}')
for feature_name, importance in zip(feature_names, feature_importances):
    print(f'{feature_name:35} {importance:10}')

Mean RMSE: 0.51

特徵名稱：                               重要程度:     
layer_id                            0.1292092502117157
anomaly_accumulation_hour           0.23358619213104248
Temperature_A                       0.08232778310775757
Temperature_B                       0.30545175075531006
power_count                         0.24942506849765778
oven_id_1E0                                0.0
oven_id_1G0                                0.0
oven_id_2G0                                0.0


* 經過特徵挑選後的模型RMSE並沒有變小，因此特徵不做刪減
* n_estimators=15 時 RMSE 最小

**最終模型**

In [91]:
df_data_train = df_data.drop(labels=['dataset_type', 'date', 'anomaly_total_number', 'power(other)', 'power(lamp_1_2_60_61_62_63_121_122)'], axis=1)

X_train = df_data_train.values
y_train = df_data['anomaly_total_number'].values

## 訓練模型
xgboostModel = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=15)
xgboostModel.fit(X_train, y_train)

In [92]:
oven_id = '1E0'
print('oven_id:', oven_id)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
df['oven_id_1E0'] = 1
df['oven_id_1G0'] = 0
df['oven_id_2G0'] = 0
df.drop(['oven_id', 'date', 'time_interval', 'accumulation_diff'], axis=1, inplace=True)

df.rename(columns={
    'accumulation_hour': 'anomaly_accumulation_hour',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

# 預測
y_pred = xgboostModel.predict(df.values)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 1E0
有 92 支異常燈管


In [93]:
oven_id = '1G0'
print('oven_id:', oven_id)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
# df = pd.get_dummies(df, columns=['oven_id'])
df['oven_id_1E0'] = 0
df['oven_id_1G0'] = 1
df['oven_id_2G0'] = 0
df.drop(['oven_id', 'date', 'time_interval', 'accumulation_diff'], axis=1, inplace=True)

df.rename(columns={
    'accumulation_hour': 'anomaly_accumulation_hour',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

# 預測
y_pred = xgboostModel.predict(df.values)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 1G0
有 38 支異常燈管


In [94]:
oven_id = '2G0'
print('oven_id:', oven_id)

# ----------------------------------- 預測數據讀取&處理-------------------------------------------------------------------
input_path_ninth = os.path.join(current_directory, 'data/acc_hour_prediction_withpc', f'{oven_id}.csv')
df = pd.read_csv(input_path_ninth)
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

df['oven_id_1E0'] = 0
df['oven_id_2G0'] = 0
df['oven_id_2G0'] = 1
df.drop(['oven_id', 'date', 'time_interval', 'accumulation_diff'], axis=1, inplace=True)

df.rename(columns={
    'accumulation_hour': 'anomaly_accumulation_hour',
    'water_volum': 'water_volumn',
    'A_temperature': 'Temperature_A',
    'B_temperature': 'Temperature_B'
}, inplace=True)

# 預測
y_pred = xgboostModel.predict(df.values)
print('有', round(sum(y_pred)), '支異常燈管')

oven_id: 2G0


FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\cherr\\OneDrive\\桌面\\初賽繳交檔案\\112094_Source\\data/acc_hour_prediction_withpc\\2G0.csv'