In [17]:
import numpy as np
import pandas as pd
import os
import torch

In [18]:
# Load every CSV file found in train/ into a single DataFrame.
# - The directory listing is sorted because os.listdir() returns entries in
#   arbitrary order; a stable row order keeps the later seeded train/validation
#   split reproducible across machines.
# - The per-file frames are collected first and concatenated once. Growing `df`
#   with pd.concat inside the loop re-copies every accumulated row for each file.
csv_list = sorted(os.listdir("train"))

frames = [pd.read_csv(f"train/{file}") for file in csv_list if file.endswith(".csv")]

# Same fallback as before: an empty frame when the directory holds no CSV file.
df = pd.concat(frames) if frames else pd.DataFrame()

# Preview the first rows only instead of rendering the whole frame.
df.head()

KeyboardInterrupt: 

In [3]:
from sklearn.preprocessing import StandardScaler
import category_encoders as ce


# Parse the timestamp column into pandas datetimes.
df['DateTime'] = pd.to_datetime(df['DateTime'])

# Columns that are standardised with StandardScaler.
columns_to_standardize = [
    'WindSpeed(m/s)',
    'Pressure(hpa)',
    'Temperature(°C)',
    'Humidity(%)',
    'Sunlight(Lux)',
]

# Fit the scaler on those columns, then overwrite them with the scaled values.
# NOTE(review): the scaler is fit on the whole frame, i.e. before the
# train/validation split performed later in the notebook, so validation rows
# contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(df[columns_to_standardize])
df[columns_to_standardize] = scaler.transform(df[columns_to_standardize])

In [4]:
def mean_10min(df):
    """Average the readings of every location over 10-minute windows.

    Args:
        df: DataFrame with a datetime-typed 'DateTime' column, a
            'LocationCode' column and the columns to average. The frame is
            not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Locations appear in order
        of first occurrence in ``df``; each contributes one row per 10-minute
        window, with 'DateTime' holding the window start. Windows containing
        a NaN in any averaged column (which includes windows with no
        readings) are dropped. An empty DataFrame is returned when ``df``
        has no rows.
    """
    frames = []

    for code in df["LocationCode"].unique():
        # Chained, non-inplace pipeline: the filtered slice is never mutated.
        averaged = (
            df.loc[df["LocationCode"] == code]
            .set_index("DateTime")
            .resample("10min")
            .mean()
            .dropna()
            .reset_index()
        )
        # The resample averaged the code column as well; restore the exact value.
        averaged["LocationCode"] = code
        frames.append(averaged)

    # pd.concat raises on an empty list, so keep the empty-input result explicit.
    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of re-copying the result for every location.
    return pd.concat(frames, ignore_index=True)

# Replace the per-reading frame with its per-location 10-minute averages.
df = mean_10min(df)

In [5]:
# Every column except the timestamp, the target and the location code is a feature.
feature_frame = df.drop(columns=['DateTime', 'Power(mW)', 'LocationCode'])
target_series = df['Power(mW)']

# Convert features and target into independent NumPy arrays.
data = np.array(feature_frame)
label = np.array(target_series)

# Show both shapes as a quick sanity check.
(data.shape, label.shape)

((131755, 5), (131755,))

In [6]:
def score_validate(predicts, labels):
    """Return the total absolute error between predictions and targets.

    Both inputs are converted to NumPy arrays first, so plain lists/tuples
    are accepted and pandas objects are compared position by position rather
    than being aligned on their index labels.

    Args:
        predicts: Array-like of predicted values.
        labels: Array-like of true values, broadcast-compatible with
            ``predicts``.

    Returns:
        The sum of ``|predicts - labels|`` over all elements.
    """
    predicts = np.asarray(predicts)
    labels = np.asarray(labels)
    return np.abs(predicts - labels).sum()



In [7]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score

# 1. Data
# The feature matrix `data` and the target vector `label` were built in an
# earlier cell of this notebook.

# 2. Hold out 10% of the rows for validation (fixed seed for reproducibility).
x_train, x_val, y_train, y_val = train_test_split(data, label, test_size=0.1, random_state=42)

# 3. Configure the XGBoost regressor.
# Early stopping is configured on the estimator: passing `early_stopping_rounds`
# to fit() was deprecated in XGBoost 1.6 and removed in 2.0, where it raises a
# TypeError.
model = xgb.XGBRegressor(
    objective='reg:squarederror',  # squared-error regression loss
    eval_metric='rmse',            # metric monitored on the evaluation set
    n_estimators=1000,             # upper bound on the number of trees
    learning_rate=0.01,            # shrinkage applied to every tree
    max_depth=10,                  # maximum depth of each tree
    subsample=0.8,                 # fraction of rows sampled per tree
    colsample_bytree=0.8,          # fraction of features sampled per tree
    random_state=42,               # fixed random seed
    early_stopping_rounds=10,      # stop after 10 rounds without RMSE improvement
)

# 4. Train, monitoring RMSE on the validation split for early stopping.
model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)

# 5. Predict on the validation split.
y_pred = model.predict(x_val)

# 6. Evaluate.
# NOTE(review): the same validation split also drove early stopping, so these
# figures are optimistic; a separate test split would give an unbiased estimate.
rmse = root_mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")
print(f"score:{score_validate(y_pred, y_val)}")



RMSE: 103.52556576428091
R^2 Score: 0.9495557165193163
score:525299.8624599935


In [16]:
# Mean absolute error on the validation split: the total absolute error
# divided by the number of validation rows.
y_pred = model.predict(x_val)
mean_abs_error = score_validate(y_pred, y_val) / len(x_val)
print(mean_abs_error)

39.86793127352713


In [9]:
# Largest averaged power value. Series.max() is vectorised and skips NaN,
# whereas the builtin max() iterates element by element in Python and gives
# order-dependent results when NaN is present.
df['Power(mW)'].max()

2272.618

In [10]:
# Total number of feature values in the validation split:
# the length of each row of x_val, summed over all rows.
data_num = sum(len(row) for row in x_val)
data_num

65880