In [1]:
import numpy as np
import pandas as pd
import os
import torch

In [2]:
# Load every CSV under train/ into a single DataFrame.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; a fixed order keeps the row order (and therefore the seeded
# train/validation split performed later) reproducible across machines.
train_dir = "train"
csv_list = sorted(os.listdir(train_dir))

# Read each file once and concatenate a single time; calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration.
frames = [
    pd.read_csv(os.path.join(train_dir, file))
    for file in csv_list
    if file.endswith(".csv")
]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no CSV files (same result the incremental version produced).
# The per-file row index is kept as-is, so index values repeat across files.
df = pd.concat(frames) if frames else pd.DataFrame()

df

Unnamed: 0,LocationCode,DateTime,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW)
0,10,2024-03-01 17:14:06.000,0.0,1017.48,15.59,94.30,652.92,0.12
1,10,2024-03-01 17:14:47.000,0.0,1017.48,15.66,94.04,682.50,0.12
2,10,2024-03-01 17:15:47.000,0.0,1017.47,15.74,94.10,750.00,0.14
3,10,2024-03-01 17:16:47.000,0.0,1017.46,15.78,94.09,738.33,0.14
4,10,2024-03-01 17:17:47.000,0.0,1017.49,15.80,94.08,660.83,0.12
...,...,...,...,...,...,...,...,...
89607,9,2024-07-23 15:50:57.000,0.0,994.54,30.69,72.91,2288.33,1.10
89608,9,2024-07-23 15:51:57.000,0.0,994.40,30.27,73.16,3236.67,1.92
89609,9,2024-07-23 15:52:57.000,0.0,994.39,29.90,72.51,4526.67,3.57
89610,9,2024-07-23 15:53:57.000,0.0,994.40,29.38,73.23,4231.67,3.13


In [3]:
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

# Replace the categorical LocationCode with a numeric encoding derived from
# the Power(mW) target.
# NOTE(review): the encoder is fitted on the full frame and transform() is
# called without y; per the category_encoders docs this presumably yields a
# plain per-category target statistic and leaves `sigma` unused - confirm.
encoder = ce.LeaveOneOutEncoder(cols=["LocationCode"], sigma=0.05)
df = encoder.fit(df, df['Power(mW)']).transform(df)

# Columns to standardize (the encoded LocationCode is included).
columns_to_standardize = [
    'WindSpeed(m/s)',
    'Pressure(hpa)',
    'Temperature(°C)',
    'Humidity(%)',
    'Sunlight(Lux)',
    'LocationCode',
]

# Fit the scaler on those columns, then overwrite them with the standardized
# values.
# NOTE(review): the scaler statistics are computed on all rows, before any
# train/validation split happens.
scaler = StandardScaler().fit(df[columns_to_standardize])
df[columns_to_standardize] = scaler.transform(df[columns_to_standardize])

In [4]:
# Display the frame to inspect the encoded LocationCode and standardized feature columns.
df

Unnamed: 0,LocationCode,DateTime,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW)
0,-0.754370,2024-03-01 17:14:06.000,-0.420823,0.487473,-1.661978,1.003364,-0.714856,0.12
1,-0.754370,2024-03-01 17:14:47.000,-0.420823,0.487473,-1.654168,0.992914,-0.713834,0.12
2,-0.754370,2024-03-01 17:15:47.000,-0.420823,0.486877,-1.645242,0.995326,-0.711501,0.14
3,-0.754370,2024-03-01 17:16:47.000,-0.420823,0.486282,-1.640779,0.994924,-0.711904,0.14
4,-0.754370,2024-03-01 17:17:47.000,-0.420823,0.488069,-1.638547,0.994522,-0.714583,0.12
...,...,...,...,...,...,...,...,...
89607,-1.254433,2024-07-23 15:50:57.000,-0.420823,-0.879118,0.022800,0.143706,-0.658330,1.10
89608,-1.254433,2024-07-23 15:51:57.000,-0.420823,-0.887458,-0.024061,0.153753,-0.625551,1.92
89609,-1.254433,2024-07-23 15:52:57.000,-0.420823,-0.888054,-0.065344,0.127630,-0.580963,3.57
89610,-1.254433,2024-07-23 15:53:57.000,-0.420823,-0.887458,-0.123362,0.156566,-0.591160,3.13


In [6]:
# Build the model inputs: every column except the timestamp, the target and
# LocationCode becomes a feature; Power(mW) is the regression target.
# NOTE(review): LocationCode is dropped here, so it never reaches the model -
# confirm this is intended.
data = (
    df
    .drop(columns=['DateTime', 'Power(mW)', 'LocationCode'])
    .to_numpy(copy=True)
)
label = df['Power(mW)'].to_numpy(copy=True)

# Show both shapes as a quick sanity check that rows line up.
data.shape, label.shape

((1290894, 5), (1290894,))

In [7]:
def score_validate(predicts, labels):
    """Return the total absolute error between predictions and labels.

    Both inputs are converted to flat float arrays before subtracting, so
    plain Python lists are accepted and a column vector of shape (n, 1)
    compared against a flat (n,) array is scored element by element instead
    of being silently broadcast into an n x n matrix.

    Args:
        predicts: Array-like of predicted values.
        labels: Array-like of true values with the same number of elements.

    Returns:
        Sum of |predicts - labels| over all elements, as a NumPy float.

    Raises:
        ValueError: Raised by NumPy when the flattened inputs have lengths
            that cannot be subtracted element-wise.
    """
    predicts = np.asarray(predicts, dtype=float).ravel()
    labels = np.asarray(labels, dtype=float).ravel()
    return np.abs(predicts - labels).sum()



In [12]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score

# 1. Inputs
# `data` (feature matrix) and `label` (Power(mW) targets) must already have
# been built by an earlier cell.

# 2. Split the dataset: hold out 10% of the rows for validation.
x_train, x_val, y_train, y_val = train_test_split(data, label, test_size=0.1, random_state=42)

# 3. Initialise the XGBoost regressor.
# early_stopping_rounds is configured here rather than in fit(): passing it to
# fit() is deprecated and no longer accepted by newer XGBoost releases.
model = xgb.XGBRegressor(
    objective='reg:squarederror',  # regression objective, squared-error loss
    eval_metric='rmse',            # metric monitored on the evaluation set
    n_estimators=1000,             # upper bound on the number of trees
    learning_rate=0.01,            # shrinkage applied to each tree
    max_depth=10,                  # maximum depth of each tree
    subsample=0.8,                 # fraction of rows sampled per tree
    colsample_bytree=0.8,          # fraction of features sampled per tree
    random_state=42,               # fixed seed for reproducibility
    early_stopping_rounds=10,      # stop after 10 rounds without RMSE improvement
)

# 4. Train the model, monitoring the validation split for early stopping.
model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)

# 5. Predict on the validation split.
y_pred = model.predict(x_val)

# 6. Evaluate the model.
# NOTE(review): the same validation split drives early stopping and the
# metrics below, so these numbers are likely optimistic.
rmse = root_mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")
print(f"score:{score_validate(y_pred, y_val)}")



RMSE: 105.84182578247626
R^2 Score: 0.9521420159066575
score:5035754.196297436


In [13]:
# Spot-check: predictions for the first ten rows of the full feature matrix,
# printed above the corresponding true labels.
# NOTE(review): this rebinds `y_pred`, replacing the validation predictions
# computed by the training cell.
y_pred = model.predict(data[:10])
print(y_pred, label[:10], sep="\n")

[-0.01370689 -0.01199768 -0.01019493 -0.01199768 -0.01199768 -0.01528462
  0.19650765 -0.0126178  -0.01370689 -0.01370689]
[0.12 0.12 0.14 0.14 0.12 0.1  0.07 0.06 0.07 0.06]


In [14]:
# Largest observed power reading. Series.max() is vectorised and skips NaN,
# whereas the builtin max() iterates in Python and gives order-dependent
# results when NaN is present; float() keeps the displayed value a plain float.
float(df['Power(mW)'].max())

2626.48

In [15]:
# Total number of feature values in the validation split (rows x columns).
# ndarray.size gives this directly, without a Python-level loop over rows.
data_num = int(x_val.size)
data_num

645450