In [1]:
import numpy as np
import pandas as pd
import os
import torch
from utils.csv_to_pd import *

In [2]:
# Load the raw measurements through the project helper (star-imported from utils.csv_to_pd).
df = read_dir_csv()

# Snapshot the LocationCode column as a plain list before df is reassigned by later cells.
# NOTE(review): this list has one entry per raw row and is captured before mean_10min()
# runs — confirm that spilt_data_with_datetime() expects it at that granularity.
location_ori = list(df["LocationCode"]) 
# Preview the first row.
df[:1]

Unnamed: 0,LocationCode,DateTime,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW)
0,10,2024-03-01 17:14:06.000,0.0,1017.48,15.59,94.3,652.92,0.12


In [3]:
# Replace df with the output of the project helper mean_10min().
# NOTE(review): presumably averages the readings into 10-minute bins and adds an `hour`
# column (the preview below shows both) — confirm against utils.csv_to_pd.
df = mean_10min(df)
df[:1]


Unnamed: 0,DateTime,LocationCode,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW),hour
0,2024-03-01 17:10:00,10.0,0.124286,1017.49,15.712857,93.771429,652.797143,0.115714,17


In [4]:
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

# Target-encode the two categorical columns against the Power(mW) label.
encoder = ce.LeaveOneOutEncoder(cols=["LocationCode", "hour"], sigma = 0.05)
encoder.fit(df, df['Power(mW)'])
# NOTE(review): transform() is called without the target here. Per the category_encoders
# docs the leave-one-out exclusion and the sigma noise presumably only apply when y is
# passed, so this likely yields a plain per-category target mean — confirm that is intended.
# The encoder is also fit on the full frame, before any train/validation/test split.
df = encoder.transform(df)

df[:1]


Unnamed: 0,DateTime,LocationCode,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW),hour
0,2024-03-01 17:10:00,171.555961,0.124286,1017.49,15.712857,93.771429,652.797143,0.115714,2.273539


In [5]:
# Columns to standardize (the Power(mW) target is not in this list).
columns_to_standardize = ['WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)', 'Humidity(%)', 'Sunlight(Lux)', "LocationCode", "hour"]

# Initialize the StandardScaler.
scaler = StandardScaler()

# Standardize the selected columns. The scaler statistics are computed on the whole
# frame, i.e. before the data is split into train/validation/test further down.
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

df[:1]

Unnamed: 0,DateTime,LocationCode,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW),hour
0,2024-03-01 17:10:00,-0.757742,-0.284717,0.741951,-1.640724,0.974532,-0.721713,0.115714,-1.015733


In [6]:
# Split the frame into per-sequence feature/label lists with the project helper; the
# third return value is discarded.
# NOTE(review): presumably one sequence per location and contiguous time span — confirm
# against utils.csv_to_pd.
data_list, label_list, _ = spilt_data_with_datetime(df, location_ori)

# Shapes of the first sequence and of its labels.
data_list[0].shape, label_list[0].shape

(torch.Size([6, 5]), torch.Size([6]))

In [7]:
def slice_seq_to_same_length(datas: list, labels: list, length: int):
    """Cut every sequence into consecutive, non-overlapping windows of ``length`` steps.

    Full windows go to the training set; whatever is left at the end of a
    sequence (fewer than ``length`` steps) becomes a single test sample.

    Args:
        datas: Per-sequence feature containers, sliceable along their first axis.
        labels: Per-sequence label containers, indexed in parallel with ``datas``.
        length: Window size in time steps; must be positive.

    Returns:
        ``(train_data, train_label, test_data, test_label)`` as NumPy arrays.
        The two test arrays use ``dtype=object`` because the leftovers may
        differ in length.

    Raises:
        ValueError: If ``length`` is not positive. Without this guard the
            windowing would never advance and the call would not terminate.
    """
    if length <= 0:
        raise ValueError(f"length must be a positive integer, got {length!r}")

    train_data, train_label = [], []
    test_data, test_label = [], []

    for i, data in enumerate(datas):
        label = labels[i]
        total = len(data)
        full_windows = total // length

        # Every complete window becomes one training sample.
        for w in range(full_windows):
            lo = w * length
            train_data.append(data[lo:lo + length])
            train_label.append(label[lo:lo + length])

        # A shorter remainder (if any) is kept as one test sample.
        tail_start = full_windows * length
        if tail_start != total:
            test_data.append(data[tail_start:tail_start + length])
            test_label.append(label[tail_start:tail_start + length])

    return np.array(train_data), np.array(train_label), np.array(test_data, dtype=object), np.array(test_label, dtype=object)


# Window every sequence into 5-step samples; the shorter leftovers form the test set.
train_data, train_label, test_data, test_label = slice_seq_to_same_length(
    data_list, label_list, 5
)

# Sanity check: every training window has exactly 5 steps.
assert all(len(window) == 5 for window in train_data)

train_data.shape, train_label.shape
    

((25430, 5, 5), (25430, 5))

In [8]:
import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort that can
# occur when several libraries bundle their own copy — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch import nn

In [9]:
class Normalizer:
    """Min-max scaler whose range is taken from a reference label array."""

    def __init__(self, labels):
        """Record the smallest and largest value found in ``labels``."""
        self.min_val = labels.min()
        self.max_val = labels.max()

    def normalize(self, target):
        """Map ``target`` linearly so that ``min_val`` -> 0 and ``max_val`` -> 1."""
        span = self.max_val - self.min_val
        shifted = target - self.min_val
        return shifted / span

    def denormalize(self, target):
        """Invert ``normalize``, mapping scaled values back to the label range."""
        span = self.max_val - self.min_val
        return target * span + self.min_val
    
# The label range is taken from train_label as returned by slice_seq_to_same_length(),
# i.e. before the train/validation split performed further down.
label_normalizer = Normalizer(train_label)

In [10]:
# Labels are fed to the model in their original scale (label_normalizer is not applied).

# Convert the NumPy training arrays to float32 PyTorch tensors.
train_data_tensor = torch.tensor(train_data, dtype=torch.float32)
train_label_tensor = torch.tensor(train_label, dtype=torch.float32)

# Wrap features and labels in a TensorDataset.
dataset = TensorDataset(train_data_tensor, train_label_tensor)

# 90/10 train/validation split. A seeded generator keeps the split identical across
# kernel restarts, so validation numbers from different runs stay comparable.
train_size = int(0.9 * len(dataset))
valid_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, valid_dataset = random_split(
    dataset, [train_size, valid_size], generator=split_generator
)

# Build the DataLoaders (only the training loader is shuffled).
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Check the shapes of one batch.
for batch_data, batch_labels in train_loader:
    print(f"Batch data shape: {batch_data.shape}, Batch labels shape: {batch_labels.shape}")
    break

Batch data shape: torch.Size([64, 5, 5]), Batch labels shape: torch.Size([64, 5])


In [11]:
# Convert the leftover sequences to lists of float32 tensors, keeping each one's own length.
# torch.as_tensor() accepts tensors as well as arrays; calling torch.tensor() on an
# existing tensor emits a copy-construct UserWarning. Note that as_tensor() may share
# memory with its input instead of copying it.
test_data_tensors = [torch.as_tensor(seq, dtype=torch.float32) for seq in test_data]
test_label_tensors = [torch.as_tensor(seq, dtype=torch.float32) for seq in test_label]

# Custom Dataset for sequences of differing lengths.
class TestDataset(torch.utils.data.Dataset):
    """Pairs each entry of ``data`` with the entry of ``labels`` at the same index."""

    def __init__(self, data, labels):
        """Keep references to the two parallel containers (no copy, no validation)."""
        self.data, self.labels = data, labels

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` tuple stored at position ``idx``."""
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

    def __len__(self):
        """Number of samples, taken from the length of ``data``."""
        return len(self.data)

# Build the test Dataset and DataLoader; batch_size=1 so that sequences of different
# lengths never have to be stacked into one batch.
test_dataset = TestDataset(test_data_tensors, test_label_tensors)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Check that the test data loads: print the shapes for the first sample only.
for data, label in test_loader:
    print(f"Data shape: {data[0].shape}, Label shape: {label[0].shape}")
    break

Data shape: torch.Size([1, 5]), Label shape: torch.Size([1])


  test_data_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in test_data]
  test_label_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in test_label]


In [12]:
class LSTMTagger(nn.Module):
    """LSTM followed by a linear head that emits ``tagset_size`` values per time step.

    Args:
        hidden_dim: Size of the LSTM hidden state.
        tagset_size: Number of outputs produced for every time step.
        input_dim: Number of input features per time step.
    """

    def __init__(self, hidden_dim, tagset_size, input_dim=6):
        super().__init__()
        self.hidden_dim = hidden_dim

        # LSTM layer: input_dim features in, hidden_dim features out, batch-first tensors.
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)

        # Linear layer mapping the LSTM output to the tag/output space.
        self.linear = nn.Linear(hidden_dim, tagset_size)
        self.relu = nn.ReLU()

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h0, c0)`` states for a batch of ``batch_size`` sequences.

        The states are created with the device and dtype of the model's own
        weights, so they can be passed to ``forward`` wherever the model has
        been moved; callers that still call ``.to(...)`` on them are unaffected.
        """
        reference = self.linear.weight
        shape = (self.lstm.num_layers, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=reference.device, dtype=reference.dtype),
                torch.zeros(shape, device=reference.device, dtype=reference.dtype))

    def forward(self, sentence, hidden):
        """Run the LSTM over ``sentence`` and project every step to the output space.

        Args:
            sentence: Tensor of shape (batch_size, seq_len, input_dim).
            hidden: ``(h0, c0)`` tuple, e.g. from ``init_hidden``.

        Returns:
            ``(tag_space, hidden)`` where ``tag_space`` has shape
            (batch_size, seq_len, tagset_size) and ``hidden`` is the updated
            LSTM state.
        """
        # lstm_out has shape (batch_size, seq_len, hidden_dim).
        lstm_out, hidden = self.lstm(sentence, hidden)

        # ReLU is applied both before and after the linear layer, so the
        # returned values are never negative.
        tag_space = self.relu(self.linear(self.relu(lstm_out)))

        return tag_space, hidden

In [13]:
import torch
import torch.optim as optim
import torch.nn as nn

# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training routine.
def train_model(model, train_loader, valid_loader, num_epochs=10, learning_rate=0.001):
    """Train ``model`` with Adam and SmoothL1Loss, validating after every epoch.

    The model is moved (in place) to the module-level ``device`` as float32.
    The mean training loss per batch is printed after each epoch, followed by
    the report printed by ``validate_model``.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and
            ``forward(inputs, hidden) -> (outputs, hidden)``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        valid_loader: Iterable of ``(inputs, labels)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate handed to Adam.

    Returns:
        None.
    """
    # Move the model to the selected device.
    model = model.to(device, dtype=torch.float32)
    # Adam optimizer.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Loss function.
    criterion = nn.SmoothL1Loss()
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for inputs, labels in train_loader:
            # Move inputs and labels to the selected device.
            inputs, labels = inputs.to(device, dtype=torch.float32), labels.to(device, dtype=torch.float32)

            # Fresh hidden state for every batch.
            hidden = model.init_hidden(batch_size=inputs.size(0))
            hidden = tuple(h.to(device, dtype=torch.float32) for h in hidden)

            # Reset gradients.
            optimizer.zero_grad()
            # Forward pass.
            outputs, _ = model(inputs, hidden)
            # Drop only the trailing output dimension. A bare squeeze() would also
            # remove a batch or sequence dimension of size 1, so the outputs would
            # no longer line up with the (batch, seq) labels.
            loss = criterion(outputs.squeeze(-1), labels)
            # Backward pass.
            loss.backward()
            # Parameter update.
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss / len(train_loader):.4f}")

        # Validate after every epoch (the report is printed by validate_model).
        validate_model(model, valid_loader, criterion)

# Validation routine.
def validate_model(model, valid_loader, criterion):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Prints the mean loss per batch and the mean per-batch absolute error.
    Batches are moved to the device the model currently lives on, so the
    function works regardless of where the caller has placed the model.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and
            ``forward(inputs, hidden) -> (outputs, hidden)``.
        valid_loader: Sized iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.

    Returns:
        The loss summed over all batches (not the per-batch mean that is printed).
    """
    model.eval()

    # Follow the model's own device instead of a global; fall back to the CPU for
    # a parameter-free model.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    error = 0.0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            # Move inputs and labels to the model's device.
            inputs = inputs.to(run_device, dtype=torch.float32)
            labels = labels.to(run_device, dtype=torch.float32)

            # Fresh hidden state for every batch.
            hidden = model.init_hidden(batch_size=inputs.size(0))
            hidden = tuple(h.to(run_device, dtype=torch.float32) for h in hidden)

            # Forward pass.
            outputs, _ = model(inputs, hidden)
            # Drop only the trailing output dimension so batch and sequence
            # dimensions of size 1 are preserved.
            loss = criterion(outputs.squeeze(-1), labels)
            # Mean absolute error over the batch (sum / batch_size / seq_len),
            # accumulated as a Python float.
            batch_error = abs(outputs.view(-1) - labels.view(-1)).sum() / inputs.shape[0] / inputs.shape[1]
            error += batch_error.item()
            total_loss += loss.item()

    # Avoid a ZeroDivisionError when the loader yields no batches.
    num_batches = max(len(valid_loader), 1)
    print(f"Validation Loss: {total_loss / num_batches:.4f}, valid error: {error / num_batches}")
    return total_loss


In [14]:
# Instantiate the tagger with a 256-unit hidden state, one output per time step and
# 5 input features, then train it for 100 epochs.
model = LSTMTagger(256, 1, input_dim=5)
train_model(model, train_loader, valid_loader, 100)


Epoch [1/100], Training Loss: 237.3285
Validation Loss: 241.3260, valid error: 241.7781219482422
Epoch [2/100], Training Loss: 217.3776
Validation Loss: 222.9365, valid error: 223.3627166748047
Epoch [3/100], Training Loss: 200.5915
Validation Loss: 205.9902, valid error: 206.409423828125
Epoch [4/100], Training Loss: 185.4003
Validation Loss: 190.6702, valid error: 191.07589721679688
Epoch [5/100], Training Loss: 171.6484
Validation Loss: 177.1827, valid error: 177.5955047607422
Epoch [6/100], Training Loss: 158.5405
Validation Loss: 164.1757, valid error: 164.57699584960938
Epoch [7/100], Training Loss: 147.5157
Validation Loss: 152.6951, valid error: 153.0974578857422
Epoch [8/100], Training Loss: 137.9062
Validation Loss: 142.4004, valid error: 142.7978973388672
Epoch [9/100], Training Loss: 128.9356
Validation Loss: 133.8718, valid error: 134.2698211669922
Epoch [10/100], Training Loss: 120.8400
Validation Loss: 125.1940, valid error: 125.6009750366211
Epoch [11/100], Training Los

In [15]:
# Evaluate on the leftover test sequences with the same loss type used for training.
criterion = nn.SmoothL1Loss()
# Use the notebook-wide `device` instead of a hard-coded 'cuda' so this cell also runs
# on CPU-only machines.
model = model.to(device=device, dtype=torch.float32)
validate_model(model, test_loader, criterion=criterion)

Validation Loss: 10.2131, valid error: 10.328842163085938


17178.369089022275

In [16]:
# Run the comparison on the CPU so predictions can be converted to NumPy directly.
model = model.to(device='cpu')
# torch.as_tensor() avoids the copy-construct UserWarning that torch.tensor() emits when
# its input is already a tensor.
test_data_tensors = [torch.as_tensor(seq, dtype=torch.float32) for seq in test_data]

# The print options do not change between batches, so set them once up front.
np.set_printoptions(precision=2, suppress=True)

for x, y in valid_loader:

    with torch.no_grad():
        hidden = model.init_hidden(batch_size=x.size(0))
        predict, _ = model(x, hidden)

        # Round to two decimals for display; values stay in the original label scale
        # (no denormalization is applied).
        predict_print = (torch.round(predict * 100) / 100).numpy()
        y_print = (torch.round(y * 100) / 100).numpy()

        # Show only the first sample of each batch, and only when its largest label
        # exceeds 30.
        if max(y_print[0]) > 30:

            print("predict:", predict_print[0, :, 0])
            print("label:", y_print[0])
            print()


predict: [1101.65  626.87  438.93 1171.25  590.27]
label: [1288.97  696.1   399.48 1275.61  528.87]

predict: [ 39.8  106.93 149.33 256.61 159.6 ]
label: [ 48.83 112.13 149.18 318.05 214.01]

predict: [1115.39  153.37  113.45   28.25   35.57]
label: [841.89 101.54  64.7   24.85  36.88]

predict: [636.87 652.21 562.96 415.64 246.67]
label: [552.09 552.74 494.1  314.93 266.18]

predict: [ 624.93  242.37  202.77  323.08 1331.61]
label: [461.12 220.2  187.51 263.88 957.74]

predict: [120.4  117.68  71.53  63.51  53.67]
label: [ 96.05 106.62  83.32  79.4   57.7 ]

predict: [ 34.19  83.63 148.45 223.44 315.27]
label: [ 50.01  92.89 152.86 221.   306.78]

predict: [316.39 138.42 109.85  83.82  81.86]
label: [311.28 150.95 102.42  89.59  88.32]

predict: [  95.    409.97 1176.54  370.45  781.39]
label: [ 82.86 342.83 939.37 201.74 650.06]

predict: [167.9   75.91  67.45  95.33  87.56]
label: [40.1  18.86 23.4  28.54 24.39]

predict: [1009.05 1054.55 1125.13 1030.5  1025.2 ]
label: [1150.06 112

  test_data_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in test_data]
