In [1]:
import numpy as np
import pandas as pd
import os
import torch
from utils.csv_to_pd import *

In [2]:
df = read_dir_csv()

location_ori = list(df["LocationCode"]) 
df[:1]

Unnamed: 0,LocationCode,DateTime,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW)
0,10,2024-03-01 17:14:06.000,0.0,1017.48,15.59,94.3,652.92,0.12


In [3]:
df = mean_10min(df)
df[:1]


Unnamed: 0,DateTime,LocationCode,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW),hour
0,2024-03-01 17:10:00,10.0,0.124286,1017.49,15.712857,93.771429,652.797143,0.115714,17


In [4]:
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

encoder = ce.LeaveOneOutEncoder(cols=["LocationCode", "hour"], sigma = 0.05)
encoder.fit(df, df['Power(mW)'])
df = encoder.transform(df)

df[:1]


Unnamed: 0,DateTime,LocationCode,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW),hour
0,2024-03-01 17:10:00,171.555961,0.124286,1017.49,15.712857,93.771429,652.797143,0.115714,2.273539


In [5]:
# 指定要標準化的欄位
columns_to_standardize = ['WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)', 'Humidity(%)', 'Sunlight(Lux)', "LocationCode", "hour"]

# 初始化 StandardScaler
scaler = StandardScaler()

# 對指定欄位進行標準化
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

df[:1]

Unnamed: 0,DateTime,LocationCode,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW),hour
0,2024-03-01 17:10:00,-0.757742,-0.284717,0.741951,-1.640724,0.974532,-0.721713,0.115714,-1.015733


In [6]:
import random
def split_data_random(data_label_list, train_ratio=0.95):

    # 創建索引列表並隨機打亂
    indices = list(range(len(data_label_list)))
    random.shuffle(indices)

    # 計算分割點
    train_size = int(len(data_label_list) * train_ratio)

    # 分配數據
    train_indices = indices[:train_size]
    valid_indices = indices[train_size:]

    # 根據索引分割數據
    train_data_label_list = [data_label_list[i] for i in train_indices]
    valid_data_label_list = [data_label_list[i] for i in valid_indices]


    return train_data_label_list, valid_data_label_list

In [7]:
data_label_list, _ = spilt_data_with_datetime(df, location_ori)

train_data_label_list, valid_data_label_list = split_data_random(data_label_list)

train_data, train_label, train_length = sort_by_length(train_data_label_list)
valid_data, valid_label, valid_length = sort_by_length(valid_data_label_list)

In [8]:
from torch.nn.utils.rnn import pad_sequence

def padding(data_list, label_list, length, batch=64):

    batch_data_list = []
    batch_label_list = []
    batch_length = []

    for i in range(0, len(data_list), batch):
        upper = min(len(data_list), i + batch)
        data = pad_sequence(data_list[i:upper], batch_first=True, padding_value=0)
        label = pad_sequence(label_list[i:upper], batch_first=True, padding_value=0)
        batch_data_list.append(data)
        batch_label_list.append(label)
        batch_length.append(length[i:upper])
    return batch_data_list, batch_label_list, batch_length



In [12]:

batch_train_data, batch_train_label, batch_train_length = padding(train_data, train_label, train_length)
batch_valid_data, batch_valid_label, batch_valid_length = padding(valid_data, valid_label, valid_length)

batch_train_data[-1].shape


torch.Size([50, 86, 5])

In [7]:
def slice_seq_to_same_length(datas: list, labels: list, length: int):
    train_data = []
    train_label = []
    test_data = []
    test_label = []

    for i in range(len(datas)):
        data = datas[i]
        label = labels[i]
        start = 0
        end = len(data)
        while end - start >= length:
            train_data.append(data[start:start + length])
            train_label.append(label[start:start + length])
            start += length
        if start != end:
            test_data.append(data[start:start + length])
            test_label.append(label[start:start + length])
    return np.array(train_data), np.array(train_label), np.array(test_data, dtype=object), np.array(test_label, dtype=object)


train_data, train_label, test_data, test_label = slice_seq_to_same_length(data_list, label_list, 5)

for data in train_data:
    assert len(data) == 5

train_data.shape, train_label.shape
    

((25430, 5, 5), (25430, 5))

In [8]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch import nn

In [10]:
# 將數據轉換為 PyTorch 張量
train_data_tensor = torch.tensor(train_data, dtype=torch.float32)
train_label_tensor = torch.tensor(train_label, dtype=torch.float32)



# 建立 TensorDataset
dataset = TensorDataset(train_data_tensor, train_label_tensor)

train_size = int(0.9 * len(dataset))
valid_size = len(dataset) - train_size

train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

# 建立 DataLoader
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# 檢查形狀
for batch_data, batch_labels in train_loader:
    print(f"Batch data shape: {batch_data.shape}, Batch labels shape: {batch_labels.shape}")
    break

Batch data shape: torch.Size([64, 5, 5]), Batch labels shape: torch.Size([64, 5])


In [11]:
# 將數據轉換為 PyTorch 張量列表，保留每個序列的不同長度
test_data_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in test_data]
test_label_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in test_label]

# 建立自定義 Dataset 用於處理不同長度的序列
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# 建立 Test Dataset 和 DataLoader
test_dataset = TestDataset(test_data_tensors, test_label_tensors)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# 檢查測試資料加載情況
for data, label in test_loader:
    print(f"Data shape: {data[0].shape}, Label shape: {label[0].shape}")
    break

Data shape: torch.Size([1, 5]), Label shape: torch.Size([1])


  test_data_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in test_data]
  test_label_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in test_label]


In [12]:
class LSTMTagger(nn.Module):

    def __init__(self, hidden_dim, tagset_size, input_dim=6):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        # LSTM層，輸入維度為 input_dim，輸出維度為 hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)

        # 線性層將 LSTM 的輸出映射到標籤空間
        self.linear = nn.Linear(hidden_dim, tagset_size)
        self.relu = nn.ReLU()

    def init_hidden(self, batch_size):
        # 初始化隱藏狀態和細胞狀態
        return (torch.zeros(1, batch_size, self.hidden_dim),
                torch.zeros(1, batch_size, self.hidden_dim))

    def forward(self, sentence, hidden):
        # sentence 的形狀為 (batch_size, seq_len, input_dim)
        # LSTM 層的輸出 lstm_out 形狀為 (batch_size, seq_len, hidden_dim)
        # 並傳回更新後的隱藏狀態
        lstm_out, hidden = self.lstm(sentence, hidden)

        # 使用線性層將 LSTM 的輸出映射到標籤空間
        tag_space = self.relu(self.linear(self.relu(lstm_out)))

        # tag_space 的形狀為 (batch_size, seq_len, tagset_size)
        return tag_space, hidden

In [13]:
import torch
import torch.optim as optim
import torch.nn as nn

# 設置 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 定義訓練函數
def train_model(model, train_loader, valid_loader, num_epochs=10, learning_rate=0.001):
    # 將模型移到 GPU
    model = model.to(device, dtype=torch.float32)
    # 使用 Adam 優化器
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # 定義損失函數
    criterion = nn.SmoothL1Loss()
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for inputs, labels in train_loader:
            # 將輸入和標籤移到 GPU
            inputs, labels = inputs.to(device, dtype=torch.float32), labels.to(device, dtype=torch.float32)
            
            # 初始化隱藏狀態
            hidden = model.init_hidden(batch_size=inputs.size(0))
            hidden = tuple([h.to(device, dtype=torch.float32) for h in hidden])

            # 清零梯度
            optimizer.zero_grad()
            # 前向傳播
            outputs, _ = model(inputs, hidden)
            # 計算損失
            loss = criterion(outputs.squeeze(), labels)
            # 反向傳播
            loss.backward()
            # 更新參數
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss / len(train_loader):.4f}")

        # 驗證模型
        valid_loss = validate_model(model, valid_loader, criterion)

# 定義驗證函數
def validate_model(model, valid_loader, criterion):
    model.eval()
    total_loss = 0.0
    error = 0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            # 將輸入和標籤移到 GPU
            inputs, labels = inputs.to(device, dtype=torch.float32), labels.to(device, dtype=torch.float32)
            
            # 初始化隱藏狀態
            hidden = model.init_hidden(batch_size=inputs.size(0))
            hidden = tuple([h.to(device, dtype=torch.float32) for h in hidden])

            # 前向傳播
            outputs, _ = model(inputs, hidden)
            # 計算損失
            loss = criterion(outputs.squeeze(), labels.squeeze())
            error += abs(outputs.view(-1) - labels.view(-1)).sum() / inputs.shape[0] / inputs.shape[1]
            total_loss += loss.item()

    print(f"Validation Loss: {total_loss / len(valid_loader):.4f}, valid error: {error / len(valid_loader)}")
    return total_loss


In [14]:
model = LSTMTagger(256, 1, input_dim=5)
train_model(model, train_loader, valid_loader, 100)


Epoch [1/100], Training Loss: 238.6369
Validation Loss: 232.8065, valid error: 233.26675415039062
Epoch [2/100], Training Loss: 217.8508
Validation Loss: 214.7195, valid error: 215.1526336669922
Epoch [3/100], Training Loss: 200.5550
Validation Loss: 197.7458, valid error: 198.17459106445312
Epoch [4/100], Training Loss: 184.9402
Validation Loss: 182.2517, valid error: 182.684326171875
Epoch [5/100], Training Loss: 170.7321
Validation Loss: 168.0332, valid error: 168.45046997070312
Epoch [6/100], Training Loss: 157.7387
Validation Loss: 155.4504, valid error: 155.8552703857422
Epoch [7/100], Training Loss: 146.1977
Validation Loss: 144.1400, valid error: 144.5362548828125
Epoch [8/100], Training Loss: 136.0700
Validation Loss: 134.6623, valid error: 135.0569610595703
Epoch [9/100], Training Loss: 126.7118
Validation Loss: 124.9030, valid error: 125.29620361328125
Epoch [10/100], Training Loss: 118.1098
Validation Loss: 116.2618, valid error: 116.6490249633789
Epoch [11/100], Training L

In [15]:
criterion = nn.SmoothL1Loss()
model = model.to(device='cuda', dtype=torch.float32)
validate_model(model, test_loader, criterion=criterion)

Validation Loss: 10.5618, valid error: 10.676340103149414


17764.945638533998

In [16]:
model = model.to(device='cpu')
test_data_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in test_data]

for x, y in valid_loader:

    with torch.no_grad():
        hidden = model.init_hidden(batch_size=x.size(0))
        predict, _ = model(x, hidden)

        predict_print = (torch.round(predict * 100) / 100).numpy()
        #predict_print = label_normalizer.denormalize(predict_print)
        y_print = (torch.round(y * 100) / 100).numpy()
        np.set_printoptions(precision=2, suppress=True)
        if max(y_print[0]) > 30:

            print("predict:", predict_print[0, :, 0])
            print("label:", y_print[0])
            print()


predict: [127.87 168.04 178.48 131.88 127.86]
label: [142.16 177.88 181.75 128.5  121.56]

predict: [ 373.86 1293.41  496.41  838.75  400.77]
label: [ 424.09 1474.43  389.77  820.32  444.58]

predict: [328.08 440.36 400.33 388.55 433.53]
label: [382.84 536.03 474.4  441.78 485.09]

predict: [682.6  659.43 397.04 332.2  238.  ]
label: [747.16 783.2  465.75 495.9  256.67]

predict: [ 500.67  489.34  845.77 1431.45 1571.62]
label: [ 489.74  453.32  851.91 1436.64 1556.22]

predict: [ 4.21  7.97 12.56 16.41 30.52]
label: [ 5.18  7.5  12.22 19.52 33.89]

predict: [562.57 274.41 144.89 180.23 261.56]
label: [225.71 118.12  61.42  76.44 117.26]

predict: [127.85 106.12  68.44  56.56  52.83]
label: [63.32 64.89 49.77 42.66 41.16]

predict: [744.24 842.68 660.32 216.44 154.41]
label: [1060.98 1114.58 1042.58  454.89  270.38]

predict: [  3.36   8.21  80.62 108.57 165.69]
label: [ 21.88  45.13  68.67 104.84 160.44]

predict: [166.27 217.07 247.   168.6  133.94]
label: [140.61 195.36 215.38 159.5

  test_data_tensors = [torch.tensor(seq, dtype=torch.float32) for seq in test_data]


predict: [ 732.6  1038.06 1539.27 1401.15 1497.84]
label: [ 754.64  986.15 1393.39 1267.34 1398.59]

predict: [33.76 23.5  21.86 27.88 41.1 ]
label: [23.21 16.48 16.57 22.79 36.8 ]

predict: [ 5.6   7.76 15.26 44.37 83.13]
label: [  7.34   9.92  15.84  55.38 103.91]

predict: [ 86.03 104.16 100.5  151.6  166.76]
label: [ 79.    99.32 107.57 164.04 161.65]

predict: [809.92 961.13 880.73 807.45 792.16]
label: [972.66 918.66 874.48 802.47 748.42]

predict: [ 799.8   889.1   979.54 1224.5  1519.91]
label: [1019.88 1146.97 1269.31 1388.88 1489.77]

predict: [40.   36.25  6.18  8.45 58.7 ]
label: [36.25 34.54  4.85  8.11 49.94]

