## 对时间序列数据进行采样

In [30]:
import pandas as pd

In [31]:
df = pd.read_pickle('/home/user/suzhao/BehaviorDL/data/heartrate.pkl.zip')\
    [['externalid', 'recordtime', 'avgHeartRate']]
df['recordtime'] = pd.to_datetime(df['recordtime'], unit='ms') + pd.Timedelta(hours=8)
df['day'] = df['recordtime'].dt.date
df.dtypes

externalid               int64
recordtime      datetime64[ns]
avgHeartRate             int64
day                     object
dtype: object

In [46]:
df_list = []
for id, grp_id in df.groupby('externalid'):
    for day, grp_id_day in grp_id.groupby('day'):
        # print(id, day)
        full_day_range = pd.date_range(start=day, \
                               end=day + pd.Timedelta(days=1) - pd.Timedelta(minutes=1), freq='T')
        full_day_df = pd.DataFrame({'recordtime': full_day_range})
        full_day_df = pd.merge(full_day_df, grp_id_day, on='recordtime', how='left')
        full_day_df.externalid = id
        full_day_df.day = day
        df_list.append(full_day_df)


full_df = pd.concat(df_list)

0 2021-11-13
0 2021-11-14
1 2021-11-13
1 2021-11-14
1 2021-11-15
1 2021-11-16
1 2021-11-17
1 2021-11-18
1 2021-11-19
1 2021-11-20
1 2021-11-21
1 2021-11-22
1 2021-11-23
1 2021-11-24
1 2021-11-25
1 2021-11-26
1 2021-11-27
1 2021-11-28
1 2021-11-29
1 2021-11-30
1 2021-12-01
1 2021-12-02
1 2021-12-03
1 2021-12-04
1 2021-12-05
1 2021-12-06
1 2021-12-07
1 2021-12-08
1 2021-12-09
1 2021-12-10
1 2021-12-11
1 2021-12-12
1 2021-12-13
1 2021-12-14
2 2021-11-13
2 2021-11-14
2 2021-11-15
2 2021-11-16
2 2021-11-17
2 2021-11-18
2 2021-11-19
3 2021-11-13
3 2021-11-14
3 2021-11-15
3 2021-11-16
3 2021-11-17
3 2021-11-18
3 2021-11-19
3 2021-11-20
3 2021-11-21
3 2021-11-22
3 2021-11-23
3 2021-11-24
3 2021-11-25
3 2021-11-26
3 2021-11-27
3 2021-11-28
3 2021-11-29
3 2021-11-30
3 2021-12-01
3 2021-12-02
3 2021-12-03
3 2021-12-04
3 2021-12-05
3 2021-12-06
3 2021-12-07
3 2021-12-08
3 2021-12-09
3 2021-12-10
3 2021-12-11
3 2021-12-12
3 2021-12-13
3 2021-12-14
4 2021-11-13
4 2021-11-14
4 2021-11-15
4 2021-11-16

## 测试dataloader

In [1]:
from data_provider.data_loader import HuaweiDataset, data_provider
from dataclasses import dataclass
import pandas as pd

In [2]:
@dataclass
class args:
    data_path: str
    label_flag: str

args.data_path = '/home/user/suzhao/BehaviorDL/data'
args.label_flag = 'emotion'

data_set, data_loader = data_provider(args.data_path, args.label_flag)

In [3]:
for i, batch in enumerate(data_loader):
    print(batch[0].shape)
    print(batch[1])
    break

torch.Size([20, 1441, 4])
tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0])


## 训练

In [2]:
from data_provider.data_loader import HuaweiDataset, data_provider, data_loader
from dataclasses import dataclass
from torch.utils.data import random_split, DataLoader
from models import PatchTST
import torch
import torch.nn as nn
from torch import optim
import pandas as pd


@dataclass
class args:
    data_path: str
    label_flag: str
    epochs: int
    seq_len: int
    d_model: int
    dropout: float
    e_layers: int
    d_ff: int
    output_attention: bool
    num_class: int = 2
    activation: str = 'gelu'
    factor: float = 1
    n_heads: int = 8
    enc_in: int = 7


def padding_mask(lengths, max_len=None):
    """
    Used to mask padded positions: creates a (batch_size, max_len) boolean mask from a tensor of sequence lengths,
    where 1 means keep element at this position (time step)
    """
    batch_size = lengths.numel()
    max_len = max_len or lengths.max_val()  # trick works because of overloading of 'or' operator for non-boolean types
    return (torch.arange(0, max_len, device=lengths.device)
            .type_as(lengths)
            .repeat(batch_size, 1)
            .lt(lengths.unsqueeze(1)))

def collate_fn(data, max_len=None):
    """Build mini-batch tensors from a list of (X, mask) tuples. Mask input. Create
    Args:
        data: len(batch_size) list of tuples (X, y).
            - X: torch tensor of shape (seq_length, feat_dim); variable seq_length.
            - y: torch tensor of shape (num_labels,) : class indices or numerical targets
                (for classification or regression, respectively). num_labels > 1 for multi-task models
        max_len: global fixed sequence length. Used for architectures requiring fixed length input,
            where the batch length cannot vary dynamically. Longer sequences are clipped, shorter are padded with 0s
    Returns:
        X: (batch_size, padded_length, feat_dim) torch tensor of masked features (input)
        targets: (batch_size, padded_length, feat_dim) torch tensor of unmasked features (output)
        target_masks: (batch_size, padded_length, feat_dim) boolean torch tensor
            0 indicates masked values to be predicted, 1 indicates unaffected/"active" feature values
        padding_masks: (batch_size, padded_length) boolean tensor, 1 means keep vector at this position, 0 means padding
    """

    batch_size = len(data)
    features, labels = zip(*data)

    # Stack and pad features and masks (convert 2D to 3D tensors, i.e. add batch dimension)
    lengths = [X.shape[0] for X in features]  # original sequence length for each time series
    if max_len is None:
        max_len = max(lengths)

    X = torch.zeros(batch_size, max_len, features[0].shape[-1])  # (batch_size, padded_length, feat_dim)
    for i in range(batch_size):
        end = min(lengths[i], max_len)
        X[i, :end, :] = features[i][:end, :]

    targets = torch.stack(labels, dim=0)  # (batch_size, num_labels)

    padding_masks = padding_mask(torch.tensor(lengths, dtype=torch.int16),
                                 max_len=max_len)  # (batch_size, padded_length) boolean tensor, "1" means keep

    return X, targets, padding_masks


args.data_path = '/home/user/suzhao/BehaviorDL/dataset/Huawei'
args.label_flag = 'emotion'
args.epochs = 10
args.seq_len = 1441
args.d_model = 64
args.dropout = 0.6
args.e_layers = 3
args.d_ff = 128
args.output_attention = False
args.activation = 'gelu'
args.factor = 1
args.n_heads = 8
args.num_class = 2
args.enc_in = 7

In [3]:
data_set = HuaweiDataset(args.data_path, args.label_flag)

train_size = int(0.8 * len(data_set))
test_size = len(data_set) - train_size
train_dataset, test_dataset = random_split(data_set, [train_size, test_size])

train_loader = data_loader(train_dataset, flag='train')
test_loader = data_loader(test_dataset, flag='test')

model = PatchTST.Model(args).float()

train_steps = len(train_loader)
model_optim = optim.Adam(model.parameters(), lr=0.02)
criterion = nn.CrossEntropyLoss()

for epoch in range(args.epochs):

    
    for i, (batch_x, label, padding_mask) in enumerate(train_loader):
        model_optim.zero_grad()
        batch_x = batch_x.float()

        padding_mask = padding_mask.float()

        output = model(batch_x, padding_mask)
        loss = criterion(output, label)
        loss.backward()
        model_optim.step()
        
        if i % 10 == 0:
            print(f'Epoch {epoch}, Step {i}, Loss {loss.item()}')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (20x47872 and 80640x2)

## 使用Heartbeat数据集进行训练

In [4]:
from dataclasses import dataclass
@dataclass
class args:
    data_path: str
    label_flag: str
    epochs: int
    seq_len: int
    d_model: int
    dropout: float
    e_layers: int
    d_ff: int
    output_attention: bool
    num_class: int = 2
    activation: str = 'gelu'
    factor: float = 1
    n_heads: int = 8
    enc_in: int = 7
args.data_path = '/home/user/suzhao/BehaviorDL/dataset/Huawei'
args.label_flag = 'emotion'
args.epochs = 10
args.seq_len = 1441
args.d_model = 64
args.dropout = 0.6
args.e_layers = 3
args.d_ff = 128
args.output_attention = False
args.activation = 'gelu'
args.factor = 1
args.n_heads = 8
args.num_class = 2
args.enc_in = 7

In [9]:
from data_provider.data_loader import HuaweiDataset, UEAloader, DataLoader
from data_provider.uea import collate_fn

from torch.utils.data import random_split
from models import PatchTST
from torch import optim
from torch import nn

import numpy as np
import time


data_set = UEAloader('/home/user/suzhao/BehaviorDL/dataset/Heartbeat')

train_size = int(0.8 * len(data_set))
test_size = len(data_set) - train_size
train_dataset, test_dataset = random_split(data_set, [train_size, test_size])

train_loader = DataLoader(
    train_dataset, batch_size=10, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(
    test_dataset, batch_size=10, shuffle=False, collate_fn=collate_fn)

model = PatchTST.Model(args).float()

train_steps = len(train_loader)
model_optim = optim.Adam(model.parameters(), lr=0.02)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    iter_count = 0
    train_loss = []

    model.train()
    epoch_time = time.time()

    for i, (batch_x, label, padding_mask) in enumerate(train_loader):
        model_optim.zero_grad()

        batch_x = batch_x.float()
        padding_mask = padding_mask.float()

        output = model(batch_x, padding_mask)
        loss = criterion(output, label.long().squeeze(-1))

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=4.0)
        model_optim.step()
    
    print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))


204


RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x195200 and 80640x2)