In [1]:
import pandas as pd
import time
import torch
import pickle
from torch import nn, optim
from torch.utils.data import DataLoader

from sklearn.preprocessing import StandardScaler

from evd.data import TimeseriesDataset
from evd.model import LSTMModel
from evd.training import trainer, EarlyStopping

print(torch.__version__)  
print(torch.version.cuda) 

2.4.1+cu124
12.4


<font size = 6> Data preprocessing </font>

In [2]:
start = time.time()

data1 = pd.read_parquet(r'C:\Users\4019-tjyen\Desktop\child-mind-institute-detect-sleep-states\train_series.parquet')
data2 = pd.read_csv(r'C:\Users\4019-tjyen\Desktop\child-mind-institute-detect-sleep-states\train_events.csv')

merge_data = pd.merge(data1, data2, on=["series_id","timestamp"], how="left", indicator=True)

merge_data = merge_data.drop(columns=['night', 'step_y'])

load_time = time.time()-start

print('loading time: {:>6.2f} secs.'.format(load_time))

merge_data["event"].fillna(0, inplace=True)
merge_data["event"] = merge_data["event"].replace({"onset": 1, "wakeup": 2})

missing_values = merge_data.isnull().sum()


print(merge_data.head())
print(merge_data['event'].unique()) 
print(merge_data[["timestamp", "series_id", "event"]].head())
print(merge_data[["anglez", "enmo"]].isnull().sum())
print(missing_values)

loading time: 138.77 secs.
      series_id  step_x                 timestamp  anglez    enmo  event  \
0  038441c925bb       0  2018-08-14T15:30:00-0400  2.6367  0.0217      0   
1  038441c925bb       1  2018-08-14T15:30:05-0400  2.6368  0.0215      0   
2  038441c925bb       2  2018-08-14T15:30:10-0400  2.6370  0.0216      0   
3  038441c925bb       3  2018-08-14T15:30:15-0400  2.6368  0.0213      0   
4  038441c925bb       4  2018-08-14T15:30:20-0400  2.6368  0.0215      0   

      _merge  
0  left_only  
1  left_only  
2  left_only  
3  left_only  
4  left_only  
[0 1 2]
                  timestamp     series_id  event
0  2018-08-14T15:30:00-0400  038441c925bb      0
1  2018-08-14T15:30:05-0400  038441c925bb      0
2  2018-08-14T15:30:10-0400  038441c925bb      0
3  2018-08-14T15:30:15-0400  038441c925bb      0
4  2018-08-14T15:30:20-0400  038441c925bb      0
anglez    0
enmo      0
dtype: int64
series_id    0
step_x       0
timestamp    0
anglez       0
enmo         0
event       

In [3]:
# 檢查是否有NaN值
nan_check = merge_data.isna()

# 查看每一列是否有NaN值的數量
nan_count = nan_check.sum()

# 查看是否有任何NaN值
has_nan = nan_check.any().any()

# 輸出結果
print("每一列的NaN數量：\n", nan_count)
print("是否有NaN值：", has_nan)

每一列的NaN數量：
 series_id    0
step_x       0
timestamp    0
anglez       0
enmo         0
event        0
_merge       0
dtype: int64
是否有NaN值： False


In [4]:
scaler = StandardScaler()
merge_data[["anglez","enmo"]] = scaler.fit_transform(merge_data[["anglez", "enmo"]])
print(merge_data[["anglez","enmo"]].head())

     anglez      enmo
0  0.322257 -0.192628
1  0.322260 -0.194592
2  0.322266 -0.193610
3  0.322260 -0.196556
4  0.322260 -0.194592


<font size = 6> Selecting a subject </font>

In [5]:
individual_data = merge_data[merge_data['series_id']=='038441c925bb']
#the numbers of features 
print("數量", individual_data.shape[1] -1)

數量 6


<font size = 6> Setting hyperparameters </font>

In [6]:
seq_len = 128
train_idx = 300000 ## First 300000 timesteps as training data and the rest as validation data

input_dim = 4
hidden_dim = 64
num_layers = 3
num_classes = 3
batch_size = 128
num_epochs = 100
patience = 10

lr = 0.005

<font size = 6> Setting GPU </font>

In [7]:
#check GPU  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#x = torch.rand(1000, 1000, device=device)
#y = torch.matmul(x, x)
#print(y)  # 應在 GPU 上執行

In [8]:
#AngleZ change

individual_data['anglez_change'] = individual_data['anglez'] - individual_data['anglez'].shift(1)

#Enmo change

individual_data['enmo_change'] = individual_data["enmo"] - individual_data['enmo'].shift(1)

individual_data['enmo_change'].iloc[0] = 0
individual_data['anglez_change'].iloc[0] = 0

print(individual_data.head())

individual_data.to_csv("individual_data.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  individual_data['anglez_change'] = individual_data['anglez'] - individual_data['anglez'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  individual_data['enmo_change'] = individual_data["enmo"] - individual_data['enmo'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  individual_data['enmo_change'].iloc[0] = 0
A value is trying to be set o

      series_id  step_x                 timestamp    anglez      enmo  event  \
0  038441c925bb       0  2018-08-14T15:30:00-0400  0.322257 -0.192628      0   
1  038441c925bb       1  2018-08-14T15:30:05-0400  0.322260 -0.194592      0   
2  038441c925bb       2  2018-08-14T15:30:10-0400  0.322266 -0.193610      0   
3  038441c925bb       3  2018-08-14T15:30:15-0400  0.322260 -0.196556      0   
4  038441c925bb       4  2018-08-14T15:30:20-0400  0.322260 -0.194592      0   

      _merge  anglez_change  enmo_change  
0  left_only       0.000000     0.000000  
1  left_only       0.000003    -0.001964  
2  left_only       0.000006     0.000982  
3  left_only      -0.000006    -0.002946  
4  left_only       0.000000     0.001964  


<font size = 6> Datasets </font>

In [9]:
X = torch.tensor(individual_data[['anglez','enmo','anglez_change', 'enmo_change']].values, dtype=torch.float32)
y = torch.tensor(individual_data['event'].values, dtype=torch.int64)


train_dataset = TimeseriesDataset(X=X[:train_idx],
                                  y=y[:train_idx],
                                  seq_len=seq_len,
                                  transform=None)

valid_dataset = TimeseriesDataset(X=X[train_idx:],
                                  y=y[train_idx:],
                                  seq_len=seq_len,
                                  transform=None)


train_loader = DataLoader(train_dataset, 
                          batch_size = batch_size, 
                          drop_last=True,
                          shuffle = True)

valid_loader = DataLoader(valid_dataset, 
                          batch_size = batch_size, 
                          drop_last=True,
                          shuffle = False)


In [10]:
#save valid_loader
with open('valid_loader.pkl', 'wb') as f:
    pickle.dump(valid_loader, f)

<font size = 6> Model </font>

In [11]:
model = LSTMModel(input_dim=input_dim, hidden_dim=hidden_dim, num_layers=num_layers, num_classes=num_classes).to(device)
print(model)

LSTMModel(
  (lstm): LSTM(4, 64, num_layers=3)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)


<font size = 6> Training </font>

In [139]:

loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
early_stopping = EarlyStopping(patience=patience, verbose=True)
 

In [140]:
trainer(num_epochs=num_epochs, 
        model=model, 
        loss=loss,
        optimizer=optimizer,
        train_loader=train_loader,
        valid_loader=valid_loader,
        early_stopping=early_stopping,
        device=device)

Epoch:  14; valid iteration:  701; time:   0.71 secs; loss: 0.0050
 Early stopping triggered.

 Early stopping at epoch  14


In [141]:
torch.save(model.state_dict(), "train_model.pth")