In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import torchmetrics
import os
import ruptures as rpt

def delete_number(string):
    return ''.join([i for i in string if not i.isdigit()])

def create_sliding_windows(data, window_length, stride, label):
    labels = np.ones(data.shape[0])*label
    num_windows = (len(data) - window_length) // stride + 1

    windows = np.lib.stride_tricks.sliding_window_view(data, (window_length, data.shape[1]))
    windows = windows[::stride, 0, :, :]
    y_windows = labels[np.arange(0, num_windows * stride, stride)]  # Class labels at the start of each window

    return windows, y_windows



  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
cpd_model = rpt.Binseg(model = 'l2',jump=500, min_size=4500)
def change_point_detection(df, throw_away = 0.1):
    """
    Changepoint Detection
    Input: - df:  from read_data
           - key: which is the filename
           - throw_away: everything smaller than 10% of the total length of data is thrown away

    Returns: - return_dict: A dictionary with all the data and the format key_cnt
    """
    signal = df[['accelerometer_x','accelerometer_y','accelerometer_z']].values
    length = len(df)
    algo = cpd_model.fit(signal)
    result = [0]
    result += algo.predict(pen=1000)
    if result[-1] != length:
        result += [length]
        
    return_df = pd.DataFrame()
    for i in range(len(result)-1):
        if result[i+1]-result[i] > length*throw_away:
            if return_df.empty:
                return_df = df.iloc[result[i]:result[i+1]]
            else:
                pd.concat([return_df, df.iloc[result[i]:result[i+1]]],ignore_index=True)
            
    """
    This is the seperated version, so we dont have windows that belong to different changepoint sections
    return_dict = {}
    cnt = 1
    for i in range(len(result)-1):
        if result[i+1]-result[i] > length*throw_away:
            return_dict[key+'_'+str(cnt)] = df.iloc[result[i]:result[i+1]]
            cnt+=1
    """
    return return_df

In [3]:
folder_path = 'data'
def read_data(filename):
    accelerometer = pd.read_csv(os.path.join(folder_path, filename, 'Accelerometer.csv'),sep=';')
    accelerometer['Time (s)'] = pd.to_datetime(accelerometer['Time (s)'], unit='s')
    accelerometer = accelerometer.set_index('Time (s)')
    accelerometer = accelerometer.resample('2.5ms').mean()
    accelerometer.rename(columns={'Acceleration x (m/s^2)':'accelerometer_x','Acceleration y (m/s^2)':'accelerometer_y','Acceleration z (m/s^2)':'accelerometer_z'}, inplace=True)
    #accelerometer =accelerometer[9000:-9000]
    accelerometer.reset_index(inplace=True)

    gyroscope = pd.read_csv(os.path.join(folder_path, filename,'Gyroscope.csv'),sep=';')
    gyroscope['Time (s)'] = pd.to_datetime(gyroscope['Time (s)'], unit='s')
    gyroscope =gyroscope.set_index('Time (s)')
    gyroscope = gyroscope.resample('2.5ms').mean()
    gyroscope.rename(columns={'Gyroscope x (rad/s)':'gyroscope_x','Gyroscope y (rad/s)':'gyroscope_y','Gyroscope z (rad/s)':'gyroscope_z'}, inplace=True)
    #gyroscope = gyroscope[9000:-9000]
    gyroscope.reset_index(inplace=True)
    merged = pd.merge(accelerometer,gyroscope, on= 'Time (s)', how='inner')
    return change_point_detection(merged)

In [4]:
# Adjust these paths as needed
data_path = 'data'  # Path to your new dataset
verification_keys = ["nick2", "till2", "uta2", "paula2"]

data_dict = {}
name_to_idx = {}  # name -> idx
i = 0

for name in os.listdir(data_path):
    data_dict[name] = read_data(name)  # Assuming read_data processes the files correctly
    data_dict[name].drop(columns=['Time (s)'], inplace=True)
    if delete_number(name) not in name_to_idx.keys():
        name_to_idx[delete_number(name)] = i
        i += 1

# Adjust window length and stride as needed
window_length = 1620
stride = 1000

X_train_list = []
y_train_list = []
X_test_list = []
y_test_list = []

for name, df in data_dict.items():
    if name in verification_keys:
        continue
    label = name_to_idx[delete_number(name)]
    train = df[:int(0.8 * len(df))]
    train_normalized = (train - train.min()) / (train.max() - train.min())

    test = df[int(0.8 * len(df)):]
    test_normalized = (test - train.min()) / (train.max() - train.min())

    X_train, y_train = create_sliding_windows(train_normalized, window_length, stride, label)
    X_test, y_test = create_sliding_windows(test_normalized, window_length, stride, label)

    X_train_list.append(X_train)
    y_train_list.append(y_train)
    X_test_list.append(X_test)
    y_test_list.append(y_test)

X_train = np.concatenate(X_train_list, axis=0)
y_train = np.concatenate(y_train_list, axis=0)
X_test = np.concatenate(X_test_list, axis=0)
y_test = np.concatenate(y_test_list, axis=0)

# Verify shapes
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Label encoding
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y_train)

class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = torch.Tensor(self.sequences[idx])
        label = torch.Tensor([self.labels[idx]]).long()
        return dict(sequence=sequence, labels=label)

train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=os.cpu_count(),persistent_workers=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=os.cpu_count(),persistent_workers=True)


(1593, 1620, 6) (1593,) (386, 1620, 6) (386,)


In [5]:
class SequenceModel(nn.Module):
    def __init__(self, n_features, n_classes, n_hidden=512, n_layers=3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            num_layers=n_layers,
            batch_first=True,
            dropout=0.5
        )
        self.classifier = nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(x)
        out = hidden[-1]
        return self.classifier(out)

In [6]:
class SurfacePredictor(pl.LightningModule):
    def __init__(self, n_features, n_classes):
        super().__init__()
        self.model = SequenceModel(n_features, n_classes)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["labels"].squeeze()
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_accuracy = torchmetrics.functional.accuracy(predictions, labels, task="multiclass", num_classes=len(label_encoder.classes_))
        self.log("train_loss", loss, prog_bar=True, logger=True)
        self.log("train_accuracy", step_accuracy, prog_bar=True, logger=True)
        return {"loss": loss, "accuracy": step_accuracy}

    def validation_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["labels"].squeeze()
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_accuracy = torchmetrics.functional.accuracy(predictions, labels, task="multiclass", num_classes=len(label_encoder.classes_))
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_accuracy", step_accuracy, prog_bar=True, logger=True)
        return {"loss": loss, "accuracy": step_accuracy}

    def test_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["labels"].squeeze()
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_accuracy = torchmetrics.functional.accuracy(predictions, labels, task="multiclass", num_classes=len(label_encoder.classes_))
        self.log("test_loss", loss, prog_bar=True, logger=True)
        self.log("test_accuracy", step_accuracy, prog_bar=True, logger=True)
        return {"loss": loss, "accuracy": step_accuracy}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.0001)

# Training
N_EPOCHS = 300
model = SurfacePredictor(n_features=X_train.shape[2], n_classes=len(label_encoder.classes_))

checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

trainer = pl.Trainer(
    max_epochs=N_EPOCHS,
    callbacks=[checkpoint_callback],
    accelerator="auto",
    gpus=1 if torch.cuda.is_available() else 0,)

trainer.fit(model, train_loader, test_loader)
trainer.test(model, test_dataloaders=test_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | SequenceModel    | 5.3 M  | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
5.3 M     Trainable params
0         Non-trainable params
5.3 M     Total params
21.094    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]