In [1]:
import pandas as pd
import os
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact
#import plotly.graph_objects as go
import ruptures as rpt
from itertools import combinations as comb
#from statsmodels.stats import power
import numpy as np
from scipy import stats
from scipy.signal import find_peaks

from tqdm import tqdm
#import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from scipy.spatial.distance import cdist
import colorsys

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, KernelPCA
import warnings
warnings.filterwarnings('ignore')

In [2]:
cpd_model = rpt.Binseg(model = 'l2',jump=500, min_size=4500)
def change_point_detection(df, throw_away = 0.1):
    """
    Changepoint Detection
    Input: - df:  from read_data
           - key: which is the filename
           - throw_away: everything smaller than 10% of the total length of data is thrown away

    Returns: - return_dict: A dictionary with all the data and the format key_cnt
    """
    signal = df[['accelerometer_x','accelerometer_y','accelerometer_z']].values
    length = len(df)
    algo = cpd_model.fit(signal)
    result = [0]
    result += algo.predict(pen=1000)
    if result[-1] != length:
        result += [length]
        
    return_df = pd.DataFrame()
    for i in range(len(result)-1):
        if (result[i+1]-result[i]) > 4500:
            if return_df.empty:
                return_df = df.iloc[result[i]:result[i+1]]
            else:
                return_df = pd.concat([return_df, df.iloc[result[i]:result[i+1]]],ignore_index=True)
       
    """
    This is the seperated version, so we dont have windows that belong to different changepoint sections
    return_dict = {}
    cnt = 1
    for i in range(len(result)-1):
        if result[i+1]-result[i] > length*throw_away:
            return_dict[key+'_'+str(cnt)] = df.iloc[result[i]:result[i+1]]
            cnt+=1
    """
    return return_df

In [3]:
folder_path = 'data'
def read_data(filename):
    accelerometer = pd.read_csv(os.path.join(folder_path, filename, 'Accelerometer.csv'),sep=';')
    accelerometer['Time (s)'] = pd.to_datetime(accelerometer['Time (s)'], unit='s')
    accelerometer =accelerometer.set_index('Time (s)')
    accelerometer = accelerometer.resample('2.5ms').mean()
    accelerometer.rename(columns={'Acceleration x (m/s^2)':'accelerometer_x','Acceleration y (m/s^2)':'accelerometer_y','Acceleration z (m/s^2)':'accelerometer_z'}, inplace=True)
    #accelerometer = accelerometer[9000:-9000]
    accelerometer.reset_index(inplace=True)

    
    gyroscope = pd.read_csv(os.path.join(folder_path, filename,'Gyroscope.csv'),sep=';')
    gyroscope['Time (s)'] = pd.to_datetime(gyroscope['Time (s)'], unit='s')
    gyroscope =gyroscope.set_index('Time (s)')
    gyroscope = gyroscope.resample('2.5ms').mean()
    gyroscope.rename(columns={'Gyroscope x (rad/s)':'gyroscope_x','Gyroscope y (rad/s)':'gyroscope_y','Gyroscope z (rad/s)':'gyroscope_z'}, inplace=True)
    #gyroscope = gyroscope[9000:-9000]
    gyroscope.reset_index(inplace=True)
    merged = pd.merge(accelerometer,gyroscope, on= 'Time (s)', how='inner')
    #return merged
    return change_point_detection(merged)
    

In [4]:
# regex to delete the number at the end of the string:
def delete_number(string):
    return ''.join([i for i in string if not i.isdigit()])

In [5]:
data_dict = {}
name_to_idx = {} # name -> idx
i = 0

for name in tqdm(sorted(os.listdir(folder_path))):
    data_dict[name] = read_data(name)
    data_dict[name].set_index('Time (s)', inplace=True)
    data_dict[name].interpolate(inplace=True, method="time")

    # add label
    if delete_number(name) not in name_to_idx.keys():
        name_to_idx[delete_number(name)] = i
        i+=1

100%|███████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:23<00:00,  1.05s/it]


In [6]:
def add_feature(axis_list, axis, X_train, appendix=""):
    #mean
    X_train[axis+'_mean'+appendix] = pd.Series(axis_list).apply(lambda x: x.mean())
    #std dev
    X_train[axis+'_std'+appendix] = pd.Series(axis_list).apply(lambda x: x.std())
    #avg absolute difference
    X_train[axis+'_aad'+appendix] = pd.Series(axis_list).apply(lambda x: np.mean(np.absolute(x - np.mean(x))))
    #min
    X_train[axis+'_min'+appendix] = pd.Series(axis_list).apply(lambda x: x.min())
    #max
    X_train[axis+'_max'+appendix] = pd.Series(axis_list).apply(lambda x: x.max())
    #max-min diff
    X_train[axis+'_maxmin_diff'+appendix] = X_train[axis+'_max'+appendix] - X_train[axis+'_min'+appendix]
    #median
    X_train[axis+'_median'+appendix] = pd.Series(axis_list).apply(lambda x: np.median(x))
    #median absolut deviation
    X_train[axis+'_mad'+appendix] = pd.Series(axis_list).apply(lambda x: np.median(np.absolute(x - np.median(x))))
    #interquartile range
    X_train[axis+'_IQR'+appendix] = pd.Series(axis_list).apply(lambda x: np.percentile(x, 75) - np.percentile(x, 25))

    if appendix != "_fft":
        #negative count
        X_train[axis+'_neg_count'+appendix] = pd.Series(axis_list).apply(lambda x: np.sum(x < 0))
        #positive count
        X_train[axis+'_pos_count'+appendix] = pd.Series(axis_list).apply(lambda x: np.sum(x > 0))
    
    #values above mean
    X_train[axis+'_above_mean'+appendix] = pd.Series(axis_list).apply(lambda x: np.sum(x > x.mean()))
    #number of peaks
    X_train[axis+'_peak_count'+appendix] = pd.Series(axis_list).apply(lambda x: len(find_peaks(x)[0]))
    #skewness
    X_train[axis+'_skewness'+appendix] = pd.Series(axis_list).apply(lambda x: stats.skew(x))
    #kurtosis
    X_train[axis+'_kurtosis'+appendix] = pd.Series(axis_list).apply(lambda x: stats.kurtosis(x))
    # energy
    X_train[axis+'_energy'+appendix] = pd.Series(axis_list).apply(lambda x: np.sum(x**2)/100)

    return X_train

In [7]:
verification_keys = [] # "nick2", "till2", "uta2", "paula2"

In [8]:
name_to_idx

{'chris': 0,
 'felix': 1,
 'katarina': 2,
 'kirill': 3,
 'leon': 4,
 'leonie': 5,
 'lucas': 6,
 'luisa': 7,
 'melna': 8,
 'nele': 9,
 'nick': 10,
 'paula': 11,
 'rebecca': 12,
 'till': 13,
 'uta': 14}

In [9]:
# Initializing lists
x_list = []
y_list = []
z_list = []

x_val_list = []
y_val_list = []
z_val_list = []
val_labels = []

train_labels = []
x_test_list = []
y_test_list = []
z_test_list = []
test_labels = []

gx_list = []
gy_list = []
gz_list = []

gx_val_list = []
gy_val_list = []
gz_val_list = []

gx_test_list = []
gy_test_list = []
gz_test_list = []

window_size = int(410 * 5) # we give the model 5 steps
step_size = 410

# Creating overlapping windows of size window_size
for name, df_train in data_dict.items():
    label = name_to_idx[delete_number(name)]
    n_train_end = int(df_train.shape[0] * 0.7)  # 70% train
    n_test_end = int(df_train.shape[0] * 0.85)  # next 15% test (and the last 15% for val)

    # Training data
    for i in range(0, n_train_end - window_size, step_size):
        xs = df_train['accelerometer_x'].values[i: i + window_size]
        ys = df_train['accelerometer_y'].values[i: i + window_size]
        zs = df_train['accelerometer_z'].values[i: i + window_size]
        x_list.append(xs)
        y_list.append(ys)
        z_list.append(zs)
        train_labels.append(label)
        
        gxs = df_train['gyroscope_x'].values[i: i + window_size]
        gys = df_train['gyroscope_y'].values[i: i + window_size]
        gzs = df_train['gyroscope_z'].values[i: i + window_size]
        gx_list.append(gxs)
        gy_list.append(gys)
        gz_list.append(gzs)
    
    # Testing data
    for i in range(n_train_end, n_test_end - window_size, step_size):
        xs = df_train['accelerometer_x'].values[i: i + window_size]
        ys = df_train['accelerometer_y'].values[i: i + window_size]
        zs = df_train['accelerometer_z'].values[i: i + window_size]
        x_test_list.append(xs)
        y_test_list.append(ys)
        z_test_list.append(zs)
        test_labels.append(label)

        gxs = df_train['gyroscope_x'].values[i: i + window_size]
        gys = df_train['gyroscope_y'].values[i: i + window_size]
        gzs = df_train['gyroscope_z'].values[i: i + window_size]
        gx_test_list.append(gxs)
        gy_test_list.append(gys)
        gz_test_list.append(gzs)

    # Validation data
    for i in range(n_test_end, df_train.shape[0] - window_size, step_size):
        xs = df_train['accelerometer_x'].values[i: i + window_size]
        ys = df_train['accelerometer_y'].values[i: i + window_size]
        zs = df_train['accelerometer_z'].values[i: i + window_size]
        x_val_list.append(xs)
        y_val_list.append(ys)
        z_val_list.append(zs)
        val_labels.append(label)

        gxs = df_train['gyroscope_x'].values[i: i + window_size]
        gys = df_train['gyroscope_y'].values[i: i + window_size]
        gzs = df_train['gyroscope_z'].values[i: i + window_size]
        gx_val_list.append(gxs)
        gy_val_list.append(gys)
        gz_val_list.append(gzs)


In [10]:
def combine_data(acc_x, acc_y, acc_z, gyro_x, gyro_y, gyro_z):
    combined_data = np.stack([acc_x, acc_y, acc_z, gyro_x, gyro_y, gyro_z], axis=2)
    return combined_data

def min_max_normalize(data):
    min_val = np.nanmin(data, axis=1, keepdims=True)
    max_val = np.nanmax(data, axis=1, keepdims=True)
    normalized_data = (data - min_val) / (max_val - min_val + 1e-8)
    return normalized_data

# Convert lists to numpy arrays
x_train_array = np.array(x_list)
y_train_array = np.array(y_list)
z_train_array = np.array(z_list)
gx_train_array = np.array(gx_list)
gy_train_array = np.array(gy_list)
gz_train_array = np.array(gz_list)
y_train = np.array(train_labels)

x_val_array = np.array(x_val_list)
y_val_array = np.array(y_val_list)
z_val_array = np.array(z_val_list)
gx_val_array = np.array(gx_val_list)
gy_val_array = np.array(gy_val_list)
gz_val_array = np.array(gz_val_list)
y_val = np.array(val_labels)

x_test_array = np.array(x_test_list)
y_test_array = np.array(y_test_list)
z_test_array = np.array(z_test_list)
gx_test_array = np.array(gx_test_list)
gy_test_array = np.array(gy_test_list)
gz_test_array = np.array(gz_test_list)
y_test = np.array(test_labels)

# Combine accelerometer and gyroscope data for each dataset
X_train = combine_data(x_train_array, y_train_array, z_train_array, gx_train_array, gy_train_array, gz_train_array)
X_val = combine_data(x_val_array, y_val_array, z_val_array, gx_val_array, gy_val_array, gz_val_array)
X_test = combine_data(x_test_array, y_test_array, z_test_array, gx_test_array, gy_test_array, gz_test_array)

# Normalize each sample in the datasets
X_train = np.array([min_max_normalize(sample) for sample in X_train])
X_val = np.array([min_max_normalize(sample) for sample in X_val])
X_test = np.array([min_max_normalize(sample) for sample in X_test])

# Shuffle the training data
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Print shapes to verify
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)

X_train, y_train = shuffle(X_train, y_train, random_state=42)


X_train shape: (9239, 2050, 6)
X_val shape: (1903, 2050, 6)
X_test shape: (1903, 2050, 6)
y_train shape: (9239,)
y_val shape: (1903,)
y_test shape: (1903,)


In [11]:
X_train_flattened = X_train.reshape(X_train.shape[0], -1)
X_val_flattened = X_val.reshape(X_val.shape[0], -1)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)

# Window length vs avg F1 score data

Window length, step size:
Weighted avg of: Precision, Recall, F1 score

- 410 * 5, 410
XGBoost 0.99      0.99      0.99, LogisticRegression 0.97      0.97      0.97

- 410 * 4, 410
XGBoost 1.00      1.00      1.00, LogisticRegression 0.96      0.96      0.96

- 410 * 3, 410
XGBoost 1.00      1.00      1.00, LogisticRegression 0.96      0.96      0.96

- 410 * 2, 410
XGBoost 0.99      0.99      0.99, LogisticRegression 0.97      0.96      0.96

- 410 * 1, 410
XGBoost 0.99      0.99      0.99, LogisticRegression 0.91      0.89      0.90

- 410 * 0.1, 410
XGBoost 0.94      0.94      0.94, LogisticRegression 0.56      0.56      0.54

- 410 * 0.01, 410
XGBoost 0.80      0.80      0.79, LogisticRegression 0.30      0.30      0.25

## Autoencoder

In [12]:
"""import tensorflow as tf

# Define the dimensions
input_dim = X_train_flattened.shape[1]
latent_dim = 2

# Define the encoder
encoder = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(latent_dim)  # No activation for direct latent representation
])

# Define the decoder
decoder = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(latent_dim,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dense(input_dim, activation='linear')  # Sigmoid to match input range
])

# Combine encoder and decoder into autoencoder
autoencoder = tf.keras.Model(inputs=encoder.input, outputs=decoder(encoder.output))

# Compile the model
autoencoder.compile(optimizer='adam', loss='mae')

# Print summary
autoencoder.summary()

# Train the autoencoder
history = autoencoder.fit(x=X_train_flattened, y=X_train_flattened, epochs=300, batch_size=512, validation_split=0.01)
"""

"import tensorflow as tf\n\n# Define the dimensions\ninput_dim = X_train_flattened.shape[1]\nlatent_dim = 2\n\n# Define the encoder\nencoder = tf.keras.Sequential([\n    tf.keras.layers.Input(shape=(input_dim,)),\n    tf.keras.layers.Dense(1024, activation='relu'),\n    tf.keras.layers.Dense(1024, activation='relu'),\n    tf.keras.layers.Dense(512, activation='relu'),\n    tf.keras.layers.Dense(512, activation='relu'),\n    tf.keras.layers.Dense(256, activation='relu'),\n    tf.keras.layers.Dense(256, activation='relu'),\n    tf.keras.layers.Dense(128, activation='relu'),\n    tf.keras.layers.Dense(128, activation='relu'),\n    tf.keras.layers.Dense(latent_dim)  # No activation for direct latent representation\n])\n\n# Define the decoder\ndecoder = tf.keras.Sequential([\n    tf.keras.layers.Input(shape=(latent_dim,)),\n    tf.keras.layers.Dense(128, activation='relu'),\n    tf.keras.layers.Dense(128, activation='relu'),\n    tf.keras.layers.Dense(256, activation='relu'),\n    tf.keras.

In [13]:
"""plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()"""

"plt.plot(history.history['loss'], label='loss')\nplt.plot(history.history['val_loss'], label='val_loss')\nplt.legend()"

In [14]:
"""encoded_train = encoder.predict(X_train_flattened)

# Plotting the encoded_train (latent representations) with y_train as color
plt.figure(figsize=(10, 8))
sns.scatterplot(x=encoded_train[:, 0], y=encoded_train[:, 1], hue=y_train, palette='tab20', s=50, alpha=0.8)
plt.title('Autoencoder Latent Representations')
plt.xlabel('Latent Dimension 1')
plt.ylabel('Latent Dimension 2')
plt.legend(title='Train Label', loc='best')
plt.tight_layout()
plt.show()"""

"encoded_train = encoder.predict(X_train_flattened)\n\n# Plotting the encoded_train (latent representations) with y_train as color\nplt.figure(figsize=(10, 8))\nsns.scatterplot(x=encoded_train[:, 0], y=encoded_train[:, 1], hue=y_train, palette='tab20', s=50, alpha=0.8)\nplt.title('Autoencoder Latent Representations')\nplt.xlabel('Latent Dimension 1')\nplt.ylabel('Latent Dimension 2')\nplt.legend(title='Train Label', loc='best')\nplt.tight_layout()\nplt.show()"

## VAE

In [15]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

2024-07-13 01:02:10.880125: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-13 01:02:10.899337: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
"""# Define the dimensions
input_dim = X_train_flattened.shape[1]
latent_dim = 2

# Sampling function for the latent space
class Sampling(layers.Layer):
    def call(self, inputs):
        mean, log_var = inputs
        batch = tf.shape(mean)[0]
        dim = tf.shape(mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return mean + tf.exp(0.5 * log_var) * epsilon

# Define the encoder
inputs = layers.Input(shape=(input_dim,))
x = layers.Dense(1024, activation='relu')(inputs)
x = layers.Dense(1024, activation='relu')(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(256, activation='relu')(x)
z_mean = layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
z = Sampling()([z_mean, z_log_var])

encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

# Define the decoder
latent_inputs = layers.Input(shape=(latent_dim,))
x = layers.Dense(256, activation='relu')(latent_inputs)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dense(1024, activation='relu')(x)
x = layers.Dense(1024, activation='relu')(x)
outputs = layers.Dense(input_dim, activation='sigmoid')(x)

decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

# Define the VAE
outputs = decoder(z)
vae = Model(inputs, outputs, name='vae')

# Define the VAE loss
reconstruction_loss = tf.keras.losses.mae(inputs, outputs) * input_dim
kl_loss = -0.5 * tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
vae_loss = tf.reduce_mean(reconstruction_loss + 1 * kl_loss)

vae.add_loss(vae_loss)
vae.compile(optimizer='adam')
#vae.summary()
"""

"# Define the dimensions\ninput_dim = X_train_flattened.shape[1]\nlatent_dim = 2\n\n# Sampling function for the latent space\nclass Sampling(layers.Layer):\n    def call(self, inputs):\n        mean, log_var = inputs\n        batch = tf.shape(mean)[0]\n        dim = tf.shape(mean)[1]\n        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))\n        return mean + tf.exp(0.5 * log_var) * epsilon\n\n# Define the encoder\ninputs = layers.Input(shape=(input_dim,))\nx = layers.Dense(1024, activation='relu')(inputs)\nx = layers.Dense(1024, activation='relu')(x)\nx = layers.Dense(512, activation='relu')(x)\nx = layers.Dense(512, activation='relu')(x)\nx = layers.Dense(256, activation='relu')(x)\nx = layers.Dense(256, activation='relu')(x)\nz_mean = layers.Dense(latent_dim, name='z_mean')(x)\nz_log_var = layers.Dense(latent_dim, name='z_log_var')(x)\nz = Sampling()([z_mean, z_log_var])\n\nencoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')\nencoder.summary()\n\n# Define

In [17]:
"""# Train the VAE
history = vae.fit(X_train_flattened, X_train_flattened, epochs=100, batch_size=512, validation_split=0.01)"""

'# Train the VAE\nhistory = vae.fit(X_train_flattened, X_train_flattened, epochs=100, batch_size=512, validation_split=0.01)'

In [18]:
"""plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()"""

"plt.plot(history.history['loss'], label='loss')\nplt.plot(history.history['val_loss'], label='val_loss')\nplt.legend()"

In [19]:
"""# Obtain the latent representations (encodings) for X_train
z_mean, _, _ = encoder.predict(X_train_flattened)

# Plot the encodings with y_train as color
plt.figure(figsize=(10, 8))
sns.scatterplot(x=z_mean[:, 0], y=z_mean[:, 1], hue=y_train, palette='tab20', s=50, alpha=0.8)
plt.title('Variational Autoencoder Latent Representations')
plt.xlabel('Latent Dimension 1')
plt.ylabel('Latent Dimension 2')
plt.legend(title='Train Label', loc='best')
plt.tight_layout()
plt.show()"""

"# Obtain the latent representations (encodings) for X_train\nz_mean, _, _ = encoder.predict(X_train_flattened)\n\n# Plot the encodings with y_train as color\nplt.figure(figsize=(10, 8))\nsns.scatterplot(x=z_mean[:, 0], y=z_mean[:, 1], hue=y_train, palette='tab20', s=50, alpha=0.8)\nplt.title('Variational Autoencoder Latent Representations')\nplt.xlabel('Latent Dimension 1')\nplt.ylabel('Latent Dimension 2')\nplt.legend(title='Train Label', loc='best')\nplt.tight_layout()\nplt.show()"

## t-SNE

In [20]:
"""# X_top_features = X_test[top_features_]
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_train_flattened)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_tsne[:,0], y=X_tsne[:,1], hue=y_train, palette='tab20', s=50, alpha=0.8)
plt.title('t-SNE Visualization of Top 12 Important Features')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend(title='Test Label', loc='best')
plt.tight_layout()
plt.show()"""

"# X_top_features = X_test[top_features_]\ntsne = TSNE(n_components=2, random_state=42, perplexity=30)\nX_tsne = tsne.fit_transform(X_train_flattened)\n\nplt.figure(figsize=(8, 6))\nsns.scatterplot(x=X_tsne[:,0], y=X_tsne[:,1], hue=y_train, palette='tab20', s=50, alpha=0.8)\nplt.title('t-SNE Visualization of Top 12 Important Features')\nplt.xlabel('t-SNE Component 1')\nplt.ylabel('t-SNE Component 2')\nplt.legend(title='Test Label', loc='best')\nplt.tight_layout()\nplt.show()"

In [21]:
"""# Step 2: Apply PCA to reduce dimensionality to 2D
pca = KernelPCA(n_components=2, kernel='poly', degree=10, random_state=42)
# pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(X_val_flattened)

# Step 3: Plotting
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_val, palette='tab20', s=50, alpha=0.8)
plt.title('PCA Visualization of Top 12 Important Features')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Test Label', loc='best')

# Set x-axis and y-axis limits based on the data
x_min, x_max = X_pca[:, 0].min(), X_pca[:, 0].max()
y_min, y_max = X_pca[:, 1].min(), X_pca[:, 1].max()
plt.xlim(x_min - 0.1*(x_max - x_min), x_max + 0.1*(x_max - x_min))
plt.ylim(y_min - 0.1*(y_max - y_min), y_max + 0.1*(y_max - y_min))

plt.tight_layout()
plt.show()
"""

"# Step 2: Apply PCA to reduce dimensionality to 2D\npca = KernelPCA(n_components=2, kernel='poly', degree=10, random_state=42)\n# pca = PCA(n_components=2, random_state=0)\nX_pca = pca.fit_transform(X_val_flattened)\n\n# Step 3: Plotting\nplt.figure(figsize=(8, 6))\nsns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_val, palette='tab20', s=50, alpha=0.8)\nplt.title('PCA Visualization of Top 12 Important Features')\nplt.xlabel('PCA Component 1')\nplt.ylabel('PCA Component 2')\nplt.legend(title='Test Label', loc='best')\n\n# Set x-axis and y-axis limits based on the data\nx_min, x_max = X_pca[:, 0].min(), X_pca[:, 0].max()\ny_min, y_max = X_pca[:, 1].min(), X_pca[:, 1].max()\nplt.xlim(x_min - 0.1*(x_max - x_min), x_max + 0.1*(x_max - x_min))\nplt.ylim(y_min - 0.1*(y_max - y_min), y_max + 0.1*(y_max - y_min))\n\nplt.tight_layout()\nplt.show()\n"

## LSTM

In [22]:
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import torchmetrics
import os
import ruptures as rpt


In [23]:
# Label encoding
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y_train)

class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = torch.Tensor(self.sequences[idx])
        label = torch.Tensor([self.labels[idx]]).long()
        return dict(sequence=sequence, labels=label)

train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)
test_dataset = TimeSeriesDataset(X_test, y_test)

BATCH_SIZE = 128
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=os.cpu_count(),persistent_workers=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=os.cpu_count(),persistent_workers=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=os.cpu_count(),persistent_workers=True)


In [24]:
class SequenceModel(nn.Module):
    def __init__(self, n_features, n_classes, n_hidden=512, n_layers=3, dropout=0.5):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            num_layers=n_layers,
            batch_first=True,
            dropout=0.5
        )
        self.classifier = nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(x)
        out = hidden[-1]
        return self.classifier(out)

In [25]:
class LSTMPredictor(pl.LightningModule):
    def __init__(self, n_features, n_classes, lr=0.001, max_epochs=10, n_hidden=512, n_layers=3, dropout=0.5, config=None):
        super().__init__()
        self.save_hyperparameters()
        self.model = SequenceModel(n_features, n_classes, n_hidden, n_layers, dropout)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x, labels=None):
        output = self.model(x)
        loss = self.criterion(output, labels) if labels is not None else torch.tensor(0)
        return loss, output

    def training_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["labels"].squeeze()
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_f1 = torchmetrics.functional.f1_score(predictions, labels, task="multiclass", num_classes=self.hparams.n_classes, average='weighted')
        self.log("train_loss", loss, prog_bar=True, logger=True)
        self.log("train_accuracy", step_f1, prog_bar=True, logger=True)
        return {"loss": loss, "f1_score": step_f1}

    def validation_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["labels"].squeeze()
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_f1 = torchmetrics.functional.f1_score(predictions, labels, task="multiclass", num_classes=self.hparams.n_classes, average='weighted')
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_accuracy", step_f1, prog_bar=True, logger=True)
        return {"loss": loss, "f1_score": step_f1}

    def test_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["labels"].squeeze()
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_f1 = torchmetrics.functional.f1_score(predictions, labels, task="multiclass", num_classes=self.hparams.n_classes, average='weighted')
        self.log("test_loss", loss, prog_bar=True, logger=True)
        self.log("test_accuracy", step_f1, prog_bar=True, logger=True)
        return {"loss": loss, "f1_score": step_f1}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

    def configure_callbacks(self):
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            dirpath="checkpoints",
            filename="best-checkpoint",
            monitor="val_loss",
            mode="min",
            save_top_k=1,
            verbose=True
        )
        return [] # [checkpoint_callback]

# Training

"""N_EPOCHS = 1
model = LSTMPredictor(
    n_features=X_train.shape[2],
    n_classes=len(label_encoder.classes_),
    lr=0.01,
    max_epochs=3
)

trainer = pl.Trainer(
    max_epochs=model.hparams.max_epochs,
    callbacks=model.configure_callbacks(),
    accelerator="auto"
)

trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)"""


'N_EPOCHS = 1\nmodel = LSTMPredictor(\n    n_features=X_train.shape[2],\n    n_classes=len(label_encoder.classes_),\n    lr=0.01,\n    max_epochs=3\n)\n\ntrainer = pl.Trainer(\n    max_epochs=model.hparams.max_epochs,\n    callbacks=model.configure_callbacks(),\n    accelerator="auto"\n)\n\ntrainer.fit(model, train_loader, val_loader)\ntrainer.test(model, test_loader)'

## Tuning

In [29]:
import wandb
os.environ['WANDB_NOTEBOOK_NAME'] = 'shifted_windows_LSTM_tuning.ipynb'

In [31]:
def train_evaluate(config=None):
    if config is None:
        config = {}  # Initialize config if not provided

    pl.seed_everything(42)  # Ensure reproducibility

    # Initialize wandb
    wandb.init(config=config)
    wandb.config.update(config)

    # Create LightningModule instance
    model = LSTMPredictor(
        n_features=X_train.shape[2],
        n_classes=len(label_encoder.classes_),
        max_epochs=config.get('max_epochs', 10),
        dropout=config.get('dropout', 0.5)
    )

    # Initialize Lightning Trainer
    trainer = pl.Trainer(
        max_epochs=config.get('max_epochs', 10),
        callbacks=model.configure_callbacks(),
        logger=pl.loggers.WandbLogger(),
        accelerator="auto",
    )

    # Train the model
    trainer.fit(model, train_loader, val_loader)

    # Test the model
    trainer.test(model, test_loader)

    # Close wandb at the end of the run
    wandb.finish()

# Define sweep configuration
sweep_config = {
    'method': 'grid',
    'metric': {
        'name': 'val_accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'max_epochs': {
            'value': 10
        },
        'dropout': {
            'values': [0.3, 0.5, 0.7]
        }
    }
}

# Initialize the sweep
sweep_id = wandb.sweep(sweep_config, entity='ds4w', project='ds4w')

# Perform the sweep
wandb.agent(sweep_id, function=train_evaluate, count=3)

Create sweep with ID: 7ekwzu6g
Sweep URL: https://wandb.ai/ds4w/ds4w/sweeps/7ekwzu6g


[34m[1mwandb[0m: Agent Starting Run: akqzt1la with config:
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	max_epochs: 10
Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | SequenceModel    | 5.3 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
5.3 M     Trainable params
0         Non-trainable params
5.3 M     Total params
21.101    Total estimated model params size (MB)


Sanity Checking: |                                                                                        | 0/…

Training: |                                                                                               | 0/…

Validation: |                                                                                             | 0/…

Validation: |                                                                                             | 0/…

Validation: |                                                                                             | 0/…

Validation: |                                                                                             | 0/…

Validation: |                                                                                             | 0/…

Validation: |                                                                                             | 0/…

Validation: |                                                                                             | 0/…

Validation: |                                                                                             | 0/…

Validation: |                                                                                             | 0/…

Validation: |                                                                                             | 0/…

`Trainer.fit` stopped: `max_epochs=10` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                | 0/…

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy         0.14883427321910858
        test_loss           2.6056838035583496
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▂▂▃▃▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇█
test_accuracy,▁
test_loss,▁
train_accuracy,▃█▄▅▅▄▁▄▇▇▂█▂▂
train_loss,▆█▁▆▇▇█▇▅▆▆▆▇▆
trainer/global_step,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇███
val_accuracy,▄▁████████
val_loss,▁▅████████

0,1
epoch,10.0
test_accuracy,0.14883
test_loss,2.60568
train_accuracy,0.04625
train_loss,2.55728
trainer/global_step,730.0
val_accuracy,0.15045
val_loss,2.58999


In [32]:
wandb.agent(sweep_id, function=train_evaluate, count=3)

[34m[1mwandb[0m: Agent Starting Run: 1ljvo44c with config:
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	max_epochs: 10
Seed set to 42
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
