# Installations and Imports

In [1]:
%pip install imblearn

Collecting imblearnNote: you may need to restart the kernel to use updated packages.

  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [None]:
%pip install mlxtend

Collecting mlxtendNote: you may need to restart the kernel to use updated packages.

  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.4 MB 16.8 MB/s eta 0:00:01
   ---------------------------------------  1.4/1.4 MB 18.4 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 13.2 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.1


In [1]:
import pandas as pd
import numpy as np
import torch
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, roc_auc_score
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

# Random Forest Model (using Exhaustive Search)

In [None]:
agg_sleep_df = pd.read_csv('..\Dataset\Aggregated_Sleep.csv')
# Load the additional CSV file
additional_df = pd.read_csv('..\Dataset\Synthetic_Sleep_Anomaly_CTGAN.csv')
# Concatenate the original DataFrame with the new DataFrame
agg_sleep_df = pd.concat([agg_sleep_df, additional_df], axis=0, ignore_index=True)
# Shuffle the DataFrame
agg_sleep_df = agg_sleep_df.sample(frac=1, random_state=42).reset_index(drop=True)
agg_sleep_df = agg_sleep_df.dropna()

X = agg_sleep_df.loc[:,~agg_sleep_df.columns.isin(['patient_id','window_start','agitation'])]
y = agg_sleep_df['agitation']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to oversample the minority class
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
efs = ExhaustiveFeatureSelector(RandomForestClassifier(),min_features = 4, max_features=10, scoring='f1')

efs = efs.fit(X_train, y_train)

selected_features = X_train.columns[list(efs.best_idx_)] 
print(selected_features)

print(efs.best_score_)

Features: 848/848

Index(['mean_RR', 'TST', 'SE', 'snoring_counts'], dtype='object')
0.08


In [None]:
# Retrain the model with the best features on the resampled training set
classifier = RandomForestClassifier()
classifier.fit(X_train_resampled[selected_features], y_train_resampled)
# Ensure the test set is also limited to the selected features
X_test_selected = X_test[selected_features]

# Predict the responses for the test set
y_pred = classifier.predict(X_test_selected)
# Generating the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.98      0.87      0.92       181
         1.0       0.04      0.25      0.07         4

    accuracy                           0.86       185
   macro avg       0.51      0.56      0.50       185
weighted avg       0.96      0.86      0.91       185



# Data Preparation (Building a custom train-test split for our use-case)

In [36]:
# Load the dataset
data = pd.read_csv('..\Dataset\Aggregated_Sleep.csv')
data['source'] = 'real'  # Add a source column for original data

# Load the synthetic data
synthetic_data = pd.read_csv('..\Dataset\Synthetic_Sleep_Anomaly_CTGAN.csv')
synthetic_data['source'] = 'synthetic'  # Add a source column for synthetic data

# Concatenate the original DataFrame with the new DataFrame
data = pd.concat([data, synthetic_data], axis=0, ignore_index=True)

# Shuffle the DataFrame
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data = data.drop(['patient_id', 'window_start'], axis=1)
data.dropna(inplace=True)

# Splitting the data based on the 'agitation' column
positive_data = data[data['agitation'] == 1]
negative_data = data[data['agitation'] == 0]

# Selecting test data from the original file for positive cases
test_pos = positive_data[positive_data['source'] == 'real'].tail(13)  # Last 17 from original data for testing (near 20% split)
# Remaining positive data from the real source (excluding those chosen for testing)
train_pos_real = positive_data[positive_data['source'] == 'real'].drop(test_pos.index)

# Selecting positive synthetic data for training
train_pos_synthetic = positive_data[positive_data['source'] == 'synthetic']

# Combine real and synthetic positive data for training
train_pos_combined = pd.concat([train_pos_real, train_pos_synthetic], ignore_index=True)

# Oversampling positive samples in the training set by 2x
train_pos_oversampled = pd.concat([train_pos_combined] * 2, ignore_index=True)

# Splitting negative data, random selection for the test set
train_neg, test_neg = train_test_split(negative_data, test_size=0.2, random_state=42)

# Matching the number of oversampled positive samples with negative samples in a 1:3 ratio
train_neg_matched = train_neg.sample(n=len(train_pos_oversampled)*3, random_state=42)

# Combining the matched training sets
X_train = pd.concat([train_pos_oversampled, train_neg_matched])
y_train = X_train['agitation']
X_train = X_train.drop(['agitation', 'source'], axis=1)

# Preparing the test set (stratify sampling manually to preserve the 1:3 ratio)
test_neg_selected = test_neg.sample(n=80, random_state=42)  # Adjust n for desired test size
X_test = pd.concat([test_pos, test_neg_selected])
y_test = X_test['agitation']
X_test = X_test.drop(['agitation', 'source'], axis=1)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [40]:
train_pos_combined.shape

(54, 12)

In [41]:
test_pos.shape

(13, 12)

In [42]:
train_neg_matched.shape

(324, 12)

In [43]:
test_neg_selected.shape

(80, 12)

# Model Training and Evaluation

## Random Forest, One-Class SVM, Isolation Forest

In [37]:
# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_scaled, y_train)
rf_predictions = rf_classifier.predict(X_test_scaled)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

# One-Class SVM
one_class_svm = OneClassSVM(kernel='rbf', gamma='auto')
one_class_svm.fit(X_train_scaled[y_train == 0])  # Train only on normal data for typical usage
svm_predictions = one_class_svm.predict(X_test_scaled)
svm_predictions = (svm_predictions == -1).astype(int)
print("One-Class SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

# Isolation Forest
iso_forest = IsolationForest(n_estimators=200, contamination=float(np.mean(y_train == 1)), random_state=42)
iso_forest.fit(X_train_scaled)
if_predictions = iso_forest.predict(X_test_scaled)
if_predictions = (if_predictions == -1).astype(int)
print("Isolation Forest Classification Report:")
print(classification_report(y_test, if_predictions))

Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.99      0.92        80
         1.0       0.00      0.00      0.00        13

    accuracy                           0.85        93
   macro avg       0.43      0.49      0.46        93
weighted avg       0.74      0.85      0.79        93

One-Class SVM Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.49      0.61        80
         1.0       0.11      0.38      0.17        13

    accuracy                           0.47        93
   macro avg       0.47      0.44      0.39        93
weighted avg       0.73      0.47      0.55        93

Isolation Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.70      0.77        80
         1.0       0.11      0.23      0.15        13

    accuracy                           0.63        93
   macro avg  

## Autoencoders (for anomaly detection)

In [38]:
# 1. Define the Autoencoder Architecture
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(X_train.shape[1], 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16)
        )
        self.decoder = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, X_train.shape[1]),
            nn.Tanh()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# 2. Prepare Data Loaders
train_data = TensorDataset(torch.tensor(X_train_scaled).float())
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# 3. Initialize the Autoencoder and Optimizer
model = Autoencoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 4. Train the Autoencoder
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for data in train_loader:
        inputs = data[0]
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# 5. Evaluate the Model for Anomaly Detection
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test_scaled).float()
    reconstructed = model(X_test_tensor)
    mse = torch.mean((X_test_tensor - reconstructed) ** 2, dim=1)
    anomaly_threshold = np.percentile(mse.numpy(), 95)  # Adjust based on your preference
    test_predictions = (mse > anomaly_threshold).int().numpy()

# 6. Print Classification Report
print("Autoencoder Anomaly Detection Report:")
print(classification_report(y_test, test_predictions))

Epoch 1, Loss: 0.4859554171562195
Epoch 2, Loss: 1.5452849864959717
Epoch 3, Loss: 0.9651633501052856
Epoch 4, Loss: 0.7909493446350098
Epoch 5, Loss: 0.4672658443450928
Epoch 6, Loss: 0.6086257696151733
Epoch 7, Loss: 0.9460229873657227
Epoch 8, Loss: 0.4436676502227783
Epoch 9, Loss: 0.8346527814865112
Epoch 10, Loss: 0.4320221543312073
Epoch 11, Loss: 0.6609364748001099
Epoch 12, Loss: 0.6512584090232849
Epoch 13, Loss: 0.4657561779022217
Epoch 14, Loss: 0.25464311242103577
Epoch 15, Loss: 0.37018486857414246
Epoch 16, Loss: 0.36595091223716736
Epoch 17, Loss: 0.30109405517578125
Epoch 18, Loss: 0.24488189816474915
Epoch 19, Loss: 0.3811655640602112
Epoch 20, Loss: 0.44289645552635193
Autoencoder Anomaly Detection Report:
              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90        80
         1.0       0.20      0.08      0.11        13

    accuracy                           0.83        93
   macro avg       0.53      0.51      0.51    

## Tabular Neural Network (with weighted loss) - Best performing model

Run the custom train-test codeblock before running this

In [39]:
# Calculate class weights
class_counts = y_train.value_counts()
total_samples = len(y_train)
weights = [total_samples / class_counts[i] for i in range(len(class_counts))]
class_weights = torch.tensor(weights).float()

# Define the Neural Network Architecture
class TabularModel(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_layers):
        super(TabularModel, self).__init__()
        layers = []
        for i in hidden_layers:
            layers.append(nn.Linear(num_inputs, i))
            layers.append(nn.ReLU(inplace=True))
            num_inputs = i
        layers.append(nn.Linear(num_inputs, num_outputs))
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

# Instantiate the model, criterion (now weighted), and optimizer
model = TabularModel(X_train_scaled.shape[1], 2, [50, 100, 50])
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Prepare DataLoader
train_data = TensorDataset(torch.tensor(X_train_scaled).float(), torch.tensor(y_train.values).long())
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Train the Network
epochs = 50
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")

# Prepare DataLoader for test data
test_data = TensorDataset(torch.tensor(X_test_scaled).float(), torch.tensor(y_test.values).long())
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Evaluate the Model on Test Data
model.eval()
all_preds_test = []
all_targets_test = []
with torch.no_grad():
    for inputs, targets in test_loader:
        output = model(inputs)
        _, predicted = torch.max(output.data, 1)
        all_preds_test.extend(predicted.numpy())
        all_targets_test.extend(targets.numpy())

print("Detailed Classification Report on Test Data:")
print(classification_report(all_targets_test, all_preds_test))

Epoch 1/50, Loss: 0.6799209884234837
Epoch 2/50, Loss: 0.6471615944589887
Epoch 3/50, Loss: 0.6037799971444267
Epoch 4/50, Loss: 0.5457319617271423
Epoch 5/50, Loss: 0.4770395074571882
Epoch 6/50, Loss: 0.41332034128052847
Epoch 7/50, Loss: 0.3485840473856245
Epoch 8/50, Loss: 0.3104704873902457
Epoch 9/50, Loss: 0.29070272190230234
Epoch 10/50, Loss: 0.2654046565294266
Epoch 11/50, Loss: 0.2519577273300716
Epoch 12/50, Loss: 0.23400978105408804
Epoch 13/50, Loss: 0.22520380999360765
Epoch 14/50, Loss: 0.2166804756437029
Epoch 15/50, Loss: 0.2088304257818631
Epoch 16/50, Loss: 0.19534272168363845
Epoch 17/50, Loss: 0.18324861249753407
Epoch 18/50, Loss: 0.1677611353141921
Epoch 19/50, Loss: 0.15242711773940495
Epoch 20/50, Loss: 0.14549020039183752
Epoch 21/50, Loss: 0.13257803235735213
Epoch 22/50, Loss: 0.12231102266481944
Epoch 23/50, Loss: 0.10988135210105351
Epoch 24/50, Loss: 0.10155476682952472
Epoch 25/50, Loss: 0.09116095091615405
Epoch 26/50, Loss: 0.08519478035824639
Epoch 2

# Tabular Neural Network just with stratified sampling (no custom split)

In [None]:
# Load the dataset
data = pd.read_csv('..\Dataset\Aggregated_Sleep.csv')
# Load the dataset
data = pd.read_csv('..\Dataset\Aggregated_Sleep.csv')
# Load the additional CSV file
additional_df = pd.read_csv('..\Dataset\Synthetic_Sleep_Anomaly_CTGAN.csv')
# Concatenate the original DataFrame with the new DataFrame
data = pd.concat([data, additional_df], axis=0, ignore_index=True)
# Shuffle the DataFrame
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data = data.drop(['patient_id', 'window_start'], axis=1)
data.dropna(inplace=True)

# Stratify split the data into training and testing sets
X = data.drop('agitation', axis=1)
y = data['agitation']

# Stratify sample based on the class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Calculate class weights
class_counts = y_train.value_counts()
total_samples = len(y_train)
weights = [total_samples / class_counts[i] for i in range(len(class_counts))]
class_weights = torch.tensor(weights).float()

#From this point on, the code is the same as the previous example

# Define the Neural Network Architecture
class TabularModel(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_layers):
        super(TabularModel, self).__init__()
        layers = []
        for i in hidden_layers:
            layers.append(nn.Linear(num_inputs, i))
            layers.append(nn.ReLU(inplace=True))
            num_inputs = i
        layers.append(nn.Linear(num_inputs, num_outputs))
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

# Instantiate the model, criterion (now weighted), and optimizer
model = TabularModel(X_train_scaled.shape[1], 2, [50, 100, 50])
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Prepare DataLoader
train_data = TensorDataset(torch.tensor(X_train_scaled).float(), torch.tensor(y_train.values).long())
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Train the Network
epochs = 40
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")

# Prepare DataLoader for test data
test_data = TensorDataset(torch.tensor(X_test_scaled).float(), torch.tensor(y_test.values).long())
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Evaluate the Model on Test Data
model.eval()
all_preds_test = []
all_targets_test = []
with torch.no_grad():
    for inputs, targets in test_loader:
        output = model(inputs)
        _, predicted = torch.max(output.data, 1)
        all_preds_test.extend(predicted.numpy())
        all_targets_test.extend(targets.numpy())

print("Detailed Classification Report on Test Data:")
print(classification_report(all_targets_test, all_preds_test))

Epoch 1/40, Loss: 0.6857850689154404
Epoch 2/40, Loss: 0.6335163070605352
Epoch 3/40, Loss: 0.5792205104461083
Epoch 4/40, Loss: 0.5248727271190057
Epoch 5/40, Loss: 0.4788503371752225
Epoch 6/40, Loss: 0.4484001306387094
Epoch 7/40, Loss: 0.42704424949792713
Epoch 8/40, Loss: 0.4164835696036999
Epoch 9/40, Loss: 0.35882395047407883
Epoch 10/40, Loss: 0.3613038338147677
Epoch 11/40, Loss: 0.335318556198707
Epoch 12/40, Loss: 0.320053654221388
Epoch 13/40, Loss: 0.32075402255241686
Epoch 14/40, Loss: 0.302602456166194
Epoch 15/40, Loss: 0.31414736348849076
Epoch 16/40, Loss: 0.27527306916622013
Epoch 17/40, Loss: 0.28774605748745113
Epoch 18/40, Loss: 0.2809354020999028
Epoch 19/40, Loss: 0.23380409653943318
Epoch 20/40, Loss: 0.21943960718523997
Epoch 21/40, Loss: 0.21910991233128768
Epoch 22/40, Loss: 0.21519857559066552
Epoch 23/40, Loss: 0.20325964460006127
Epoch 24/40, Loss: 0.20063808732307875
Epoch 25/40, Loss: 0.1979191039617245
Epoch 26/40, Loss: 0.17425015597389296
Epoch 27/40

Results compare to our old best model because we added the synthetic data to it

## Autoencoder + One Class SVM Ensemble Model (needs testing)

In [14]:
# Load and preprocess the data
data = pd.read_csv('../Dataset/Aggregated_Sleep.csv')
data = data.drop(['patient_id', 'window_start'], axis=1)

# Drop rows with NaNs
data = data.dropna()

X = data.drop('agitation', axis=1).values
y = data['agitation'].values

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data, focusing only on normal data for training the Autoencoder
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y == 0, test_size=0.2, random_state=42)
X_train_normal = X_train[y_train]

# Define the Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, n_features):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(n_features, 10),
            nn.ReLU(),
            nn.Linear(10, 2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(2, 10),
            nn.ReLU(),
            nn.Linear(10, n_features),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Training the Autoencoder
autoencoder = Autoencoder(X_train_normal.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.01)
train_loader = DataLoader(TensorDataset(torch.tensor(X_train_normal).float()), batch_size=32, shuffle=True)

def train_autoencoder(model, loader, epochs=50):
    model.train()
    for epoch in range(epochs):
        for data in loader:
            data = data[0]  # unpack data
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, data)
            loss.backward()
            optimizer.step()
        if epoch % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')

train_autoencoder(autoencoder, train_loader)

# Extract features for training and testing
autoencoder.eval()
with torch.no_grad():
    X_train_encoded = autoencoder.encoder(torch.tensor(X_train).float()).numpy()
    X_test_encoded = autoencoder.encoder(torch.tensor(X_test).float()).numpy()

# Train One-Class SVM on the encoded features
oc_svm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.05)
oc_svm.fit(X_train_encoded[y_train])  # Train only on normal data

y_pred_train = oc_svm.predict(X_train_encoded)
y_pred_test = oc_svm.predict(X_test_encoded)
y_pred_test = np.where(y_pred_test == 1, 0, 1)  # Converting from SVM labels to anomaly labels

print("Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

Epoch 1/50, Loss: 1.1442
Epoch 11/50, Loss: 0.5622
Epoch 21/50, Loss: 0.4725
Epoch 31/50, Loss: 0.8408
Epoch 41/50, Loss: 0.7065
Classification Report (Test Set):
              precision    recall  f1-score   support

       False       0.04      1.00      0.08         7
        True       1.00      0.15      0.25       178

    accuracy                           0.18       185
   macro avg       0.52      0.57      0.17       185
weighted avg       0.96      0.18      0.25       185

