In [10]:
%pip install lightgbm
%pip install pytorch_tabular

Note: you may need to restart the kernel to use updated packages.
Collecting pytorch_tabular
  Downloading pytorch_tabular-1.1.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting pytorch-lightning<2.2.0,>=2.0.0 (from pytorch_tabular)
  Downloading pytorch_lightning-2.1.4-py3-none-any.whl.metadata (21 kB)
Collecting omegaconf>=2.3.0 (from pytorch_tabular)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting torchmetrics<1.3.0,>=0.10.0 (from pytorch_tabular)
  Downloading torchmetrics-1.2.1-py3-none-any.whl.metadata (20 kB)
Collecting tensorboard!=2.5.0,>2.2.0 (from pytorch_tabular)
  Downloading tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)
Collecting protobuf<4.26.0,>=3.20.0 (from pytorch_tabular)
  Downloading protobuf-4.25.3-cp310-abi3-win_amd64.whl.metadata (541 bytes)
Collecting pytorch-tabnet==4.1 (from pytorch_tabular)
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting PyYAML<6.1.0,>=5.4 (from pytorch_tabular)
  Downloadi

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch Tabular for the neural network
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, ModelConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models import CategoryEmbeddingModelConfig

In [2]:
# Load the dataset
data = pd.read_csv('../Dataset/dataset_final.csv')

# Drop unnecessary columns
data = data.drop(['patient_id', 'window_period', 'Sleep_Agitation', 'Physio_Agitation'], axis=1)
#drop redundant rows
data = data.drop_duplicates()

In [3]:
data.shape

(570, 17)

In [4]:
data[data['Agitation'] == 1].shape[0] / data.shape[0]

0.2912280701754386

In [7]:
# Separate features and target
X = data.drop('Agitation', axis=1)
y = data['Agitation']

# Split the data into train, validation, and test sets with stratification
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)  # Split off the test set
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)  # Split the remainder into validation and test

# Define classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(scale_pos_weight=(1 - y.mean()), use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(is_unbalance=True),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42, class_weight='balanced')
}

results_df = pd.DataFrame(columns=['Model Name', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'])

# Train and evaluate classifiers
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else None
    result = {
        'Model Name': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_proba) if y_proba is not None else 'N/A'
    }
    result_df = pd.DataFrame([result])
    results_df = pd.concat([results_df, result_df], ignore_index=True)

# Print results in a tabular format
print(results_df.to_string(index=False))

  results_df = pd.concat([results_df, result_df], ignore_index=True)


[LightGBM] [Info] Number of positive: 100, number of negative: 242
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1276
[LightGBM] [Info] Number of data points in the train set: 342, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.292398 -> initscore=-0.883768
[LightGBM] [Info] Start training from score -0.883768
       Model Name  Accuracy  Precision   Recall  F1 Score  ROC AUC
    Random Forest  0.973684   0.941176 0.969697  0.955224 0.997942
Gradient Boosting  0.991228   0.970588 1.000000  0.985075 1.000000
          XGBoost  0.956140   0.888889 0.969697  0.927536 0.981294
         LightGBM  0.956140   0.888889 0.969697  0.927536 0.980546
      Extra Trees  0.991228   0.970588 1.000000  0.985075 0.999626


In [8]:
results_df

Unnamed: 0,Model Name,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,Random Forest,0.973684,0.941176,0.969697,0.955224,0.997942
1,Gradient Boosting,0.991228,0.970588,1.0,0.985075,1.0
2,XGBoost,0.95614,0.888889,0.969697,0.927536,0.981294
3,LightGBM,0.95614,0.888889,0.969697,0.927536,0.980546
4,Extra Trees,0.991228,0.970588,1.0,0.985075,0.999626


In [19]:
# Define the stratified k-fold cross-validator
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Create an empty DataFrame to store results
k_fold_results_df = pd.DataFrame(columns=['Model', 'Mean F1 Score after 10-fold CV'])

# Iterate over each classifier
for model_name, classifier in classifiers.items():
    # Perform cross-validation
    cv_scores = cross_val_score(classifier, X, y, cv=skf, scoring='f1')
    
    # Store results in the DataFrame
    new_row = pd.DataFrame({
        'Model': [model_name],
        'Mean F1 Score after 10-fold CV': [cv_scores.mean()]
    })
    k_fold_results_df = pd.concat([k_fold_results_df, new_row], ignore_index=True)

# Display the results
print(k_fold_results_df)

  k_fold_results_df = pd.concat([k_fold_results_df, new_row], ignore_index=True)


[LightGBM] [Info] Number of positive: 150, number of negative: 363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1802
[LightGBM] [Info] Number of data points in the train set: 513, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.292398 -> initscore=-0.883768
[LightGBM] [Info] Start training from score -0.883768
[LightGBM] [Info] Number of positive: 150, number of negative: 363
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000462 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1803
[LightGBM] [Info] Number of data points in the train set: 513, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.292398 -> initscore=-0.883768
[LightGBM] [Info] Start training from score -0.883768
[LightGBM] [Info] Number

# Tabular Neural Network

In [13]:
# Adding a Tabular Neural Network
data_config = DataConfig(
    target=['Agitation'],
    continuous_cols=X_train.columns.tolist(),
    categorical_cols=[],
)

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="512-256-128",
    activation="ReLU",
)

trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=64,
    max_epochs=50,
    gpus=0,
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=OptimizerConfig(),
    trainer_config=trainer_config,
    experiment_config=None
)

# Fit the model using training and validation sets
tabular_model.fit(train=X_train, validation=X_val)

TypeError: TrainerConfig.__init__() got an unexpected keyword argument 'gpus'

In [None]:
# Evaluate on the test set
evaluation = tabular_model.evaluate(X_test)
predictions = tabular_model.predict(X_test)
predictions_proba = tabular_model.predict_proba(X_test)

# Adding results of the Tabular Neural Network
results['Tabular Neural Network'] = {
    'accuracy': accuracy_score(y_test, predictions['prediction']),
    'precision': precision_score(y_test, predictions['prediction']),
    'recall': recall_score(y_test, predictions['prediction']),
    'f1_score': f1_score(y_test, predictions['prediction']),
    'roc_auc': roc_auc_score(y_test, predictions_proba[:, 1])
}
print(f"Tabular Neural Network - Accuracy: {results['Tabular Neural Network']['accuracy']}, ROC AUC: {results['Tabular Neural Network']['roc_auc']}, F1 score: {results['Tabular Neural Network']['f1_score']}, Precision: {results['Tabular Neural Network']['precision']}, Recall: {results['Tabular Neural Network']['recall']}")