In [1]:
import os
import numpy as np
import pandas as pd
import glob

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.fft import rfft
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

import kerastuner as kt
from tensorflow.keras import layers
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.simplefilter(action="ignore")

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows',100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)





  import kerastuner as kt


In [2]:
path_conditions = 'data\depresjon\condition'
conditions = glob.glob(path_conditions + "/*.csv")

path_controls = 'data\depresjon\control'
controls = glob.glob(path_controls + "/*.csv")

df_scores = pd.read_csv("data\depresjon\scores.csv")

# Initialize an empty list to store DataFrames
all_list = []

# Loop through the list of csv files
for filename in conditions:
    # Read the current CSV file
    df = pd.read_csv(filename)
    
    # Optionally, extract patient_name or other identifiers from filename if needed
    # For example, if the filename contains the patient_name:
    name = os.path.splitext(os.path.basename(filename))[0]  # Adjust based on your file naming convention and operating system
    df['patient_name'] = name  # Add patient_name as a new column
    df['label'] = 1
    # Append the DataFrame to the list
    all_list.append(df)

for filename in controls:
    # Read the current CSV file
    df = pd.read_csv(filename)
    
    # Optionally, extract patient_name or other identifiers from filename if needed
    # For example, if the filename contains the patient_name:
    name = os.path.splitext(os.path.basename(filename))[0]  # Adjust based on your file naming convention and operating system
    df['patient_name'] = name  # Add patient_name as a new column
    df['label'] = 0
    # Append the DataFrame to the list
    all_list.append(df)


# Combine all DataFrames in the list into a single DataFrame
combined_df = pd.concat(all_list, ignore_index=True)
combined_df

Unnamed: 0,timestamp,date,activity,patient_name,label
0,2003-05-07 12:00:00,2003-05-07,0,condition_1,1
1,2003-05-07 12:01:00,2003-05-07,143,condition_1,1
2,2003-05-07 12:02:00,2003-05-07,0,condition_1,1
3,2003-05-07 12:03:00,2003-05-07,20,condition_1,1
4,2003-05-07 12:04:00,2003-05-07,166,condition_1,1
...,...,...,...,...,...
1571701,2003-12-01 12:53:00,2003-12-01,7,control_9,0
1571702,2003-12-01 12:54:00,2003-12-01,7,control_9,0
1571703,2003-12-01 12:55:00,2003-12-01,5,control_9,0
1571704,2003-12-01 12:56:00,2003-12-01,5,control_9,0


### Data pre-processing
Feature engineering for time-series or activity data before applying a CNN can involve several steps, aiming to highlight aspects of the data that are indicative of the outcome you're trying to predict (in this case, depression). Here are some common feature engineering techniques for time-series data:

**Normalization/Standardization:** As CNNs can be sensitive to the scale of the input data, it's standard practice to normalize or standardize your features so that they're on a similar scale. This usually means subtracting the mean and dividing by the standard deviation (Z-score normalization).

**Windowing:** If your dataset consists of continuous time-series data, you might want to split it into smaller fixed-size sequences or windows. This is especially relevant if you're dealing with long sequences.

**Denoising:** If the data is noisy, applying a smoothing filter or a denoising algorithm can help to reduce noise and make patterns more discernible.

**Feature Extraction:** Extracting statistical features from time windows can be useful, especially if there's a risk of losing important information through windowing alone. Common features include the mean, standard deviation, skewness, kurtosis, and higher-order moments of the data within a window.

**Fourier Transforms or Spectral Analysis:** For data with periodic features, applying a Fourier transform can help to identify dominant frequency components, which might be relevant for distinguishing between different states, such as depressed vs. non-depressed.

**Wavelet Transforms:** These can be used to decompose time-series data into time-frequency space, capturing both temporal and frequency information.

**Domain-Specific Features:** Depending on the specifics of your activity data, there may be particular measures that are relevant to depression. For instance, the amount of activity or the variability of activity levels could be significant.

**Dimensionality Reduction:** Techniques like PCA (Principal Component Analysis) can be applied to reduce the dimensionality of the feature space while retaining most of the variability in the data.

**Correlation Analysis:** Analyzing the correlation between different features can provide insights and help in selecting the most relevant features for your model.

**Balancing the Dataset:** If the dataset is imbalanced with respect to the target classes, you might want to consider resampling techniques to balance the classes.

In [3]:
# Clean and preprocess the dataset
# Ensuring no NaN/infinite values in the dataset
combined_df.fillna(0, inplace=True)
combined_df.replace([np.inf, -np.inf], 0, inplace=True)

# Normalize the activity data
scaler = StandardScaler()
combined_df['activity_normalized'] = scaler.fit_transform(combined_df[['activity']])

# Function to segment the data
def segment_data(df, segment_size=60):
    segments = []
    labels = []
    for name in df['patient_name'].unique():
        patient_data = df[df['patient_name'] == name]
        for start_pos in range(0, len(patient_data) - segment_size + 1, segment_size):
            segment = patient_data['activity_normalized'].iloc[start_pos:start_pos + segment_size].values
            label = patient_data['label'].iloc[start_pos]  # Assuming label is constant within each segment
            segments.append(segment)
            labels.append(label)
    return np.array(segments), np.array(labels)

# Segmenting the data
segment_size = 60  # Example segment size
segments, labels = segment_data(combined_df, segment_size)

In [4]:
# Function to calculate the Fourier Transform of each segment
def calculate_fourier_transform(segment):
    # Compute the real Fast Fourier Transform
    fft_values = rfft(segment)
    # Compute the absolute values (magnitudes) of the FFT
    fft_magnitude = np.abs(fft_values)
    return fft_magnitude

# Function to extract statistical features from each segment
def extract_statistical_features(segment):
    features = {
        'mean': np.mean(segment),
        'std': np.std(segment),
        'skew': skew(segment),
        'kurtosis': kurtosis(segment)
    }
    return np.array(list(features.values()))

# Function to process each segment, combining FFT and statistical features
def process_segment(segment):
    fft_magnitude = calculate_fourier_transform(segment)
    statistical_features = extract_statistical_features(segment)
    # Combine FFT magnitudes with statistical features
    combined_features = np.concatenate([fft_magnitude, statistical_features])
    return combined_features

# Apply feature extraction to each segment
features = np.array([process_segment(segment) for segment in segments])

# Since the feature extraction might change the shape, let's check and adjust the data shape for CNN input
print(features.shape)


(26166, 35)


In [40]:

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)


# Adjusting the shape for CNN input
X_train = X_train.reshape((-1, X_train.shape[1], 1))
X_test = X_test.reshape((-1, X_test.shape[1], 1))

# At this point, the data is ready to be fed into a CNN model.

In [41]:
# Calculate class weights to address imbalance
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))
    
def model_builder(hp):
    model = Sequential()
    model.add(Conv1D(filters=hp.Int('filters', min_value=16, max_value=64, step=16),
                     kernel_size=hp.Choice('kernel_size', values=[3, 5]),
                     activation='relu',
                     input_shape=(X_train.shape[1], 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(rate=hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Flatten())
    model.add(Dense(units=hp.Int('units', min_value=32, max_value=128, step=32), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    # Include gradient clipping in the optimizer
    optimizer = Adam(
        hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4]),
        clipnorm=hp.Choice('clipnorm', values=[0.5, 1.0, 1.5])
    )
    
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy', Precision(), Recall()])
    
    return model

# Initialize the tuner
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=5,
                     directory='my_dir',
                     project_name='intro_to_kt')

# Create a callback to stop training early after reaching a certain value for the validation loss
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)




Reloading Tuner from my_dir\intro_to_kt\tuner0.json


In [42]:
# Execute the hyperparameter search
tuner.search(X_train, y_train, epochs=5, validation_split=0.2, callbacks=[stop_early], class_weight=class_weight_dict)

# Retrieve the best model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
model = tuner.hypermodel.build(best_hps)

# Train the best model
history = model.fit(X_train, y_train, epochs=5, validation_split=0.2, class_weight=class_weight_dict)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
# Evaluate the model
eval_result = model.evaluate(X_test, y_test)
print("[test loss, test accuracy, test precision, test recall, test AUC]:", eval_result)

# Predicting and evaluating using custom thresholds if necessary
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

# Computing success metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

[test loss, test accuracy, test precision, test recall, test AUC]: [nan, 0.6436759829521179, 0.0, 0.0]
Accuracy: 0.6436759648452427
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      3369
           1       0.00      0.00      0.00      1865

    accuracy                           0.64      5234
   macro avg       0.32      0.50      0.39      5234
weighted avg       0.41      0.64      0.50      5234

