In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import csv

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Load the survey table; it is joined to the activity data on 'patientId' further down.
survey_df = pd.read_csv('survey.csv')

In [None]:
# Show the first five survey rows as the cell's rich output.
survey_df.head(n=5)

In [None]:
# Load the weekly mean-magnitude tables that are merged on ['id', 'week'] further down.
# NOTE(review): both frames are read from the same file, 'mean_magnitude_per_week.csv', so the
# "sleeping" table currently holds exactly the same data as the overall one. Presumably it
# should come from a separate sleep-specific CSV -- confirm the intended filename.
mean_magnitude_per_week_sleeping = pd.read_csv('mean_magnitude_per_week.csv')
mean_magnitude_per_week = pd.read_csv('mean_magnitude_per_week.csv')

In [None]:
# Preview both weekly-magnitude tables: the first as plain text,
# the second as the cell's rich output.
print(mean_magnitude_per_week.head(n=5))
mean_magnitude_per_week_sleeping.head(n=5)

In [None]:
# Number of distinct participant ids in the weekly table (missing ids are not counted).
unique_participant_count = mean_magnitude_per_week['id'].nunique(dropna=True)
print("Count of unique participant_id:", unique_participant_count)

In [None]:
# Number of distinct participant ids in the sleeping table (missing ids are not counted).
unique_participant_count = mean_magnitude_per_week_sleeping['id'].nunique(dropna=True)
print("Count of unique participant_id:", unique_participant_count)

## Machine learning

In [None]:
# Left-join the sleeping table onto the weekly table per (id, week). The join suffixes the
# clashing 'magnitude' columns with _x (left) and _y (right); the right-hand one is kept as
# 'magnitude0-7' (appended as the last column) and the left-hand one is renamed back to
# 'magnitude'.
merged_df = (
    mean_magnitude_per_week
    .merge(mean_magnitude_per_week_sleeping, on=['id', 'week'], how='left')
    .assign(**{'magnitude0-7': lambda frame: frame['magnitude_y']})
    .drop(columns=['magnitude_y'])
    .rename(columns={'magnitude_x': 'magnitude'})
)

# Weeks with no match in the sleeping table are filled with that participant's mean of
# 'magnitude' (the left table's column).
# NOTE(review): the fallback is the mean of 'magnitude', not of 'magnitude0-7' -- confirm
# this is the intended imputation.
per_id_mean_magnitude = merged_df.groupby('id')['magnitude'].transform('mean')
merged_df['magnitude0-7'] = merged_df['magnitude0-7'].fillna(per_id_mean_magnitude)

print(merged_df.head())
print(merged_df.shape)

In [None]:
# Attach the survey columns to every weekly row of the same participant (inner join on
# patientId == id), then drop the columns that are not used as features or labels.
final_df = (
    survey_df
    .merge(merged_df, left_on='patientId', right_on='id', how='inner')
    .drop(columns=['BP_date', 'pp6_13_EPDS_R', 'week', 'patientId'])
)

final_df.shape

In [None]:
# Target vector and feature frame ('id' stays in X so rows can be grouped by participant).
y = final_df['depression_flag']
X = final_df.drop(columns=['depression_flag'])

# Hold out 20% of the participants (not of the rows), so that all rows of a participant
# end up on the same side of the split.
unique_ids = X['id'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)

# Row-level membership masks derived from the participant-level split.
in_train = X['id'].isin(train_ids)
in_test = X['id'].isin(test_ids)

X_train, y_train = X[in_train], y[in_train]
X_test, y_test = X[in_test], y[in_test]

# Group-aware 5-fold splitter and the per-row participant id of the training rows.
group_kfold_outer = GroupKFold(n_splits=5)
groups_outer = X_train['id']

# Base estimators with a fixed seed.
rf_clf = RandomForestClassifier(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

### Random forest

In [None]:
# Hyperparameter grid searched for the Random Forest.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# The two activity features the model is trained and evaluated on.
rf_feature_cols = ['magnitude', 'magnitude0-7']
X_train_rf = X_train[rf_feature_cols]

# Participant-aware cross-validation for the hyperparameter search.
# TimeSeriesSplit ignores the `groups` argument, so splitting with it let rows of the same
# participant fall into both the fitting and the validation part of a fold. GroupKFold keeps
# each participant's rows together, matching the id-based train/test split; the groups are
# handed to fit() so the splitter receives them.
grid_search_rf = GridSearchCV(rf_clf, param_grid_rf, cv=group_kfold_outer, scoring='accuracy')
grid_search_rf.fit(X_train_rf, y_train, groups=groups_outer)

# Best parameter combination found by the search.
best_rf_params = grid_search_rf.best_params_

# Rebuild the classifier with the selected parameters (class_weight is left at its default)
# and train it on the full training set.
final_rf_clf = RandomForestClassifier(random_state=42, **best_rf_params)
final_rf_clf.fit(X_train_rf, y_train)

# Evaluate on the held-out participants; X_test / y_test come from the id-based split above.
y_pred_test = final_rf_clf.predict(X_test[rf_feature_cols])

# Print accuracy and classification report on the test set
print("Final Random Forest Model - Test Accuracy: ", accuracy_score(y_test, y_pred_test))
print("Final Random Forest Model - Test Classification Report:")
print(classification_report(y_test, y_pred_test))

In [None]:
# Confusion matrix of the Random Forest predictions on the test set.
cm = confusion_matrix(y_test, y_pred_test)

classes = ['Negative', 'Positive']
tick_marks = range(len(classes))

# Heat-map of the counts with a colour scale.
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix - Random Forest')
plt.colorbar()

plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Write each count into its cell; dark cells get white text so the number stays readable.
half_max = cm.max() / 2
for i, j in np.ndindex(len(classes), len(classes)):
    text_color = 'white' if cm[i, j] > half_max else 'black'
    plt.text(j, i, str(cm[i, j]), ha='center', va='center', color=text_color)

plt.show()

In [None]:
# roc_curve is already imported in the import cell at the top of the notebook, so it is not
# re-imported here.

# Predicted probability of the positive class (second column of predict_proba).
y_pred_proba = final_rf_clf.predict_proba(X_test[['magnitude', 'magnitude0-7']])[:, 1]

# False/true positive rates over all decision thresholds.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

# Plot the ROC curve against the chance diagonal.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve - Random Forest')
plt.legend()
plt.show()

In [None]:
# Precision and recall of the Random Forest over all probability thresholds.
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

# Step plot of the curve with the area underneath shaded in the same colour.
shade = dict(color='b', alpha=0.2)
plt.figure(figsize=(8, 6))
plt.step(recall, precision, where='post', **shade)
plt.fill_between(recall, precision, step='post', **shade)
plt.title('Precision-Recall Curve - Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.show()

### Gradient Boosting

In [None]:
# Hyperparameter grid searched for Gradient Boosting.
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# The two activity features the model is trained and evaluated on.
gb_feature_cols = ['magnitude', 'magnitude0-7']
X_train_gb = X_train[gb_feature_cols]

# Participant-aware cross-validation for the hyperparameter search.
# TimeSeriesSplit ignores the `groups` argument, so splitting with it let rows of the same
# participant fall into both the fitting and the validation part of a fold. GroupKFold keeps
# each participant's rows together, matching the id-based train/test split; the groups are
# handed to fit() so the splitter receives them.
grid_search_gb = GridSearchCV(gb_clf, param_grid_gb, cv=group_kfold_outer, scoring='accuracy')
grid_search_gb.fit(X_train_gb, y_train, groups=groups_outer)

# Best parameter combination found by the search.
best_gb_params = grid_search_gb.best_params_

# Rebuild the classifier with the selected parameters and train it on the full training set.
final_gb_clf = GradientBoostingClassifier(random_state=42, **best_gb_params)
final_gb_clf.fit(X_train_gb, y_train)

# Evaluate on the held-out participants.
y_pred_test_gb = final_gb_clf.predict(X_test[gb_feature_cols])

# Print accuracy and classification report on the test set for Gradient Boosting
print("Final Gradient Boosting Model - Test Accuracy: ", accuracy_score(y_test, y_pred_test_gb))
print("Final Gradient Boosting Model - Test Classification Report:")
print(classification_report(y_test, y_pred_test_gb))

In [None]:
# Confusion matrix of the Gradient Boosting predictions on the test set.
cm_gb = confusion_matrix(y_test, y_pred_test_gb)

classes_gb = ['Negative', 'Positive']
tick_marks_gb = range(len(classes_gb))

# Heat-map of the counts with a colour scale.
plt.imshow(cm_gb, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix - Gradient Boosting')
plt.colorbar()

plt.xticks(tick_marks_gb, classes_gb, rotation=45)
plt.yticks(tick_marks_gb, classes_gb)
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Write each count into its cell; dark cells get white text so the number stays readable.
half_max_gb = cm_gb.max() / 2
for i, j in np.ndindex(len(classes_gb), len(classes_gb)):
    text_color_gb = 'white' if cm_gb[i, j] > half_max_gb else 'black'
    plt.text(j, i, str(cm_gb[i, j]), ha='center', va='center', color=text_color_gb)

plt.show()

In [None]:

# Predicted probability of the positive class (second column of predict_proba).
gb_test_proba = final_gb_clf.predict_proba(X_test[['magnitude', 'magnitude0-7']])
y_pred_proba_gb = gb_test_proba[:, 1]

# False/true positive rates over all decision thresholds.
fpr_gb, tpr_gb, _ = roc_curve(y_test, y_pred_proba_gb)

# Plot the ROC curve against the chance diagonal.
plt.figure(figsize=(8, 6))
plt.plot(fpr_gb, tpr_gb, color='darkorange', linewidth=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--', label='Random')
plt.title('Receiver Operating Characteristic (ROC) Curve - Gradient Boosting')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend()
plt.show()

In [None]:
# Precision and recall of Gradient Boosting over all probability thresholds.
precision_gb, recall_gb, _ = precision_recall_curve(y_test, y_pred_proba_gb)

# Step plot of the curve with the area underneath shaded in the same colour.
shade_gb = dict(color='b', alpha=0.2)
plt.figure(figsize=(8, 6))
plt.step(recall_gb, precision_gb, where='post', **shade_gb)
plt.fill_between(recall_gb, precision_gb, step='post', **shade_gb)
plt.title('Precision-Recall Curve - Gradient Boosting')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.show()