In [12]:
# Import necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report,mean_squared_error, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedBaggingClassifier
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from statsmodels.discrete.discrete_model import MNLogit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [2]:
# put the dataset which comes from short_dataset_feature_engineering.ipynb 
file_path = '/data/caysar9/results/final_short.csv'
df = pd.read_csv(file_path)

In [3]:
# Define features and target for affected activities QoL
features = ['migraine_days_per_month', 'painintensity', 'duration_in_hours', 'trigger_stress', 'trigger_poor_sleep','sleep_duration_hours','sleep_duration_past_7_days','migraine_attacks_past7days','mean_migraine_duration_past7days','reported_anxiety', 'reported_depression']

X = df[features]
y = df['affected_activity_QoL']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Frequency counts before applying SMOTE
print("Frequency counts before SMOTE:")
print("Frequency counts train set")
print(y_train.value_counts())
print("Frequency counts test set")
print(y_test.value_counts())

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
# Frequency counts before applying SMOTE
print("Frequency counts after SMOTE:")
print("Frequency counts train set")
print(y_train_balanced.value_counts())

Frequency counts before SMOTE:
Frequency counts train set
affected_activity_QoL
0    7038
1    4483
Name: count, dtype: int64
Frequency counts test set
affected_activity_QoL
0    1760
1    1121
Name: count, dtype: int64
Frequency counts after SMOTE:
Frequency counts train set
affected_activity_QoL
1    7038
0    7038
Name: count, dtype: int64


In [13]:
# Define the hyperparameter distributions
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet'],  # Regularization methods
    'C': uniform(0.01, 10),  # Continuous uniform distribution for regularization strength
    'solver': ['liblinear', 'saga'],  # solvers
    'max_iter': randint(100, 1000)  # Randomly sample number of iterations
}

# Initialize the Logistic Regression model
lr_model = LogisticRegression(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lr_model,
    param_distributions=param_distributions,
    n_iter=30,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 30 candidates, totalling 90 fits


36 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/data/caysar9/venv-py311/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/data/caysar9/venv-py311/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/caysar9/venv-py311/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^

Best Hyperparameters: {'C': 0.46227288910538067, 'max_iter': 395, 'penalty': 'l1', 'solver': 'saga'}

AUC Score: 0.7052

Confusion Matrix:
[[1149  611]
 [ 397  724]]

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.65      0.70      1760
           1       0.54      0.65      0.59      1121

    accuracy                           0.65      2881
   macro avg       0.64      0.65      0.64      2881
weighted avg       0.67      0.65      0.65      2881



In [5]:
X_train_balanced_const = sm.add_constant(X_train_balanced)

# Fit the logistic regression model on the balanced data
logit_model = sm.Logit(y_train_balanced, X_train_balanced_const).fit()

# Display the summary of the model
print(logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.648446
         Iterations 5
                             Logit Regression Results                            
Dep. Variable:     affected_activity_QoL   No. Observations:                14076
Model:                             Logit   Df Residuals:                    14064
Method:                              MLE   Df Model:                           11
Date:                   Sat, 07 Dec 2024   Pseudo R-squ.:                 0.06449
Time:                           17:28:21   Log-Likelihood:                -9127.5
converged:                          True   LL-Null:                       -9756.7
Covariance Type:               nonrobust   LLR p-value:                4.115e-263
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const                               -1.8

In [14]:
# Define the hyperparameter distributions
param_distributions = {
    'criterion': ['gini', 'entropy', 'log_loss'],  # Splitting criteria
    'max_depth': randint(5, 50),  # Randomly sample tree depth between 5 and 50
    'min_samples_split': randint(2, 20),  # Minimum samples required to split a node
    'min_samples_leaf': randint(1, 10),  # Minimum samples required in a leaf node
    'max_features': [None, 'sqrt', 'log2']  # Number of features to consider for best split
}

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_distributions,
    n_iter=30,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Hyperparameters: {'criterion': 'log_loss', 'max_depth': 7, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 8}

AUC Score: 0.6895

Confusion Matrix:
[[1129  631]
 [ 401  720]]

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.64      0.69      1760
           1       0.53      0.64      0.58      1121

    accuracy                           0.64      2881
   macro avg       0.64      0.64      0.63      2881
weighted avg       0.66      0.64      0.65      2881



In [8]:
# Define the hyperparameter distributions
param_distributions = {
    'n_estimators': randint(50, 300),  # Randomly sample between 50 and 300 trees
    'max_depth': [None, 10, 20, 30],  # Fixed choices for tree depth
    'min_samples_split': randint(2, 11),  # Randomly sample between 2 and 10 for splitting
    'min_samples_leaf': randint(1, 5),  # Randomly sample between 1 and 4 for leaf nodes
    'max_features': ['sqrt', 'log2', None],  # Fixed options for max features
    'bootstrap': [True, False]  # Fixed options for bootstrapping
}

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_distributions,
    n_iter=20,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 20 candidates, totalling 60 fits


Best Hyperparameters: {'bootstrap': False, 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 267}

AUC Score: 0.6610

Confusion Matrix:
[[1241  519]
 [ 559  562]]

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1760
           1       0.52      0.50      0.51      1121

    accuracy                           0.63      2881
   macro avg       0.60      0.60      0.60      2881
weighted avg       0.62      0.63      0.62      2881



In [10]:
# Feature Importance
feature_importances = best_model.feature_importances_
feature_names = X_train_balanced.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:\n")
print(importance_df)


Feature Importances:

                             Feature  Importance
6         sleep_duration_past_7_days    0.167812
2                  duration_in_hours    0.164579
8   mean_migraine_duration_past7days    0.163901
5               sleep_duration_hours    0.163865
1                      painintensity    0.133761
0            migraine_days_per_month    0.072566
7         migraine_attacks_past7days    0.049810
4                 trigger_poor_sleep    0.028270
9                   reported_anxiety    0.024321
3                     trigger_stress    0.016409
10               reported_depression    0.014706


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform,randint


# Define the hyperparameter distributions
param_distributions = {
    'n_estimators': randint(50, 200),  # Number of boosting stages
    'learning_rate': uniform(0.01, 0.3),  # Learning rate for boosting
    'max_depth': randint(3, 10),  # Maximum depth of individual estimators
    'min_samples_split': randint(2, 10),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 5),  # Minimum number of samples required to be a leaf node
    'subsample': uniform(0.7, 0.3),  # Fraction of samples used for fitting each base learner
    'max_features': ['sqrt', 'log2', None]  # Number of features to consider at each split
}

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_distributions,
    n_iter=30,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 30 candidates, totalling 90 fits


Best Hyperparameters: {'learning_rate': 0.17838315927084888, 'max_depth': 9, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 186, 'subsample': 0.9131988669057362}

AUC Score: 0.6552

Confusion Matrix:
[[1252  508]
 [ 551  570]]

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1760
           1       0.53      0.51      0.52      1121

    accuracy                           0.63      2881
   macro avg       0.61      0.61      0.61      2881
weighted avg       0.63      0.63      0.63      2881



In [15]:
# Feature Importance
feature_importances = best_model.feature_importances_
feature_names = X_train_balanced.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:\n")
print(importance_df)


Feature Importances:

                             Feature  Importance
1                      painintensity    0.460308
4                 trigger_poor_sleep    0.182021
9                   reported_anxiety    0.106767
8   mean_migraine_duration_past7days    0.078362
5               sleep_duration_hours    0.047898
6         sleep_duration_past_7_days    0.032237
2                  duration_in_hours    0.029339
0            migraine_days_per_month    0.025729
3                     trigger_stress    0.014883
10               reported_depression    0.014712
7         migraine_attacks_past7days    0.007744


In [16]:
# Define the hyperparameter distributions
param_distributions = {
    'n_neighbors': randint(1, 50),  # Number of neighbors to test
    'weights': ['uniform', 'distance'],  # Weighting strategies
    'metric': ['euclidean', 'manhattan', 'minkowski'],  # Distance metrics
    'p': randint(1, 3)  # distance parameter 
}

# Initialize the K-Nearest Neighbors model
knn_model = KNeighborsClassifier()

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=param_distributions,
    n_iter=30,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Confusion Matrix and Classification Report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 4, 'p': 1, 'weights': 'distance'}

AUC Score: 0.5931
Confusion Matrix:
[[1109  651]
 [ 567  554]]

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.63      0.65      1760
           1       0.46      0.49      0.48      1121

    accuracy                           0.58      2881
   macro avg       0.56      0.56      0.56      2881
weighted avg       0.58      0.58      0.58      2881



In [None]:
'''from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from scikeras.wrappers import KerasClassifier
import numpy as np

# Define features and target
features = ['migraine_days_per_month', 'painintensity', 'duration_in_hours', 'trigger_stress', 'trigger_poor_sleep',
            'sleep_duration_hours', 'sleep_duration_past_7_days', 'migraine_attacks_past7days',
            'mean_migraine_duration_past7days', 'reported_anxiety', 'reported_depression']

X = df[features]
y = df['affected_activity_QoL']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Define a function to create the model
def create_model(learning_rate=0.001, neurons=[64, 32, 16]):
    model = Sequential()
    model.add(Dense(neurons[0], activation='relu', input_shape=(X_train_scaled.shape[1],)))
    model.add(Dense(neurons[1], activation='relu'))
    model.add(Dense(neurons[2], activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model for compatibility with RandomizedSearchCV
keras_clf = KerasClassifier(model=create_model, verbose=0)

# Define hyperparameter distributions
param_distributions = {
    "model__learning_rate": [0.001, 0.01, 0.1],
    "model__neurons": [[64, 32, 16], [128, 64, 32], [32, 16, 8]],
    "batch_size": [16, 32, 64],
    "epochs": [20, 50, 100]
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=keras_clf,
    param_distributions=param_distributions,
    n_iter=10,  # Number of random combinations to try
    scoring='roc_auc',  # Focus on auc for hyperparameter tuning
    cv=3,  # 3-fold cross-validation
    verbose=1,
    n_jobs=-1,  # Use all available cores
    random_state=42
)

# Perform hyperparameter tuning
random_search.fit(X_train_scaled, y_train_smote)

# Get the best model and parameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the model on the test set
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]  # Probabilities for the positive class
y_pred_classes = (y_pred_proba > 0.5).astype(int)  # Convert probabilities to binary classes

# Calculate AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Confusion Matrix and Classification Report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_classes))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))
'''