In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report,mean_squared_error, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedBaggingClassifier
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from statsmodels.discrete.discrete_model import MNLogit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [2]:
# put the dataset which comes from long_dataset_feature_engineering.ipynb 
file_path = '/data/caysar9/results/final_long.csv'
df = pd.read_csv(file_path)

In [3]:
# Define features and target
features = ['trigger_lack_physical_activity','trigger_physical_activity','trigger_poor_sleep','trigger_stress','sleep_duration_hours','sleep_duration_past_7_days','age','gender_encoded','migraine_attacks_past7days','mean_migraine_duration_past7days']


X = df[features]
y = df['severe_migraine']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Frequency counts before applying SMOTE
print("Frequency counts before SMOTE:")
print("Frequency counts train set")
print(y_train.value_counts())
print("Frequency counts test set")
print(y_test.value_counts())

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
# Frequency counts before applying SMOTE
print("Frequency counts after SMOTE:")
print("Frequency counts train set")
print(y_train_balanced.value_counts())

Frequency counts before SMOTE:
Frequency counts train set
severe_migraine
0    40897
1     4731
Name: count, dtype: int64
Frequency counts test set
severe_migraine
0    10224
1     1183
Name: count, dtype: int64
Frequency counts after SMOTE:
Frequency counts train set
severe_migraine
0    40897
1    40897
Name: count, dtype: int64


In [7]:
# Define the hyperparameter distributions
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet'],  # Regularization methods
    'C': uniform(0.01, 10),  # Continuous uniform distribution for regularization strength
    'solver': ['liblinear', 'saga'],  # solvers
    'max_iter': randint(100, 1000)  # Randomly sample number of iterations
}

# Initialize the Logistic Regression model
lr_model = LogisticRegression(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lr_model,
    param_distributions=param_distributions,
    n_iter=30,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 30 candidates, totalling 90 fits


36 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/data/caysar9/venv-py311/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/data/caysar9/venv-py311/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/caysar9/venv-py311/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^

Best Hyperparameters: {'C': 0.017787658410143285, 'max_iter': 376, 'penalty': 'l1', 'solver': 'saga'}

AUC Score: 0.8022

Confusion Matrix:
[[8235 1989]
 [ 410  773]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.81      0.87     10224
           1       0.28      0.65      0.39      1183

    accuracy                           0.79     11407
   macro avg       0.62      0.73      0.63     11407
weighted avg       0.88      0.79      0.82     11407



In [8]:
X_train_balanced_const = sm.add_constant(X_train_balanced)

# Fit the logistic regression model on the balanced data
logit_model = sm.Logit(y_train_balanced, X_train_balanced_const).fit()

# Display the summary of the model
print(logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.482364
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:        severe_migraine   No. Observations:                81794
Model:                          Logit   Df Residuals:                    81783
Method:                           MLE   Df Model:                           10
Date:                Sat, 07 Dec 2024   Pseudo R-squ.:                  0.3041
Time:                        20:04:05   Log-Likelihood:                -39455.
converged:                       True   LL-Null:                       -56695.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const                               -1.5145      0.072    -21.02

In [9]:
# Define the hyperparameter distributions
param_distributions = {
    'criterion': ['gini', 'entropy', 'log_loss'],  # Splitting criteria
    'max_depth': randint(5, 50),  # Randomly sample tree depth between 5 and 50
    'min_samples_split': randint(2, 20),  # Minimum samples required to split a node
    'min_samples_leaf': randint(1, 10),  # Minimum samples required in a leaf node
    'max_features': [None, 'sqrt', 'log2']  # Number of features to consider for best split
}

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_distributions,
    n_iter=30,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 26, 'max_features': None, 'min_samples_leaf': 9, 'min_samples_split': 18}

AUC Score: 0.7216

Confusion Matrix:
[[8452 1772]
 [ 671  512]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.83      0.87     10224
           1       0.22      0.43      0.30      1183

    accuracy                           0.79     11407
   macro avg       0.58      0.63      0.58     11407
weighted avg       0.85      0.79      0.81     11407



In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from scipy.stats import randint

# Define the hyperparameter distributions
param_distributions = {
    'n_estimators': randint(50, 300),  # Randomly sample between 50 and 300 trees
    'max_depth': [None, 10, 20, 30],  # Fixed choices for tree depth
    'min_samples_split': randint(2, 11),  # Randomly sample between 2 and 10 for splitting
    'min_samples_leaf': randint(1, 5),  # Randomly sample between 1 and 4 for leaf nodes
    'max_features': ['sqrt', 'log2', None],  # Fixed options for max features
    'bootstrap': [True, False]  # Fixed options for bootstrapping
}

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_distributions,
    n_iter=20,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 20 candidates, totalling 60 fits


Best Hyperparameters: {'bootstrap': False, 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 267}

AUC Score: 0.7934

Confusion Matrix:
[[9106 1118]
 [ 762  421]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91     10224
           1       0.27      0.36      0.31      1183

    accuracy                           0.84     11407
   macro avg       0.60      0.62      0.61     11407
weighted avg       0.86      0.84      0.84     11407



In [11]:
# Get feature importances from the trained Random Forest model
feature_importances = best_model.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': X_train_balanced.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:\n")
print(feature_importance_df)


Feature Importances:

                            Feature  Importance
9  mean_migraine_duration_past7days    0.481145
6                               age    0.142572
4              sleep_duration_hours    0.121943
5        sleep_duration_past_7_days    0.118380
8        migraine_attacks_past7days    0.059767
2                trigger_poor_sleep    0.023258
0    trigger_lack_physical_activity    0.015921
3                    trigger_stress    0.015567
7                    gender_encoded    0.014542
1         trigger_physical_activity    0.006905


In [12]:
# Define the hyperparameter distributions
param_distributions = {
    'n_estimators': randint(50, 200),  # Number of boosting stages
    'learning_rate': uniform(0.01, 0.3),  # Learning rate for boosting
    'max_depth': randint(3, 10),  # Maximum depth of individual estimators
    'min_samples_split': randint(2, 10),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 5),  # Minimum number of samples required to be a leaf node
    'subsample': uniform(0.7, 0.3),  # Fraction of samples used for fitting each base learner
    'max_features': ['sqrt', 'log2', None]  # Number of features to consider at each split
}

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_distributions,
    n_iter=30,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 30 candidates, totalling 90 fits


Best Hyperparameters: {'learning_rate': 0.268219174976903, 'max_depth': 9, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 186, 'subsample': 0.8454489914076949}

AUC Score: 0.7921

Confusion Matrix:
[[9201 1023]
 [ 741  442]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91     10224
           1       0.30      0.37      0.33      1183

    accuracy                           0.85     11407
   macro avg       0.61      0.64      0.62     11407
weighted avg       0.86      0.85      0.85     11407



In [13]:
# Get feature importances from the trained Gradient boosting model
feature_importances = best_model.feature_importances_

# Create a DataFrame to display the feature importances alongside feature names
feature_importance_df = pd.DataFrame({
    'Feature': X_train_balanced.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Display the feature importance table
print("\nFeature Importances:\n")
print(feature_importance_df)


Feature Importances:

                            Feature  Importance
9  mean_migraine_duration_past7days    0.522913
6                               age    0.147251
8        migraine_attacks_past7days    0.085479
4              sleep_duration_hours    0.076219
5        sleep_duration_past_7_days    0.074679
2                trigger_poor_sleep    0.032856
0    trigger_lack_physical_activity    0.019874
3                    trigger_stress    0.018092
7                    gender_encoded    0.015499
1         trigger_physical_activity    0.007139


In [14]:
# Define the hyperparameter distributions
param_distributions = {
    'n_neighbors': randint(1, 50),  # Number of neighbors to test
    'weights': ['uniform', 'distance'],  # Weighting strategies
    'metric': ['euclidean', 'manhattan', 'minkowski'],  # Distance metrics
    'p': randint(1, 3)  # distance parameter 
}

# Initialize the K-Nearest Neighbors model
knn_model = KNeighborsClassifier()

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=param_distributions,
    n_iter=30,  # Number of random combinations to try
    scoring='roc_auc',  # Evaluate based on AUC
    cv=3,  # 3-fold cross-validation
    verbose=1,  # Show progress
    n_jobs=-1,  # Use all available cores
    random_state=42  # For reproducibility
)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and hyperparameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Train the best model on the balanced training set
best_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate and display AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Confusion Matrix and Classification Report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 30 candidates, totalling 90 fits


Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 6, 'p': 2, 'weights': 'distance'}

AUC Score: 0.7579
Confusion Matrix:
[[8198 2026]
 [ 528  655]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.80      0.87     10224
           1       0.24      0.55      0.34      1183

    accuracy                           0.78     11407
   macro avg       0.59      0.68      0.60     11407
weighted avg       0.87      0.78      0.81     11407



In [15]:
'''from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from scikeras.wrappers import KerasClassifier
import numpy as np

# Define features and target
features = ['trigger_lack_physical_activity','trigger_physical_activity','trigger_poor_sleep','trigger_stress','sleep_duration_hours','sleep_duration_past_7_days','age','gender_encoded','migraine_attacks_past7days','mean_migraine_duration_past7days']

X = df[features]
y = df['severe_migraine']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Define a function to create the model
def create_model(learning_rate=0.001, neurons=[64, 32, 16]):
    model = Sequential()
    model.add(Dense(neurons[0], activation='relu', input_shape=(X_train_scaled.shape[1],)))
    model.add(Dense(neurons[1], activation='relu'))
    model.add(Dense(neurons[2], activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model for compatibility with RandomizedSearchCV
keras_clf = KerasClassifier(model=create_model, verbose=0)

# Define hyperparameter distributions
param_distributions = {
    "model__learning_rate": [0.001, 0.01, 0.1],
    "model__neurons": [[64, 32, 16], [128, 64, 32], [32, 16, 8]],
    "batch_size": [16, 32, 64],
    "epochs": [20, 50, 100]
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=keras_clf,
    param_distributions=param_distributions,
    n_iter=10,  # Number of random combinations to try
    scoring='roc_auc',  # Focus on auc for hyperparameter tuning
    cv=3,  # 3-fold cross-validation
    verbose=1,
    n_jobs=-1,  # Use all available cores
    random_state=42
)

# Perform hyperparameter tuning
random_search.fit(X_train_scaled, y_train_smote)

# Get the best model and parameters
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the model on the test set
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]  # Probabilities for the positive class
y_pred_classes = (y_pred_proba > 0.5).astype(int)  # Convert probabilities to binary classes

# Calculate AUC
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc_score:.4f}")

# Confusion Matrix and Classification Report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_classes))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))
'''

'\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Dense\nfrom tensorflow.keras.optimizers import Adam\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.metrics import roc_auc_score, confusion_matrix, classification_report\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom imblearn.over_sampling import SMOTE\nfrom scikeras.wrappers import KerasClassifier\nimport numpy as np\n\n# Define features and target\nfeatures = [\'trigger_lack_physical_activity\',\'trigger_physical_activity\',\'trigger_poor_sleep\',\'trigger_stress\',\'sleep_duration_hours\',\'sleep_duration_past_7_days\',\'age\',\'gender_encoded\',\'migraine_attacks_past7days\',\'mean_migraine_duration_past7days\']\n\nX = df[features]\ny = df[\'affected_activity_QoL\']\n\n# Split data into training and test sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n\