In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from IPython.display import Markdown, display
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_recall_curve, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from interpret.glassbox import (LogisticRegression,
                                ClassificationTree,
                                ExplainableBoostingClassifier)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from interpret.blackbox import LimeTabular
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NearMiss, TomekLinks
from imblearn.combine import SMOTETomek
from interpret import show
import shap
%matplotlib inline

In [4]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='../csv/df.csv', index_col=0)

In [5]:
# Summarize the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5372 entries, 73557 to 83727
Data columns (total 60 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   gender                        5372 non-null   object 
 1   age                           5372 non-null   int64  
 2   race                          5372 non-null   object 
 3   education_level               5372 non-null   object 
 4   marital_status                5372 non-null   object 
 5   people_in_the_household       5372 non-null   int64  
 6   annual_household_income       5372 non-null   int64  
 7   citizenship_status            5372 non-null   object 
 8   total_cholesterol             5372 non-null   float64
 9   HDL-cholesterol               5372 non-null   float64
 10  LDL-cholesterol               5372 non-null   float64
 11  triglyceride                  5372 non-null   float64
 12  lymphocyte_percent            5372 non-null   float64
 13

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,gender,age,race,education_level,marital_status,people_in_the_household,annual_household_income,citizenship_status,total_cholesterol,HDL-cholesterol,...,number_cigarettes_last_month,hours_sleep,trouble_sleeping_history,minutes_sedentary_activity,physical_activity_level,macro_score,mineral_score,vitamin_score,total_nutrient_score,depression_status
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
73557,male,69,black,high_school,separated,3,4,citizen,167.0,65.0,...,0,7,Yes,600,sedentary,3,3,2,8,not_depressed
73558,male,54,white,high_school,married,4,7,citizen,170.0,50.0,...,1,9,No,540,moderately_active,3,1,1,5,not_depressed
73559,male,72,white,some_college,married,2,10,citizen,126.0,60.0,...,0,8,No,300,moderately_active,3,0,3,6,not_depressed
73561,female,73,white,college_graduate,married,2,12,citizen,201.0,85.0,...,0,9,No,480,moderately_active,3,0,2,5,not_depressed
73562,male,56,mexican,some_college,divorced,1,9,citizen,226.0,38.0,...,0,5,No,360,active,1,0,5,6,depressed


In [7]:
# List every column name in the frame.
list(df.columns)

['gender',
 'age',
 'race',
 'education_level',
 'marital_status',
 'people_in_the_household',
 'annual_household_income',
 'citizenship_status',
 'total_cholesterol',
 'HDL-cholesterol',
 'LDL-cholesterol',
 'triglyceride',
 'lymphocyte_percent',
 'monocyte_percent',
 'neutrophils_percent',
 'eosinophils_percent',
 'basophils_percent',
 'hematocrit_percent',
 'vitamin_B12',
 'albumin',
 'AST',
 'ALT',
 'blood_urea_nitrogen',
 'bicarbonate',
 'total_calcium',
 'CPK',
 'chloride',
 'creatinine',
 'globulin',
 'glucose',
 'gamma_glutamyl_transferase',
 'iron',
 'potassium',
 'lactate_dehydrogenase',
 'sodium',
 'osmolality',
 'phosphorus',
 'total_bilirubin',
 'total_protein',
 'uric_acid',
 'pulse',
 'irregular_pulse',
 'systolic_blood_pressure',
 'diastoli_blood_pressure',
 'BMI',
 'alcohol_lifetime',
 'binge_drinking_lifetime',
 'number_drinks_last_year',
 'age_started_smoking',
 'smoke_lifetime',
 'number_cigarettes_last_month',
 'hours_sleep',
 'trouble_sleeping_history',
 'minutes_

In [8]:
%run "functions.ipynb"
features = ['alcohol_lifetime',
             'binge_drinking_lifetime',
             'number_drinks_last_year',
             'age_started_smoking',
             'smoke_lifetime',
             'number_cigarettes_last_month',
             'hours_sleep',
             'trouble_sleeping_history',
             'minutes_sedentary_activity',
             'physical_activity_level',
             'macro_score',
             'mineral_score',
             'vitamin_score',
             'total_nutrient_score',
             'depression_status']
df_quest_diet_dem = select_features(df, features)
df_quest_diet_dem = preprocess_binary_responses(df_quest_diet_dem)
df_quest_diet_dem = one_hot_encode_object_columns(df_quest_diet_dem)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5372 entries, 73557 to 83727
Data columns (total 17 columns):
 #   Column                                     Non-Null Count  Dtype
---  ------                                     --------------  -----
 0   alcohol_lifetime                           5372 non-null   int64
 1   binge_drinking_lifetime                    5372 non-null   int64
 2   number_drinks_last_year                    5372 non-null   int64
 3   age_started_smoking                        5372 non-null   int64
 4   smoke_lifetime                             5372 non-null   int64
 5   number_cigarettes_last_month               5372 non-null   int64
 6   hours_sleep                                5372 non-null   int64
 7   trouble_sleeping_history                   5372 non-null   int64
 8   minutes_sedentary_activity                 5372 non-null   int64
 9   macro_score                                5372 non-null   int64
 10  mineral_score                              

In [9]:
# Standardized variant of the feature frame -- currently disabled, so `df_std`
# is never created by the cells in this notebook.
# df_std = standarized_data(df_quest_diet_dem)
# df_std.head()

In [10]:
# Optional normalization of the feature frame -- currently disabled.
# df_quest_diet_dem = normalized_data(df_quest_diet_dem)
# df_quest_diet_dem.head()

SPLIT TRAIN AND TEST

In [11]:
# Train/test split of the standardized frame -- disabled together with the
# `df_std` cell, so X_train_std / X_test_std / y_train_std / y_test_std do not
# exist unless both cells are re-enabled.
# %run "functions.ipynb"
# X_train_std, X_test_std, y_train_std, y_test_std = split_data(df_std, target_column='depression_status', test_size=0.3, random_state=42)

In [12]:
%run "functions.ipynb"
X_train, X_test, y_train, y_test = split_data(df_quest_diet_dem, target_column='depression_status', test_size=0.3, random_state=42)

OVERSAMPLING

In [13]:
%run "functions.ipynb"
X_train, y_train = oversample_data(X_train, y_train, strategy='auto')

Before oversampling: [(0, 3402), (1, 358)]
After oversampling: [(0, 3402), (1, 3402)]


UNDERSAMPLING

In [14]:
%run "functions.ipynb"
X_train_std, y_train_std = undersample_data(X_train_std, y_train_std)


NameError: name 'X_train_std' is not defined

MODELS: DECISION TREE, RANDOM FOREST + SHAP, LIME

In [None]:
%run "functions.ipynb"
decision_tree_classification(X_train, y_train, X_test, y_test, max_depth=4, random_state=42)

In [None]:
%run "functions.ipynb"
# Run the `explain_tree` helper on the train/test split (presumably defined in
# functions.ipynb -- verify there what it fits and what it displays).
explain_tree(X_train, y_train, X_test, y_test)

Explainable Boosting Machine

In [None]:
%run "functions.ipynb"
# Run the `explain_ebm` helper on the train/test split (presumably an
# Explainable Boosting Machine defined in functions.ipynb -- verify there).
explain_ebm(X_train, y_train, X_test, y_test)

Random Forest (black-box model)

In [None]:
%run "functions.ipynb"
rf = random_forest(X_train, y_train, X_test, y_test, n_estimators=100, random_state=42, class_weight='balanced')

In [None]:
# Gradient boosting baseline.
# The previous version of this cell assigned the result of random_forest() to
# `gbm`, i.e. it trained a second random forest rather than a gradient
# boosting model. Use the GradientBoostingClassifier imported at the top of
# the notebook directly, seeded like the other models.
gbm = GradientBoostingClassifier(random_state=42)
gbm.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, gbm.predict(X_test)))

In [None]:
%run "functions.ipynb"
lr = lr(X_train_std, y_train_std, X_test_std, y_test_std)

In [None]:
# `shap` is already imported in the imports cell at the top of the notebook.
shap.initjs()

# Train the logistic regression model. The standardized split (X_train_std,
# ...) is commented out earlier in the notebook, so use the split that exists.
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# A 100-row background sample of the training data for KernelExplainer.
background = shap.sample(X_train, 100)

# Create the explainer BEFORE requesting SHAP values (the previous version
# called explainer.shap_values() before `explainer` was assigned).
explainer = shap.KernelExplainer(logistic_model.predict_proba, background)

# SHAP values for the whole test split (this can take a while).
# NOTE(review): the `[1]` indexing below assumes shap returns one array per
# class for predict_proba -- confirm for the installed shap version.
shap_values = explainer.shap_values(X_test)

# Force plot for test rows 1..29: slice the SHAP values and the features with
# the same row range so their shapes match.
rows = slice(1, 30)
display(shap.force_plot(explainer.expected_value[1], shap_values[1][rows], X_test.iloc[rows]))

# Summary plot over the same rows the SHAP values were computed for.
shap.summary_plot(shap_values, X_test)

In [None]:
shap.initjs()
# `shap_values` is computed for the whole test split, so take the same 300
# rows from both the SHAP values and the feature frame; passing all SHAP rows
# with only 300 feature rows gives mismatched shapes.
rows_shown = slice(0, 300)
display(shap.force_plot(explainer.expected_value[1], shap_values[1][rows_shown], X_test.iloc[rows_shown]))

In [None]:
# X_test_std is never defined (the standardized split is commented out), so
# plot against the test frame that exists.
shap.summary_plot(shap_values, X_test)

In [None]:
# Initialize the base classifiers.
knn = KNeighborsClassifier()
# Seeded like the other models in this notebook so results are reproducible.
# NOTE(review): this rebinds `rf`, replacing whatever random_forest() returned
# earlier; later cells that receive `rf` get this new, unfitted estimator.
rf = RandomForestClassifier(random_state=42)
# NOTE(review): `lr` is not part of the voting ensemble built below, and the
# name is also used for a helper function called earlier in the notebook.
lr = LogisticRegression()

In [None]:
# Build the voting ensemble from the KNN and random-forest base classifiers.
# Soft voting is used here; pass voting='hard' for a majority vote instead.
voting_classifier = VotingClassifier(
    estimators=[('knn', knn), ('rf', rf)],
    voting='soft',
)

In [None]:
# Fit the voting ensemble on the training data.
voting_classifier.fit(X_train, y_train)

In [None]:
%run "functions.ipynb"
voting_classifier = voting_classifier(X_train, y_train, X_test, y_test,voting='hard')

In [None]:
# Hyper-parameter grid for the random forest: 3 x 3 x 3 = 27 combinations,
# each evaluated with 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Mejores hiperparámetros: {best_params}")
# A classifier's score() is mean accuracy, not precision, so the message is
# labelled as accuracy (the previous label read "Precisión").
print(f"Exactitud (accuracy) del mejor modelo: {best_model.score(X_test, y_test):.2f}")

LIME

In [None]:
%run "functions.ipynb"
lime_explanation = lime_explanation(rf, X_train, X_test, y_test, num_samples=500)

SHAP

In [None]:
%run "functions.ipynb"
# SHAP explanation for `rf` using start_index=1 and end_index=30 on the test
# split (presumably a row range of X_test -- verify in functions.ipynb whether
# the end index is inclusive).
shap_explanation(rf, X_test, start_index=1, end_index=30)