In [74]:
import pandas as pd
import numpy as np
import janitor
from pathlib import Path
import missingno
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from catboost.core import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer

In [75]:
%matplotlib inline

sns.set_theme(
    rc={
      "figure.figsize": (8, 6)
    }
)

sns.set_style("whitegrid")

In [76]:
data_path = Path("data")
train_path = Path("data", "train.csv")
test_path = Path("data", "test_public.csv")
test_private_path = Path("data", "test_private.csv")

train_parquet_path = Path("data", "train.parquet")
test_parquet_path = Path("data", "test_public.parquet")

In [77]:
# train_df = pd.read_csv(train_path)  
test_df = pd.read_csv(test_path)
test_private_df = pd.read_csv(test_private_path)

In [78]:
train_df = pd.read_parquet(train_parquet_path).drop(columns=["ID"])
# test_df = pd.read_parquet(test_parquet_path)

Si no hago esto mi PC Explota

In [79]:
train_df.dropna(inplace=True)

In [80]:
target_col = "CHD_OR_MI"
numerical_cols = ["AGE", "BMI"]
categorical_cols = list(set(train_df.columns) - set(numerical_cols) - set(["ID", "CHD_OR_MI"]))
# categorical_cols = ["SEX","CANCER","DIFFICULTY_WALKING", "HIV", "SMOKE", "MENTAL_HEALTH"]
features_cols = numerical_cols + categorical_cols
train_df[categorical_cols] = train_df[categorical_cols].astype("category")
test_df[categorical_cols] = test_df[categorical_cols].astype("category")
test_private_df[categorical_cols] = test_private_df[categorical_cols].astype("category")

In [81]:
X_train = train_df.drop(columns=['CHD_OR_MI'])
y_train = train_df['CHD_OR_MI']

X_test = test_df.drop(columns=['CHD_OR_MI', 'ID'])
y_test = test_df['CHD_OR_MI']

X_test_private = test_private_df.drop(columns=['ID'])

In [82]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class BMITransformer(BaseEstimator, TransformerMixin):

    def __init__(self, 
                 bins=[0, 20, 25, 30, 35, 40, np.inf],
                 labels=[1, 2, 3, 4, 5, 6]):
                         
                #  labels=["Underweight", "Normal", "Overweight", "Obesity I", 
                        #  "Obesity II", "Extreme Obesity"]):
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.astype(float)

        # Dividir por 100 para desescalar
        X_real = X / 100.0

        # Binning con pandas cut
        # pd.cut retorna Series categórica, la convertimos a array
        X_binned = pd.cut(
            X_real,
            bins=self.bins,
            labels=self.labels,
            include_lowest=True
        )

        # Retornamos como un array de shape (n_samples, 1)
        return np.array(X_binned).reshape(-1, 1)


class AGETransformer(BaseEstimator, TransformerMixin):

    def __init__(self,
                 bins=[0, 20, 25 , 35, 50, 65, np.inf],
                 labels=[1, 2, 3, 4, 5, 6]):
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        """
        Espera que X sea un array/serie de una sola columna (AGE).
        """
        X = X.astype(float)  # Asegurarse de tener float
        X_binned = pd.cut(
            X,
            bins=self.bins,
            labels=self.labels,
            include_lowest=True
        )
        return np.array(X_binned).reshape(-1, 1)


In [83]:
bmi_pipeline = Pipeline(steps=[
    ("bmi_transformer", BMITransformer()),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

# Pipeline para binning de AGE
age_pipeline = Pipeline(steps=[
    ("age_transformer", AGETransformer()),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("bmi_bin", bmi_pipeline, "BMI"),           # Aplica a la col BMI
        ("age_bin", age_pipeline, "AGE"), 
        # ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop"  
)


In [84]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
X_test_private_processed = preprocessor.transform(X_test_private)

In [85]:
X_train_processed = pd.DataFrame(X_train_processed, columns=features_cols)
X_test_processed = pd.DataFrame(X_test_processed, columns=features_cols)


In [86]:
# smote = SMOTE(random_state=42)
# X_train_bal, y_train_bal = smote.fit_resample(X_train_processed, y_train)


In [87]:
# y_train_bal.value_counts()

In [94]:
model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='F1',
    random_seed=52,
    verbose=200,
    class_weights=[5,1],
    depth=15,               
    iterations=1000,       
    learning_rate=0.01,
    l2_leaf_reg=5,        # Regularización L2 (controla sobreajuste)
    # subsample=0.8,        # Selecciona ~80% de muestras en cada árbol (bagging)
    # bootstrap_type='Bernoulli',  # Estrategia de muestreo más estable
    # random_strength=2,    # Controla la aleatoriedad de splits
    od_type='Iter',       # Early stopping: deja de entrenar si no mejora en X iteraciones seguidas
    od_wait=50,           # Paciencia: espera 50 iteraciones sin mejora
    # use_best_model=True   

)
model.fit(X_train_processed, y_train)

0:	learn: 0.8578876	total: 732ms	remaining: 12m 11s
200:	learn: 0.9034835	total: 3m 43s	remaining: 14m 47s
400:	learn: 0.9200984	total: 7m 46s	remaining: 11m 36s
600:	learn: 0.9302717	total: 12m 19s	remaining: 8m 10s
800:	learn: 0.9377165	total: 16m 43s	remaining: 4m 9s
999:	learn: 0.9434925	total: 21m 22s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x214767a0800>

In [95]:
# Predicciones y evaluación
y_pred = model.predict(X_test_processed)
print(f"F1-Score: {f1_score(y_test, y_pred)}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

F1-Score: 0.9369652742828385

Classification Report:
               precision    recall  f1-score   support

         0.0       0.31      0.33      0.32      3532
         1.0       0.94      0.93      0.94     39874

    accuracy                           0.88     43406
   macro avg       0.62      0.63      0.63     43406
weighted avg       0.89      0.88      0.89     43406


Confusion Matrix:
 [[ 1161  2371]
 [ 2639 37235]]


In [96]:
y_pred_proba = model.predict_proba(X_test_processed)[:, 1]

# Ajuste del umbral
thresholds = np.linspace(0, 1, 100)
f1_scores = []

for threshold in thresholds:
    y_pred_temp = (y_pred_proba >= threshold).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_temp))

# Mejor umbral basado en F1-Score
best_threshold = thresholds[np.argmax(f1_scores)]
print(f"Mejor umbral para F1-Score: {best_threshold:.2f}")
# Predicciones finales con el mejor umbral
y_pred = (y_pred_proba >= best_threshold).astype(int)
print(f"F1-Score: {f1_score(y_test, y_pred)}")

# Evaluación final
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Mejor umbral para F1-Score: 0.14
F1-Score: 0.957736320800077

Classification Report:
               precision    recall  f1-score   support

         0.0       0.59      0.01      0.03      3532
         1.0       0.92      1.00      0.96     39874

    accuracy                           0.92     43406
   macro avg       0.76      0.51      0.49     43406
weighted avg       0.89      0.92      0.88     43406


Confusion Matrix:
 [[   52  3480]
 [   36 39838]]


In [97]:
# y_test_private_pred = model.predict_proba(X_test_private_processed)[:, 1]
# y_test_private_pred = (y_test_private_pred >= best_threshold).astype(int)
y_test_private_pred = model.predict(X_test_private_processed)

submission_private = pd.DataFrame({
    "ID": test_private_df["ID"],
    "CHD_OR_MI": y_test_private_pred
})

submission_public = pd.DataFrame({
  "ID": test_df["ID"],
  "CHD_OR_MI": y_pred
})
submission_df = pd.concat([submission_private, submission_public] ,ignore_index=True)
# submission_df["CHD_OR_MI"] = submission_df["CHD_OR_MI"].astype
submission_df
submission_df.to_csv("submission.csv", index=False)

In [98]:
model.get_feature_importance(prettified=True)


Unnamed: 0,Feature Id,Importances
0,BMI,9.136773
1,FRIED_POTATOES,6.922738
2,AGE,6.89931
3,SEX,6.357314
4,MENTAL_HEALTH,5.735456
5,HIGH_CHOLESTEROL,5.728742
6,PHYSICAL_HEALTH,5.51519
7,SMOKE,5.181498
8,BLOOD_PRESSURE,5.017767
9,FRUITS,4.747164


In [99]:
# import pandas as pd
# import numpy as np
# from catboost import Pool

# # Suponiendo que tienes:
# # model (tu CatBoostClassifier ya entrenado)
# # X_train_processed (numpy array o DataFrame)
# # y_train (Series o array)

# # 1) Crea un Pool (objeto interno de CatBoost) con tus datos
# train_pool = Pool(data=X_train_processed, label=y_train)

# # 2) Obtener valores SHAP
# shap_values = model.get_feature_importance(
#     type="ShapValues",    # Indica que deseas valores SHAP
#     data=train_pool
# )
