In [42]:
import pandas as pd
import numpy as np
import janitor
from pathlib import Path
import missingno
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from catboost.core import CatBoostClassifier


In [43]:
%matplotlib inline

sns.set_theme(
    rc={
      "figure.figsize": (8, 6)
    }
)

sns.set_style("whitegrid")

In [44]:
data_path = Path("data")
train_path = Path("data", "train.csv")
test_path = Path("data", "test_public.csv")
test_private_path = Path("data", "test_private.csv")

train_parquet_path = Path("data", "train.parquet")
test_parquet_path = Path("data", "test_public.parquet")

In [45]:
# train_df = pd.read_csv(train_path)  
test_df = pd.read_csv(test_path)
test_private_df = pd.read_csv(test_private_path)

In [46]:
train_df = pd.read_parquet(train_parquet_path).drop(columns=["ID"])
# test_df = pd.read_parquet(test_parquet_path)

Si no hago esto mi PC Explota

In [47]:
train_df.dropna(inplace=True)

In [48]:
target_col = "CHD_OR_MI"
numerical_cols = ["AGE", "BMI"]
categorical_cols = list(set(train_df.columns) - set(numerical_cols) - set(["ID", "CHD_OR_MI"]))
train_df[categorical_cols] = train_df[categorical_cols].astype("category")
test_df[categorical_cols] = test_df[categorical_cols].astype("category")
test_private_df[categorical_cols] = test_private_df[categorical_cols].astype("category")

In [49]:
X_train = train_df.drop(columns=['CHD_OR_MI'])
y_train = train_df['CHD_OR_MI']

X_test = test_df.drop(columns=['CHD_OR_MI', 'ID'])
y_test = test_df['CHD_OR_MI']

X_test_private = test_private_df.drop(columns=['ID'])

In [50]:
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="passthrough"  # Si deseas dejar columnas adicionales sin tocar
)



In [51]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
X_test_private_processed = preprocessor.transform(X_test_private)

In [52]:
categorical_feature_names = preprocessor.named_transformers_["cat"].named_steps["onehot"].get_feature_names_out(categorical_cols)
all_feature_names = numerical_cols + list(categorical_feature_names)
X_train_processed = pd.DataFrame(X_train_processed, columns=all_feature_names)
X_test_processed = pd.DataFrame(X_test_processed, columns=all_feature_names)


In [None]:
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(random_state=42)
X_train_bal, y_train_bal = smt_tomek.fit_resample(X_train_processed, y_train)




In [66]:
X_train_fold, X_val_fold, y_train_fold, y_val_fold = train_test_split(
    X_train_bal, 
    y_train_bal, 
    test_size=0.2,
    random_state=52, 
    stratify=y_train_bal
)

In [67]:
model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='F1',
    random_seed=52,
    verbose=100,
    depth=7,
    iterations=1000,
    learning_rate=0.1,
    l2_leaf_reg=3,
    subsample=0.8,
    bootstrap_type='Bernoulli',
    random_strength=2,
    border_count=128,
    od_type='Iter',             # Activar early stopping
    od_wait=50,                 # Espera 50 iteraciones sin mejora
    use_best_model=True,         # Usa el mejor modelo según eval_set
    verbose=100,
)

model.fit(
    X_train_fold, 
    y_train_fold,
    eval_set=(X_val_fold, y_val_fold)
)

0:	learn: 0.7699575	test: 0.7716070	best: 0.7716070 (0)	total: 72.6ms	remaining: 1m 12s
100:	learn: 0.9384710	test: 0.9380723	best: 0.9380723 (100)	total: 7.54s	remaining: 1m 7s
200:	learn: 0.9489704	test: 0.9484990	best: 0.9484990 (200)	total: 13.7s	remaining: 54.5s
300:	learn: 0.9519242	test: 0.9504090	best: 0.9504208 (299)	total: 20.2s	remaining: 46.9s
400:	learn: 0.9535374	test: 0.9508295	best: 0.9508295 (400)	total: 27.5s	remaining: 41.1s
500:	learn: 0.9545947	test: 0.9508993	best: 0.9509753 (483)	total: 33.8s	remaining: 33.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9510059172
bestIteration = 508

Shrink model to first 509 iterations.


<catboost.core.CatBoostClassifier at 0x15d0c91be00>

In [68]:
# Predicciones y evaluación
y_pred = model.predict(X_test_processed)
print(f"F1-Score: {f1_score(y_test, y_pred)}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

F1-Score: 0.9565901988808411

Classification Report:
               precision    recall  f1-score   support

         0.0       0.46      0.09      0.16      3532
         1.0       0.93      0.99      0.96     39874

    accuracy                           0.92     43406
   macro avg       0.69      0.54      0.56     43406
weighted avg       0.89      0.92      0.89     43406


Confusion Matrix:
 [[  333  3199]
 [  385 39489]]


In [69]:
y_pred_proba = model.predict_proba(X_test_processed)[:, 1]

# Ajuste del umbral
thresholds = np.linspace(0, 1, 100)
f1_scores = []

for threshold in thresholds:
    y_pred_temp = (y_pred_proba >= threshold).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_temp))

# Mejor umbral basado en F1-Score
best_threshold = thresholds[np.argmax(f1_scores)]
print(f"Mejor umbral para F1-Score: {best_threshold:.2f}")
# Predicciones finales con el mejor umbral
y_pred = (y_pred_proba >= best_threshold).astype(int)
print(f"F1-Score: {f1_score(y_test, y_pred)}")

# Evaluación final
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Mejor umbral para F1-Score: 0.38
F1-Score: 0.957970246978961

Classification Report:
               precision    recall  f1-score   support

         0.0       0.60      0.03      0.06      3532
         1.0       0.92      1.00      0.96     39874

    accuracy                           0.92     43406
   macro avg       0.76      0.52      0.51     43406
weighted avg       0.90      0.92      0.89     43406


Confusion Matrix:
 [[  118  3414]
 [   78 39796]]


In [70]:
# y_test_private_pred = model.predict_proba(X_test_private_processed)[:, 1]
# y_test_private_pred = (y_test_private_pred >= best_threshold).astype(int)
y_test_private_pred = model.predict(X_test_private_processed)

submission_private = pd.DataFrame({
    "ID": test_private_df["ID"],
    "CHD_OR_MI": y_test_private_pred
})

submission_public = pd.DataFrame({
  "ID": test_df["ID"],
  "CHD_OR_MI": y_pred
})
submission_df = pd.concat([submission_private, submission_public] ,ignore_index=True)
# submission_df["CHD_OR_MI"] = submission_df["CHD_OR_MI"].astype
submission_df
submission_df.to_csv("submission.csv", index=False)