In [45]:
import pandas as pd
import numpy as np
import janitor
from pathlib import Path
import missingno
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import scipy
import sklearn.compose
import sklearn.impute
import sklearn.preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [46]:
%matplotlib inline

sns.set_theme(
    rc={
      "figure.figsize": (8, 6)
    }
)

sns.set_style("whitegrid")

In [47]:
data_path = Path("data")
train_path = Path("data", "train.csv")
test_path = Path("data", "test_public.csv")
test_private_path = Path("data", "test_private.csv")

train_parquet_path = Path("data", "train.parquet")
test_parquet_path = Path("data", "test_public.parquet")

In [48]:
# train_df = pd.read_csv(train_path)  
test_df = pd.read_csv(test_path)
test_private_df = pd.read_csv(test_private_path)

In [49]:
train_df = pd.read_parquet(train_parquet_path).drop(columns=["ID"])
# test_df = pd.read_parquet(test_parquet_path)

In [50]:
target_col = "CHD_OR_MI"
numerical_cols = ["AGE", "BMI"]
categorical_cols = list(set(train_df.columns) - set(numerical_cols) - set(["ID", "CHD_OR_MI"]))
train_df[categorical_cols] = train_df[categorical_cols].astype("category")
test_df[categorical_cols] = test_df[categorical_cols].astype("category")
test_private_df[categorical_cols] = test_private_df[categorical_cols].astype("category")



In [7]:
class ImputerG:
    def __init__(self):
        self.num_imputer = sklearn.impute.SimpleImputer(strategy="median")
        self.cat_imputer = sklearn.impute.SimpleImputer(strategy="most_frequent")
    
    def fit(self, df):
        self.num_imputer.fit(df[numerical_cols])
        self.cat_imputer.fit(df[categorical_cols])
        return self

    def fit_transform(self, df):
        df.loc[:, numerical_cols] = self.num_imputer.fit_transform(df[numerical_cols])
        df.loc[:, categorical_cols] = self.cat_imputer.fit_transform(df[categorical_cols])
        return df

    def transform(self, df):
        df.loc[:, numerical_cols] = self.num_imputer.transform(df[numerical_cols])
        df.loc[:, categorical_cols] = self.cat_imputer.transform(df[categorical_cols])
        return df


Local imputer class 0

In [8]:
# train_df_class_0 = train_df.query("CHD_OR_MI == 0").copy()

# local_class_0_imputer = sklearn.impute.KNNImputer()
# train_df_class_0 = pd.DataFrame(
#     local_class_0_imputer.fit_transform(train_df_class_0),
#     columns=train_df.columns,
#     index=train_df_class_0.index  # Preservar el índice original
# )

Local imputer class 1

In [9]:
# train_df_class_1 = train_df.query("CHD_OR_MI == 1").copy()

# local_class_1_imputer = ImputerG()
# train_df_class_1 = local_class_1_imputer.fit_transform(train_df_class_1)

In [10]:
# train_df = pd.concat([train_df_class_0, train_df_class_1])

Global imputer

In [11]:
train_df.dropna(inplace=True)

In [12]:
global_imputer = ImputerG()
train_df = global_imputer.fit_transform(train_df)

In [13]:
test_df = global_imputer.transform(test_df)
test_private_df = global_imputer.transform(test_private_df)

In [14]:
X_train = train_df.drop(columns=['CHD_OR_MI'])
y_train = train_df['CHD_OR_MI']

X_test = test_df.drop(columns=['CHD_OR_MI', 'ID'])
y_test = test_df['CHD_OR_MI']

X_test_private = test_private_df.drop(columns=['ID'])

In [15]:
from imblearn.over_sampling import SMOTE
# Balanceo con SMOTEENN
smote_enn = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote_enn.fit_resample(X_train, y_train)



In [16]:
X_train_balanced, y_train_balanced = X_train, y_train

In [17]:
y_train_balanced.value_counts(normalize=True)

CHD_OR_MI
1.0    0.91374
0.0    0.08626
Name: proportion, dtype: float64

In [26]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
import numpy as np
import lightgbm as lgb

# Espacio de hiperparámetros para buscar
param_grid = {
    'num_leaves': [15, 31, 63],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 300, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Configuración del modelo LGBM base
lgbm_model = lgb.LGBMClassifier(
    objective='binary',
    random_state=42
)

# Configurar GridSearchCV con F1-Score como métrica
grid_search = GridSearchCV(
    estimator=lgbm_model,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, average='macro'),
    cv=3,  # Validación cruzada con 3 particiones
    verbose=3,
    n_jobs=-1
)

# Ejecutar la búsqueda
grid_search.fit(X_train_balanced, y_train_balanced)

# Obtener los mejores hiperparámetros
best_params = grid_search.best_params_
print(f"Mejores Hiperparámetros: {best_params}")

# Entrenar el modelo con los mejores parámetros
best_lgbm_model = grid_search.best_estimator_

# Predicciones de probabilidades
y_pred_proba = best_lgbm_model.predict_proba(X_test)[:, 1]

# Ajuste del umbral basado en F1-Score
thresholds = np.linspace(0.1, 0.9, 100)
f1_scores = []

for threshold in thresholds:
    y_pred_temp = (y_pred_proba >= threshold).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_temp))

# Encontrar el mejor umbral
best_threshold = thresholds[np.argmax(f1_scores)]
print(f"Mejor umbral para F1-Score: {best_threshold:.2f}")

# Predicciones finales con el mejor umbral
y_pred = (y_pred_proba >= best_threshold).astype(int)

# Evaluación final
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))




Fitting 3 folds for each of 729 candidates, totalling 2187 fits
[LightGBM] [Info] Number of positive: 193733, number of negative: 18289
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024613 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 402
[LightGBM] [Info] Number of data points in the train set: 212022, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.913740 -> initscore=2.360181
[LightGBM] [Info] Start training from score 2.360181
Mejores Hiperparámetros: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 8, 'n_estimators': 500, 'num_leaves': 63, 'subsample': 0.6}
Mejor umbral para F1-Score: 0.16

Classification Report:
               precision    recall  f1-score   support

         0.0       0.65      0.01      0.01      3532
         1.0       0.92      1.00      0.96     39874

    accurac

In [38]:
# # Obtener la importancia de las características
# feature_importances = model.get_feature_importance(prettified=True)

# # Mostrar las importancias
# print(feature_importances)
model=best_lgbm_model

In [39]:
y_test_private_pred = model.predict_proba(X_test_private)[:, 1]
y_test_private_pred = (y_test_private_pred >= best_threshold).astype(int)
submission_private = pd.DataFrame({
    "ID": test_private_df["ID"],
    "CHD_OR_MI": y_test_private_pred
})

In [40]:
submission_public = pd.DataFrame({
  "ID": test_df["ID"],
  "CHD_OR_MI": y_pred
})

In [41]:
submission_df = pd.concat([submission_private, submission_public] ,ignore_index=True)
# submission_df["CHD_OR_MI"] = submission_df["CHD_OR_MI"].astype
submission_df

Unnamed: 0,ID,CHD_OR_MI
0,PID2022_152435,1
1,PID2022_299594,1
2,PID2022_065147,1
3,PID2022_333651,1
4,PID2022_317306,1
...,...,...
86807,PID2022_256399,1
86808,PID2022_326390,1
86809,PID2022_178405,1
86810,PID2022_220522,1


In [42]:
submission_df.to_csv("submission.csv", index=False)

In [43]:
y_test.value_counts(normalize=True)

CHD_OR_MI
1.0    0.918629
0.0    0.081371
Name: proportion, dtype: float64

In [44]:
submission_df["CHD_OR_MI"].value_counts(normalize=True)

CHD_OR_MI
1    0.999332
0    0.000668
Name: proportion, dtype: float64