# Aprendizaje Supervisado
Este archivo contiene el preprocesamiento y el entrenamiento realizados para los modelos de aprendizaje supervisado.

**Importación de las librerías y carga de datos**

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn.over_sampling  as ovs
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    average_precision_score,
    confusion_matrix,
    roc_auc_score,
)
import shap
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV



In [None]:
# Load the pre-split train/validation features and targets, in the order
# X_train, y_train, X_val, y_val.
X_train, y_train, X_val, y_val = map(
    pd.read_csv,
    (
        '/kaggle/input/preprocessedbasedadatos/X_train.csv',
        '/kaggle/input/preprocessedbasedadatos/y_train.csv',
        '/kaggle/input/preprocessedbasedadatos/X_val.csv',
        '/kaggle/input/preprocessedbasedadatos/y_val.csv',
    ),
)

**Aplicación de Label Encoding**

In [None]:
# Label-encode every object/category column. Each encoder is fitted on the
# training split only; validation values never seen during fitting are mapped
# to a dedicated 'UNK' class appended after the fitted classes.
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

for col in cat_cols:
    le = LabelEncoder()
    # Cast to str so missing values and mixed types become plain labels.
    X_train[col] = le.fit_transform(X_train[col].astype(str))

    if col not in X_val.columns:
        continue

    valores_val = X_val[col].astype(str)
    no_vistas = ~valores_val.isin(le.classes_)

    if no_vistas.any():
        # Register the fallback label once, then route unseen values to it.
        if 'UNK' not in le.classes_:
            le.classes_ = np.append(le.classes_, 'UNK')
        valores_val = valores_val.mask(no_vistas, 'UNK')

    X_val[col] = le.transform(valores_val)

print("\nNuevas dimensiones:")
print("Train:", X_train.shape)
print("Val:", X_val.shape)

**Aplicación de Hashing Encoding**

In [None]:
# Disabled experiment: hashing encoding of the categorical columns as an
# alternative to label encoding. Everything in this cell is commented out.
# NOTE(review): if re-enabled after the label-encoding cell, the columns in
# cat_cols have already been replaced by integer codes in X_train/X_val --
# confirm which of the two encodings is meant to run.

#from category_encoders import HashingEncoder

#encoder = HashingEncoder(cols=cat_cols, n_components=8)  # try 8-16, depending on cardinality

#X_train_cat = encoder.fit_transform(X_train[cat_cols])
#X_val_cat   = encoder.transform(X_val[cat_cols])

# Rebuild the datasets: original numeric columns + encoded categorical columns
#X_train_final = pd.concat(
  #  [X_train.drop(columns=cat_cols, errors='ignore'), X_train_cat],
 #   axis=1
#)
#X_val_final = pd.concat(
#    [X_val.drop(columns=cat_cols, errors='ignore'), X_val_cat],
#    axis=1
#)

# Overwrite the working variables
#X_train = X_train_final
#X_val = X_val_final

#print("\nNuevas dimensiones:")
#print("Train:", X_train.shape)
#print("Val:", X_val.shape)

#print("\nNaNs en train:", X_train.isna().sum().sum())
#print("NaNs en val:", X_val.isna().sum().sum())

**Selección de columnas**

In [None]:
# Rank every feature by its mutual information with the target and keep only
# the `top` highest-scoring columns in both the train and validation splits.
top = 250  # number of columns to keep; also drives the plot and its title

# np.ravel flattens the target to 1-D: y_train comes from read_csv, so it is
# presumably a single-column DataFrame, which scikit-learn would otherwise
# convert with a DataConversionWarning.
importancias = mutual_info_classif(X_train, np.ravel(y_train), random_state=42)
importancias = pd.Series(importancias, index=X_train.columns).sort_values(ascending=False)
top_columnas = importancias.head(top).index.tolist()

X_train = X_train[top_columnas]
X_val = X_val[top_columnas]

# Plot exactly the selected columns; the title is derived from `top` so it can
# no longer disagree with the number of bars shown.
plt.figure(figsize=(10,6))
importancias.head(top).plot(kind='bar')
plt.title(f'Top {top} columnas según relevancia')
plt.tight_layout()
plt.show()

In [None]:


# Disabled experiment: recursive feature elimination with cross-validation
# (RFECV) around a LightGBM classifier, scored by average precision.
# Everything in this cell is commented out.
# NOTE(review): RFECV is not among the imports in the notebook's import cell;
# re-enabling this needs `from sklearn.feature_selection import RFECV`.
# NOTE(review): the CSV exports at the bottom write X_train/X_val as they are;
# nothing here subsets them with the fitted selector -- confirm intent.
# NOTE(review): the x-axis built in num_features_per_iter assumes scores are
# ordered from most to fewest features -- verify against RFECV.cv_results_.

# Base model
#modelo = LGBMClassifier(
    #n_estimators=500,
   # max_depth=-1,
   # learning_rate=0.05,
   # num_leaves=31,
  #  subsample=0.8,
  #  colsample_bytree=0.8,
 #   random_state=42,
 #   n_jobs=-1
#)

#step_size = 5
#n_features_initial = X_train.shape[1]  # total number of features

#rfecv = RFECV(
 #   estimator=modelo,
 #  step=step_size,
 #   cv=StratifiedKFold(3),
 #   scoring='average_precision',
 #   min_features_to_select=50,
 #   n_jobs=-1,
 #   verbose=1
#)

#rfecv.fit(X_train, y_train)
#print("Número óptimo de variables:", rfecv.n_features_)

#num_iters = len(rfecv.cv_results_['mean_test_score'])
#num_features_per_iter = [
 #   n_features_initial - (i * step_size) for i in range(num_iters)
#]

#plt.figure(figsize=(10, 5))
#plt.plot(
   # num_features_per_iter,
  #  rfecv.cv_results_['mean_test_score'],
 #   marker='o'
#)
#plt.xlabel("Número de variables")
#plt.ylabel("PR AUC")
#plt.title("RFECV - LightGBM (step = 5)")
#plt.grid(alpha=0.3)
#plt.show()

#X_train.to_csv("train_reduced_rfecv.csv", index=False)
#X_val.to_csv("val_reduced_rfecv.csv", index=False)
#y_train.to_csv("y_train_reduced_rfecv.csv",index=False)
#y_val.to_csv("y_val_reduced_rfecv.csv",index=False)


**Aplicación de la técnica de SMOTE**

In [None]:
# Disabled experiment: oversample the minority class of the training split
# with SMOTE, processed in fixed-size batches. Everything in this cell is
# commented out.
# NOTE(review): np.random.permutation is called without a seed, so the batch
# composition (and therefore the resampled data) would differ between runs.
# NOTE(review): SMOTE is fit independently on each batch, so synthetic samples
# are interpolated only between neighbours in the same batch -- confirm this
# is acceptable.

#smote = SMOTE(sampling_strategy= 0.4 , random_state=42)
#batch_size = 50000

# Shuffle X_train and y_train with the same permutation so rows stay aligned
#idx = np.random.permutation(len(X_train))
#X_train = X_train.iloc[idx].reset_index(drop=True)
#y_train = y_train.iloc[idx].reset_index(drop=True)

#X_res_all, y_res_all = [], []

# Apply SMOTE batch by batch
#for i in range(0, len(X_train), batch_size):
 #   X_b = X_train.iloc[i:i+batch_size]
 #   y_b = y_train.iloc[i:i+batch_size]

  #  if y_b.sum() > 0: 
   #     X_r, y_r = smote.fit_resample(X_b, y_b)
   # else:
    #    X_r, y_r = X_b, y_b  

   # X_res_all.append(X_r)
   # y_res_all.append(y_r)

#X_train_smote = pd.concat(X_res_all, ignore_index=True)
#y_train_smote = pd.concat(y_res_all, ignore_index=True)

#print("Shape final:", X_train_smote.shape)
#print("Distribución de clases después de SMOTE:\n", y_train_smote.value_counts(normalize=True))

#X_train = X_train_smote
#y_train = y_train_smote

# Export the datasets
#X_train.to_csv("train_smote.csv", index=False)
#y_train.to_csv("y_train_smote.csv", index=False)

#X_val.to_csv("val_smote.csv", index=False)
#y_val.to_csv("y_val_smote.csv", index=False)




In [None]:
# Supervised models to compare, keyed by the display name used in the results.
# Each estimator is fitted in place, so after the loop `modelos[nombre]` holds
# the trained model for that name.
modelos = {

    "XGBoost": XGBClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    ),

    "LightGBM" : LGBMClassifier (
        n_estimators = 500,
        max_depth = -1,
        learning_rate = 0.05,
        num_leaves = 31,
        subsample = 0.8,
        colsample_bytree = 0.8,
        random_state= 42,
        n_jobs = -1
    )
}

resultados = []
# Decision cut-off applied to the positive-class score instead of the
# estimators' default predict() rule.
threshold =  0.15

# np.ravel flattens the target to 1-D once: y_train comes from read_csv, so it
# is presumably a single-column DataFrame, which scikit-learn estimators would
# otherwise convert with a DataConversionWarning on every fit.
y_train_1d = np.ravel(y_train)

for nombre, modelo in modelos.items():
    print(f"\nEntrenando {nombre}...")
    modelo.fit(X_train, y_train_1d)

    # Continuous score for the positive class. The previous extra call to
    # modelo.predict(X_val) was removed: its result was overwritten by the
    # thresholded labels below, so it only cost a second inference pass.
    if hasattr(modelo, "predict_proba"):
        y_scores = modelo.predict_proba(X_val)[:, 1]
    else:
        # NOTE(review): decision_function returns a margin, not a
        # probability, so `threshold` would need a different scale here.
        y_scores = modelo.decision_function(X_val)

    # Threshold-dependent metrics use the custom cut-off; PR AUC and ROC AUC
    # are computed from the raw scores.
    y_pred = (y_scores>=threshold).astype(int)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred, zero_division=0)
    rec = recall_score(y_val, y_pred, zero_division=0)
    f1 = f1_score(y_val, y_pred, zero_division=0)
    pr_auc = average_precision_score(y_val, y_scores)
    roc_auc = roc_auc_score(y_val,y_scores)
    cm = confusion_matrix(y_val, y_pred)

    resultados.append({
        "Modelo": nombre,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "PR AUC": pr_auc,
        "ROC AUC": roc_auc,
        "Matriz_Confusion": cm
    })

# Comparison table (the confusion matrices are printed separately below).
df_resultados = pd.DataFrame(resultados)
print("\nResultados comparativos:")
print(df_resultados[["Modelo", "Accuracy", "Precision", "Recall", "F1", "PR AUC","ROC AUC"]])

# One confusion matrix per model, at the custom threshold.
for r in resultados:
    print(f"\nMatriz de confusión para {r['Modelo']}:")
    print(r["Matriz_Confusion"])




**Top importancias para Random Forest**

In [None]:
# Feature importances of the fitted Random Forest.
# The model is looked up by name in `modelos` rather than read from the
# training loop's leftover `modelo` variable: that variable points at the last
# entry of the dict (LightGBM), so the chart labelled "Random Forest" was
# showing another model's importances.
modelo_rf = modelos["RandomForest"]
importancias = modelo_rf.feature_importances_
nombres_caracteristicas = X_train.columns

df_importancias = pd.DataFrame({
    "Característica": nombres_caracteristicas,
    "Importancia": importancias
}).sort_values(by="Importancia", ascending=False)

# Ten highest-ranked features
df_top10 = df_importancias.head(10)

print("\nTop 10 características más importantes:")
print(df_top10)

plt.figure(figsize=(10,6))
plt.barh(df_top10["Característica"], df_top10["Importancia"], color="#1f77b4")
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.title("Top 10 - Importancia de características (Random Forest)", fontsize=16)
plt.xlabel("Importancia", fontsize=12)
plt.ylabel("Característica", fontsize=12)
plt.tight_layout()
plt.show()