In [1]:
from datetime import datetime, timedelta

print(f"Installing libraries... {datetime.now()}")
!pip install openpyxl --quiet
!pip install tensorflow scikit-learn --quiet
!pip install imbalanced-learn --quiet
!pip install xgboost --quiet

print(f"Loading require libraries... {datetime.now()}")
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, BatchNormalization, Input, Flatten, GlobalAveragePooling1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')
print(f"Libraries Loaded!... {datetime.now()}")

Installing libraries... 2025-04-01 05:09:18.697089
Loading require libraries... 2025-04-01 05:09:24.421373
Libraries Loaded!... 2025-04-01 05:09:28.256539


In [2]:
def load_data(path='/content/drive/MyDrive/Colab Notebooks/recommender_data.xlsx'):
  """Read the recommender dataset from an Excel workbook.

  :param path: Location of the workbook. Defaults to the Google Drive path
      this notebook was written against, so ``load_data()`` keeps working
      unchanged; pass another path to run outside that environment.
  :return: DataFrame with the workbook contents, exactly as parsed by
      ``pd.read_excel``.
  """
  print(f"loading data... {datetime.now()}")
  df_orign = pd.read_excel(path)
  print(f"data shape: {df_orign.shape}")
  print(f"data loaded... {datetime.now()}")
  return df_orign

def remove_outliers(df, upper_quantile=0.99, min_year=2011):
  """Cap extreme numeric values and floor the account-opening year.

  Despite the name, no rows are removed: values above the chosen quantile
  are clipped down to it, and opening dates earlier than ``min_year`` are
  moved into ``min_year`` (month and day preserved).

  :param df: DataFrame with the columns ``income_group``, ``age_group``,
      ``cantidad_transacciones``, ``monto_transaccion`` and a datetime-like
      ``fecha_apertura``. It is not modified.
  :param upper_quantile: Quantile used as the upper cap for each numeric column.
  :param min_year: Earliest year allowed in ``fecha_apertura``.
  :return: A capped copy of ``df``.
  """
  df_no_out = df.copy()
  print(f"removing outliers started... {datetime.now()}")

  # Clip each numeric column at its own upper quantile.
  capped_columns = ['income_group', 'age_group', 'cantidad_transacciones', 'monto_transaccion']
  for col in capped_columns:
    cap = df_no_out[col].quantile(upper_quantile)
    df_no_out[col] = df_no_out[col].clip(upper=cap)

  def _floor_year(ts):
    # Missing dates and dates already inside the allowed range pass through.
    if pd.isna(ts) or ts.year >= min_year:
      return ts
    try:
      return ts.replace(year=min_year)
    except ValueError:
      # Feb 29 has no counterpart when min_year is not a leap year; the
      # plain replace() used to raise here. Fall back to Feb 28.
      return ts.replace(year=min_year, day=28)

  df_no_out['fecha_apertura'] = df_no_out['fecha_apertura'].apply(_floor_year)

  print(f"removing outliers finished... {datetime.now()}")

  return df_no_out

def print_max_min(df, message:str = None):
  """Print the value range of the key numeric columns and of the opening year.

  :param df: DataFrame holding ``income_group``, ``age_group``,
      ``cantidad_transacciones``, ``monto_transaccion`` and ``fecha_apertura``.
  :param message: Optional label echoed (with a timestamp) before the report.
  :return: None; everything is written to stdout.
  """
  print(f"print_max_min started... {datetime.now()}")
  if message is not None:
    print(f"{message}... {datetime.now()}")

  # Numeric columns: smallest and largest value of each.
  print("\nColumn Min & Max values after correction:")
  numeric_columns = ('income_group', 'age_group', 'cantidad_transacciones', 'monto_transaccion')
  for name in numeric_columns:
    series = df[name]
    lowest, highest = series.min(), series.max()
    print(f"{name}: min={lowest}, max={highest}")

  # Opening date: only the year of the earliest and latest date is reported.
  opened = df['fecha_apertura']
  first_year = opened.min().year
  last_year = opened.max().year
  print(f"fecha_apertura: min_year={first_year}, max_year={last_year}")

  print(f"print_max_min finished... {datetime.now()}")


def prepare_data(df_orign):
  """Encode, impute, scale and sequence the raw product records.

  Steps, in order:
    1. Map ``producto``, ``education_level`` and ``aplicacion`` to integer
       codes using fixed dictionaries (values not in a dictionary become NaN).
    2. Treat 0 as missing in the income/age/transaction columns and fill
       missing values with the column median.
    3. Derive day counts relative to the fixed evaluation date 2025-02-28.
    4. Standardize the numeric features.
    5. Number each customer's products chronologically (``purchase_sequence``).
    6. Drop customers that own a single product, then drop the raw and
       intermediate columns.

  NOTE(review): medians and scalers are computed on the full frame, i.e.
  before the train/test split that the notebook performs afterwards, so test
  rows influence the imputation and scaling statistics.

  :param df_orign: Raw DataFrame; it is copied, never modified.
  :return: DataFrame with ``customer_id``, the three ``*_encoded`` columns,
      the six ``*_scaled`` columns and ``purchase_sequence``.
  """
  print(f"preparing data... {datetime.now()}")
  df = df_orign.copy()

  # Product names may carry stray whitespace; normalize before mapping.
  df['producto'] = df['producto'].str.strip()

  # Hardcoded mappings for categorical columns
  producto_mapping = {
      "Accidentes Personales": 0,
      "Ahorro Infantil": 1,
      "Ahorro Internacional": 2,
      "Ahorro Libreta": 3,
      "Ahorro TDD": 4,
      "BLACK": 5,
      "Banesco Asistencia": 6,
      "CLASICA": 7,
      "Corriente Transaccional": 8,
      "Depositos": 9,
      "Depositos Internacional": 10,
      "Desempleo/Incapacidad": 11,
      "GOLD": 12,
      "Hipotecario": 13,
      "Hospitalizacion": 14,
      "INFINITE": 15,
      "Nomina Electronica": 16,
      "PLATINUM": 17,
      "Personal": 18,
      "Proteccion ATM": 19,
      "SUPERCASHBACK": 20,
      "Seguro Hogar": 21,
      "Seguro Vida": 22,
      "Seguro por Cancer": 23,
      "Ultimos Gastos": 24,
      "Vehiculo": 25
  }

  education_mapping = {
      "Doctorate": 0,
      "Master": 1,
      "Secondary": 2,
      "Undergraduate": 3
  }

  aplicacion_mapping = {
      "CERTIFICADOS": 0,
      "CUENTAS": 1,
      "INSURANCE": 2,
      "PRESTAMOS": 3,
      "TDC": 4
  }

  df['producto_encoded'] = df['producto'].map(producto_mapping)
  df['education_level_encoded'] = df['education_level'].map(education_mapping)
  df['aplicacion_encoded'] = df['aplicacion'].map(aplicacion_mapping)

  # Zero means "unknown" for income and age. Assign the result back instead
  # of calling replace(..., inplace=True) on df[col]: that chained in-place
  # form is a silent no-op under pandas Copy-on-Write.
  df['income_group'] = df['income_group'].replace(0, np.nan)
  df['age_group'] = df['age_group'].replace(0, np.nan)

  df['income_group_filled'] = df['income_group'].fillna(df['income_group'].median())
  df['age_group_filled'] = df['age_group'].fillna(df['age_group'].median())

  # Transaction columns: 0 and empty strings also count as missing.
  transaction_columns = ['fecha_transaccion', 'cantidad_transacciones', 'monto_transaccion']

  # Fixed reference date for every "days since" feature.
  date_evaluation = datetime(2025, 2, 28)

  df[transaction_columns] = df[transaction_columns].replace(0, np.nan)
  df[transaction_columns] = df[transaction_columns].replace("", np.nan)

  df['cantidad_transacciones_filled'] = df['cantidad_transacciones'].fillna(df['cantidad_transacciones'].median())
  df['monto_transaccion_filled'] = df['monto_transaccion'].fillna(df['monto_transaccion'].median())

  # Ensure 'fecha_transaccion' is in datetime format (unparseable -> NaT)
  df["fecha_transaccion"] = pd.to_datetime(df["fecha_transaccion"], errors='coerce')

  # Days since the last transaction; NaT becomes NaN and is median-filled.
  df["days_since_last_transaction"] = (date_evaluation - df["fecha_transaccion"]).dt.days
  df['days_since_last_transaction_filled'] = df['days_since_last_transaction'].fillna(df['days_since_last_transaction'].median())

  # Ensure 'fecha_apertura' is in datetime format (unparseable -> NaT)
  df["fecha_apertura"] = pd.to_datetime(df["fecha_apertura"], errors='coerce')

  # Days since the product was opened. Median-fill like the feature above so
  # an unparseable opening date cannot leak NaN into the scaled output.
  days_open = (date_evaluation - df["fecha_apertura"]).dt.days
  df["days_since_product_opened"] = days_open.fillna(days_open.median())

  # Normalize numerical features (target column <- source column). The
  # scaler is re-fit for every column, so one instance can be reused.
  scaler = StandardScaler()
  scaled_from = {
      "income_group_scaled": "income_group_filled",
      "age_group_scaled": "age_group_filled",
      "cantidad_transacciones_scaled": "cantidad_transacciones_filled",
      "monto_transaccion_scaled": "monto_transaccion_filled",
      "days_since_last_transaction_scaled": "days_since_last_transaction_filled",
      "days_since_product_opened_scaled": "days_since_product_opened",
  }
  for target, source in scaled_from.items():
    df[target] = scaler.fit_transform(df[[source]]).ravel()

  # Sort by customer and fecha_apertura to ensure chronological order
  df = df.sort_values(by=['customer_id', 'fecha_apertura'])

  # 1-based position of each product within its customer's history
  df['purchase_sequence'] = df.groupby('customer_id').cumcount() + 1

  drop_columns = [
      # "customer_id",
      "income_group","age_group","education_level","aplicacion","producto","fecha_apertura","fecha_transaccion","cantidad_transacciones","monto_transaccion",
      "income_group_filled","age_group_filled","cantidad_transacciones_filled","monto_transaccion_filled","days_since_last_transaction","days_since_last_transaction_filled","days_since_product_opened"
  ]

  # Customers with a single product have no history to learn from.
  customer_product_counts = df['customer_id'].value_counts()
  customers_to_exclude = customer_product_counts[customer_product_counts == 1].index

  # Filter and re-index without in-place operations on a filtered slice.
  df = df[~df['customer_id'].isin(customers_to_exclude)].reset_index(drop=True)

  # Print confirmation
  print(f"Excluded {len(customers_to_exclude)} customers with only one product.")

  df = df.drop(columns=drop_columns)

  return df


def prepare_train_test_dataset(df):
  """Split customers 70/30 and build (history, next product) samples.

  The split is done on customer ids, so all rows of a customer land on the
  same side. For each customer the rows are ordered by ``purchase_sequence``;
  every row except the last becomes the input history (all columns except
  ``customer_id``) and the last row's ``producto_encoded`` is the target.

  :param df: Prepared frame containing at least ``customer_id``,
      ``purchase_sequence`` and ``producto_encoded``. It is not modified.
  :return: ``(train_df, test_df, x_train, y_train, x_test, y_test)`` where the
      frames are sorted by customer and sequence, ``x_*`` are lists of
      per-customer histories (lists of feature rows) and ``y_*`` are lists of
      target labels, in ascending ``customer_id`` order.
  """
  print(f"preparing training and test data... {datetime.now()}")
  # Get unique customer IDs
  unique_customers = df['customer_id'].unique()

  # Split unique customers into training and testing sets
  train_customers, test_customers = train_test_split(unique_customers, test_size=0.3, random_state=42)

  # Sort while selecting: sorting a filtered slice in place mutates a
  # potential view of df and triggers SettingWithCopy behaviour.
  sort_keys = ['customer_id', 'purchase_sequence']
  train_df = df[df['customer_id'].isin(train_customers)].sort_values(by=sort_keys)
  test_df = df[df['customer_id'].isin(test_customers)].sort_values(by=sort_keys)

  def _history_and_target(frame):
    # One pass over the groups yields both the history and its target.
    histories, targets = [], []
    for _, group in frame.groupby("customer_id"):
      histories.append(group.drop(columns=['customer_id']).iloc[:-1].values.tolist())
      targets.append(group.iloc[-1]['producto_encoded'])
    return histories, targets

  x_train, y_train = _history_and_target(train_df)
  x_test, y_test = _history_and_target(test_df)
  return train_df, test_df, x_train, y_train, x_test, y_test

def print_data_Validations(df, train_df, test_df, x_train, y_train, x_test, y_test):
  """Print row/customer counts so the split can be sanity-checked by eye.

  For each side of the split, the "total" figure is the number of history
  rows plus the number of targets; it should equal the row count of the
  corresponding frame.
  """
  def _customer_count(frame):
    return len(frame['customer_id'].unique())

  def _rows_accounted_for(histories, targets):
    return sum(len(history) for history in histories) + len(targets)

  print(f"preparing validations... {datetime.now()}")

  n_train_rows = len(train_df)
  n_test_rows = len(test_df)
  print(f"original {len(df)}, original_unique_customers: {_customer_count(df)}, train_df: {n_train_rows}, test_df: {n_test_rows}, total: {n_train_rows + n_test_rows}")
  print(f"train_df {n_train_rows}, train_unique_customers: {_customer_count(train_df)}, x_train: {len(x_train)}, y_train: {len(y_train)}, total: {_rows_accounted_for(x_train, y_train)}")
  print(f"test_df {n_test_rows}, test_unique_customers: {_customer_count(test_df)}, x_test: {len(x_test)}, y_test: {len(y_test)}, total: {_rows_accounted_for(x_test, y_test)}")

# Run the data pipeline end to end. The names bound here (df, train_df,
# test_df, x_train, y_train, x_test, y_test) are consumed by the model cells.

# Load the raw workbook and report the value ranges before any cleaning.
df_orig = load_data()
print_max_min(df_orig, "Original Dataset!")
# Cap the numeric columns and floor the opening year, then report the ranges again.
df_orig_no_outliers = remove_outliers(df_orig)
print_max_min(df_orig_no_outliers, "Dataset with Outliers Removed!")
# Encode, impute and scale the capped data.
df = prepare_data(df_orig_no_outliers)
# Alternative kept for comparison (skips the capping step):
# df = prepare_data(df_orig)
# Customer-level split plus per-customer (history, next product) samples.
train_df, test_df, x_train, y_train, x_test, y_test = prepare_train_test_dataset(df)
# Print counts that let the split be checked by eye.
print_data_Validations(df, train_df, test_df, x_train, y_train, x_test, y_test)
print(f"Data Process Finished!... {datetime.now()}")

loading data... 2025-04-01 05:09:28.276780
data shape: (370793, 10)
data loaded... 2025-04-01 05:10:23.303597
print_max_min started... 2025-04-01 05:10:23.303775
Original Dataset!... 2025-04-01 05:10:23.303786

Column Min & Max values after correction:
income_group: min=0, max=1111111110000
age_group: min=20, max=130
cantidad_transacciones: min=1.0, max=16621.0
monto_transaccion: min=0.01, max=2325270500.0
fecha_apertura: min_year=1987, max_year=2025
print_max_min finished... 2025-04-01 05:10:23.316893
removing outliers started... 2025-04-01 05:10:23.328484
removing outliers finished... 2025-04-01 05:10:24.400048
print_max_min started... 2025-04-01 05:10:24.400390
Dataset with Outliers Removed!... 2025-04-01 05:10:24.400407

Column Min & Max values after correction:
income_group: min=0, max=800000
age_group: min=20, max=70
cantidad_transacciones: min=1.0, max=528.7199999999721
monto_transaccion: min=0.01, max=8275667.881199925
fecha_apertura: min_year=2011, max_year=2025
print_max_min 

In [3]:
#model #1

# Fix the seed so weight initialisation and batch shuffling are repeatable.
keras.utils.set_random_seed(42)

# Pad sequences to ONE shared length taken from the training set. Padding
# train and test independently gives them different time dimensions whenever
# their longest histories differ, which does not match the fixed input_shape
# below. Test histories longer than max_seq_len keep their most recent steps
# (pad_sequences truncates from the front by default).
max_seq_len = max(len(sequence) for sequence in x_train)
x_train_padded = pad_sequences(x_train, maxlen=max_seq_len, padding="post", dtype="float32")
x_test_padded = pad_sequences(x_test, maxlen=max_seq_len, padding="post", dtype="float32")

# Convert targets to numpy arrays
y_train = np.array(y_train, dtype=np.int32)
y_test = np.array(y_test, dtype=np.int32)

# The softmax needs one unit per label index 0..max, not one per distinct
# label present: with sparse labels a missing product id would otherwise
# leave the highest label(s) without an output unit.
num_classes = int(df["producto_encoded"].max()) + 1

# Build Neural Network Model: two stacked LSTMs followed by a dense head.
# NOTE(review): histories are post-padded with zeros and no Masking layer is
# used, so the LSTM also processes the padding steps.
model = keras.Sequential([
    LSTM(64, return_sequences=True, input_shape=(x_train_padded.shape[1], x_train_padded.shape[2])),
    LSTM(32),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Compile Model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train Model
# NOTE(review): the test split doubles as validation data here, so the
# val_* metrics are not an independent estimate once used for tuning.
history = model.fit(x_train_padded, y_train, epochs=20, batch_size=32, validation_data=(x_test_padded, y_test))


Epoch 1/20
[1m1856/1856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 12ms/step - accuracy: 0.3888 - loss: 2.1553 - val_accuracy: 0.4787 - val_loss: 1.7446
Epoch 2/20
[1m1856/1856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.4948 - loss: 1.6900 - val_accuracy: 0.4879 - val_loss: 1.6686
Epoch 3/20
[1m1856/1856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.5111 - loss: 1.6072 - val_accuracy: 0.5087 - val_loss: 1.6099
Epoch 4/20
[1m1856/1856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 10ms/step - accuracy: 0.5189 - loss: 1.5748 - val_accuracy: 0.5111 - val_loss: 1.6069
Epoch 5/20
[1m1856/1856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - accuracy: 0.5291 - loss: 1.5391 - val_accuracy: 0.5189 - val_loss: 1.5591
Epoch 6/20
[1m1856/1856[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.5362 - loss: 1.5081 - val_accuracy: 0.5247 - val_loss: 1.5425
Epoc

<keras.src.callbacks.history.History at 0x79f7807f3350>

In [4]:
#Model #1 Evaluation

# --------------------
# 1. Function to Calculate Top-K Accuracy
# --------------------
def top_k_accuracy(y_true, y_pred, k=5):
    """
    Calculates the fraction of samples whose actual product is among the
    top K predictions.

    :param y_true: Array of true product indices
    :param y_pred: Array of predicted probabilities for each product
        (one row per sample, one column per product index)
    :param k: Number of top predictions to consider; must be at least 1
    :return: Top-K Accuracy score in [0, 1]; 0.0 when there are no samples
    :raises ValueError: if k is smaller than 1
    """
    if k < 1:
        # A slice of [-0:] would select EVERY class and report a perfect score.
        raise ValueError(f"k must be >= 1, got {k}")

    total_samples = len(y_true)
    if total_samples == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct_predictions = sum(
        1
        for true_label, pred_probs in zip(y_true, y_pred)
        if true_label in np.argsort(pred_probs)[-k:]  # indices of the K largest probabilities
    )

    return correct_predictions / total_samples

# --------------------
# 2. Get Model Predictions
# --------------------
y_pred_probs = model.predict(x_test_padded)  # class probabilities for every test sample

# --------------------
# 3. Evaluate the Model Using Top-K Accuracy
# --------------------
# Score the same probability matrix at each cutoff; K=1 is exact-match accuracy.
top_1_acc, top_3_acc, top_5_acc, top_8_acc, top_10_acc = [
    top_k_accuracy(y_test, y_pred_probs, k=cutoff) for cutoff in (1, 3, 5, 8, 10)
]

for cutoff, score in zip(
    (1, 3, 5, 8, 10),
    (top_1_acc, top_3_acc, top_5_acc, top_8_acc, top_10_acc),
):
    print(f"Top-{cutoff} Accuracy: {score:.4f}")

[1m796/796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
Top-1 Accuracy: 0.5435
Top-3 Accuracy: 0.7698
Top-5 Accuracy: 0.8634
Top-8 Accuracy: 0.9370
Top-10 Accuracy: 0.9644


In [5]:
#Model #1 Evaluation - Including Recall and Precision!

# --------------------
# 1. Top-K Accuracy
# --------------------
def top_k_accuracy(y_true, y_pred, k=5):
    """Fraction of samples whose true label is among the k highest-scored classes.

    Note: a function with this name is also defined in the previous
    evaluation cell; running this cell rebinds the name.

    :param y_true: Sequence of true class indices
    :param y_pred: Per-sample score/probability rows, one column per class index
    :param k: Number of top-ranked classes that count as a hit
    :return: Hits divided by the number of samples
    """
    sample_count = len(y_true)
    hits = 0

    for label, scores in zip(y_true, y_pred):
        ranked = np.argsort(scores)[-k:][::-1]  # best k class indices, best first
        hits += 1 if label in ranked else 0

    return hits / sample_count


# --------------------
# 2. Precision@K
# --------------------
def precision_at_k(y_true, y_pred, k=5):
    """Mean Precision@K when every sample has exactly one relevant item.

    A sample contributes 1/k if its true label is among the k highest-scored
    classes (one relevant item out of k shown) and 0 otherwise.

    :param y_true: Sequence of true class indices
    :param y_pred: Per-sample score/probability rows, one column per class index
    :param k: Number of top-ranked classes considered
    :return: Average of the per-sample precision values
    """
    per_sample = [
        (1 / k) if label in np.argsort(scores)[-k:][::-1] else 0
        for label, scores in zip(y_true, y_pred)
    ]
    return np.mean(per_sample)


# --------------------
# 3. Recall@K
# --------------------
def recall_at_k(y_true, y_pred, k=5):
    """Mean Recall@K when every sample has exactly one relevant item.

    A sample contributes 1 if its true label is among the k highest-scored
    classes (the single relevant item was retrieved) and 0 otherwise.

    :param y_true: Sequence of true class indices
    :param y_pred: Per-sample score/probability rows, one column per class index
    :param k: Number of top-ranked classes considered
    :return: Average of the per-sample 0/1 recall values
    """
    retrieved_flags = [
        int(label in np.argsort(scores)[-k:][::-1])
        for label, scores in zip(y_true, y_pred)
    ]
    return np.mean(retrieved_flags)

# Class probabilities for every test sample
y_pred_probs = model.predict(x_test_padded)

# Cutoffs to report
k_values = [1, 3, 5, 8, 10]

# One report line per cutoff: accuracy, precision and recall at that K
for k in k_values:
    acc, prec, rec = [
        metric(y_test, y_pred_probs, k)
        for metric in (top_k_accuracy, precision_at_k, recall_at_k)
    ]

    print(f"K={k} => Accuracy@{k}: {acc:.4f}, Precision@{k}: {prec:.4f}, Recall@{k}: {rec:.4f}")


[1m796/796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
K=1 => Accuracy@1: 0.5435, Precision@1: 0.5435, Recall@1: 0.5435
K=3 => Accuracy@3: 0.7698, Precision@3: 0.2566, Recall@3: 0.7698
K=5 => Accuracy@5: 0.8634, Precision@5: 0.1727, Recall@5: 0.8634
K=8 => Accuracy@8: 0.9370, Precision@8: 0.1171, Recall@8: 0.9370
K=10 => Accuracy@10: 0.9644, Precision@10: 0.0964, Recall@10: 0.9644


In [6]:
#Model #1 -  XGBoost - RandomForest

# NOTE(review): the eight frames in this first section (dfresult ... y_df_test)
# are not referenced again in this cell; the same history/target split is
# rebuilt further down under different names. They are kept unchanged in case
# another cell reads them.
dfresult = train_df.groupby('customer_id',as_index=False)['purchase_sequence'].max()
x_df = train_df.merge(dfresult.set_index('customer_id'), on=['customer_id','purchase_sequence'],how='left',indicator=True).query("_merge == 'left_only'").drop(columns=['_merge'])
last_product = train_df[['customer_id','producto_encoded','purchase_sequence']].merge(dfresult.set_index('customer_id'), on=['customer_id','purchase_sequence'],how='inner')
y_df = x_df[['customer_id']].merge(last_product.set_index('customer_id'), on=['customer_id'],how='left')[['customer_id','producto_encoded','purchase_sequence']]

dfresult_test = test_df.groupby('customer_id',as_index=False)['purchase_sequence'].max()
x_df_test = test_df.merge(dfresult_test.set_index('customer_id'), on=['customer_id','purchase_sequence'],how='left',indicator=True).query("_merge == 'left_only'").drop(columns=['_merge'])
last_product_test = test_df[['customer_id','producto_encoded','purchase_sequence']].merge(dfresult_test.set_index('customer_id'), on=['customer_id','purchase_sequence'],how='inner')
y_df_test = x_df_test[['customer_id']].merge(last_product_test.set_index('customer_id'), on=['customer_id'],how='left')[['customer_id','producto_encoded','purchase_sequence']]

# Per-customer aggregation of the history rows
agg_functions = {
    "producto_encoded": lambda x: list(x),  # Product history as a list
    'education_level_encoded': "max",  # Largest education code in the history
    "aplicacion_encoded": "nunique",  # Number of distinct aplicacion codes
    "income_group_scaled": "max",  # Largest scaled income in the history
    "age_group_scaled": "max",  # Largest scaled age in the history
    "cantidad_transacciones_scaled": ["sum", "mean"],  # Sum & mean of scaled transaction counts
    "monto_transaccion_scaled": ["sum", "mean"],  # Sum & mean of scaled transaction amounts
    "days_since_last_transaction_scaled": "min",  # Most recent transaction
    "days_since_product_opened_scaled": "max",  # Oldest product
    "purchase_sequence": "max",  # Last sequence number in the history
}

# Flat names for the aggregated columns, in the order agg_functions emits them.
apply_columns_name = [
    "customer_id",
    "producto_encoded",
    'education_level_encoded',
    "aplicacion_encoded",
    "income_group_scaled",
    "age_group_scaled",
    "cantidad_transacciones_scaled",
    "cantidad_transacciones_scaled_mean",
    "monto_transaccion_scaled",
    "monto_transaccion_scaled_mean",
    "days_since_last_transaction_scaled",
    "days_since_product_opened_scaled",
    "purchase_sequence"
]

# History = every row except the customer's last one (anti-join on the last sequence number).
train_df_last_sequence = train_df.groupby('customer_id',as_index=False)['purchase_sequence'].max()
x_train_df = train_df.merge(train_df_last_sequence.set_index('customer_id'), on=['customer_id','purchase_sequence'],how='left',indicator=True).query("_merge == 'left_only'").drop(columns=['_merge'])

test_df_last_sequence = test_df.groupby('customer_id',as_index=False)['purchase_sequence'].max()
x_test_df = test_df.merge(test_df_last_sequence.set_index('customer_id'), on=['customer_id','purchase_sequence'],how='left',indicator=True).query("_merge == 'left_only'").drop(columns=['_merge'])

# Apply aggregation. (The former as_index=False argument to .agg() was not an
# aggregation option; reset_index() already turns customer_id into a column.)
x_train_df_agg = x_train_df.groupby("customer_id").agg(agg_functions).reset_index()
x_test_df_agg  = x_test_df.groupby("customer_id").agg(agg_functions).reset_index()

x_train_df_agg.columns = apply_columns_name
x_test_df_agg.columns = apply_columns_name

# Target = the customer's last row.
y_train_df = train_df[['customer_id','producto_encoded','purchase_sequence']].merge(train_df_last_sequence.set_index('customer_id'), on=['customer_id','purchase_sequence'],how='inner')
y_test_df  = test_df[['customer_id','producto_encoded','purchase_sequence']].merge(test_df_last_sequence.set_index('customer_id'), on=['customer_id','purchase_sequence'],how='inner')

# Multi-hot encode the product history: explode the list, one-hot, then take
# the per-customer max so each product column says "owned at least once".
x_train_exploded_df = x_train_df_agg.explode('producto_encoded').reset_index(drop=True)
x_train_one_hot_encoded_df = pd.get_dummies(x_train_exploded_df, columns=['producto_encoded'], prefix='product', drop_first=True)
x_train_final_df = x_train_one_hot_encoded_df.groupby('customer_id').max().reset_index()

# Encode the test histories WITHOUT drop_first and then project onto the
# training columns. Encoding both sides independently with drop_first=True
# drops whichever product happens to be first on each side and yields
# different feature sets whenever the two sides contain different products.
x_test_exploded_df = x_test_df_agg.explode('producto_encoded').reset_index(drop=True)
x_test_one_hot_encoded_df = pd.get_dummies(x_test_exploded_df, columns=['producto_encoded'], prefix='product')
x_test_final_df = (
    x_test_one_hot_encoded_df.groupby('customer_id').max().reset_index()
    .reindex(columns=x_train_final_df.columns, fill_value=0)
)

# Align targets to the feature rows by customer_id instead of relying on both
# frames happening to share the same row order.
X = x_train_final_df.drop(columns=['customer_id'])
y = y_train_df.set_index('customer_id').loc[x_train_final_df['customer_id'], 'producto_encoded'].reset_index(drop=True)
X_test = x_test_final_df.drop(columns=['customer_id'])
# NOTE(review): this rebinds y_test, which the LSTM cells use for their own
# per-customer targets; re-running those cells after this one uses these values.
y_test = y_test_df.set_index('customer_id').loc[x_test_final_df['customer_id'], 'producto_encoded'].reset_index(drop=True)

# NOTE(review): the XGBoost documentation should be checked for whether
# XGBClassifier requires labels to be exactly 0..n_classes-1 — TODO confirm
# that every product id occurs as a training target.
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, enable_categorical=True)
xgb_model.fit(X,y)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)


def top_k_label_accuracy(clf, features, labels, k=5):
    """Share of samples whose true label is among the clf's k most probable classes.

    predict_proba columns follow ``clf.classes_``; column positions are
    translated back to labels before comparing, so the result stays correct
    even when some label never occurs in the training targets.
    """
    proba = clf.predict_proba(features)
    top_k_columns = np.argsort(proba, axis=1)[:, -k:]  # positions of the k largest probabilities
    top_k_labels = np.asarray(clf.classes_)[top_k_columns]
    true_labels = np.asarray(labels).reshape(-1, 1)
    return float(np.mean((top_k_labels == true_labels).any(axis=1)))


# XGBoost: top-5 hit rate and exact-match accuracy
top_5_accuracy = top_k_label_accuracy(xgb_model, X_test, y_test, k=5)
print(f"XGBoost Top-5 Accuracy: {top_5_accuracy:.4f}")

xgb_accuracy = xgb_model.score(X_test, y_test)
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")

# Random Forest: top-5 hit rate and exact-match accuracy
top_5_accuracy = top_k_label_accuracy(rf_model, X_test, y_test, k=5)
print(f"Random Forest Top-5 Accuracy: {top_5_accuracy:.4f}")

rf_accuracy = rf_model.score(X_test, y_test)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

XGBoost Top-5 Accuracy: 0.8861
XGBoost Accuracy: 0.5600
Random Forest Top-5 Accuracy: 0.8567
Random Forest Accuracy: 0.5439
