In [None]:
# Instalar las librerías necesarias
!pip install pandas numpy scikit-learn imbalanced-learn openpyxl




In [None]:
import io

from google.colab import files
import pandas as pd

# Upload the workbook through the Colab file picker.
# `uploaded` maps each uploaded file name to its raw bytes.
uploaded = files.upload()

# Read the Excel workbook into a pandas DataFrame.
# Use the bytes of the file that was actually uploaded rather than a hard-coded
# name: the user may pick a differently named file, and Colab may rename the
# saved copy when a file with the same name already exists.
if uploaded:
    uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
    data = pd.read_excel(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a workbook that is
    # already present in the working directory.
    data = pd.read_excel('churn_data.xlsx')


Saving churn_data.xlsx to churn_data.xlsx


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

def preprocess_data(data):
    """Turn the raw churn table into scaled train/test feature matrices.

    Steps, in order:
      1. Separate the 'Churn' target (encoded as 1 for 'Yes', 0 otherwise)
         from the remaining columns and one-hot encode the features.
      2. Split into train (80%) and test (20%) sets, stratified on the target
         whenever the class counts allow it.
      3. Oversample the minority class with SMOTE on the TRAINING set only.
      4. Standardize features with statistics learned from the training set.

    The split happens before SMOTE on purpose: oversampling first would put
    synthetic points interpolated from training rows into the test set and
    make every reported metric optimistic.

    Args:
        data: DataFrame containing a 'Churn' column plus feature columns.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the X values are
        standardized NumPy arrays and the y values are 0/1 label Series.
    """
    from math import ceil

    test_fraction = 0.2

    # The 'Churn' column is the target; everything else is a feature.
    features = pd.get_dummies(data.drop('Churn', axis=1))
    target = (data['Churn'] == 'Yes').astype(int)

    # Stratify only when scikit-learn can honour it: at least two rows per
    # class and enough rows on each side of the split to hold every class.
    class_counts = target.value_counts()
    n_classes = len(class_counts)
    n_test = ceil(test_fraction * len(target))
    n_train = len(target) - n_test
    can_stratify = (
        n_classes > 1
        and class_counts.min() >= 2
        and n_test >= n_classes
        and n_train >= n_classes
    )

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_fraction,
        random_state=42,
        stratify=target if can_stratify else None,
    )

    # Oversample the minority class, using training rows only.
    # SMOTE needs both classes present and at least one neighbour besides the
    # sample itself, so it is skipped when the training set cannot support it.
    train_counts = y_train.value_counts()
    minority_count = int(train_counts.min())
    if len(train_counts) > 1 and minority_count >= 2:
        k_neighbors = min(5, minority_count - 1)
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train, y_train = smote.fit_resample(X_train, y_train)

    # Standardize: fit on the training set, apply the same transform to test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

def build_and_train_model(X_train, y_train):
    """Tune a random forest and a gradient boosting classifier; keep the better one.

    Each candidate is tuned with a 3-fold grid search scored by ROC AUC.
    The random forest is returned only when its best cross-validated score is
    strictly higher; otherwise the gradient boosting model is returned.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The best estimator found by the winning grid search.
    """
    def run_search(estimator, grid):
        # Same search settings for both candidates.
        search = GridSearchCV(
            estimator=estimator,
            param_grid=grid,
            cv=3,
            scoring='roc_auc',
            n_jobs=-1,
        )
        search.fit(X_train, y_train)
        return search

    # Random forest candidate.
    forest_search = run_search(
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
        },
    )

    # Gradient boosting candidate.
    boosting_search = run_search(
        GradientBoostingClassifier(random_state=42),
        {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        },
    )

    # Pick by best validation score; ties go to gradient boosting.
    forest_wins = forest_search.best_score_ > boosting_search.best_score_
    winner = forest_search if forest_wins else boosting_search
    return winner.best_estimator_


In [None]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

def evaluate_model(model, X_test, y_test, X_train, y_train):
    """Print hold-out and cross-validation metrics for a fitted classifier.

    Prints, in order: the classification report on the test set, the test
    AUC-ROC computed from the probability of class 1, the per-fold accuracy of
    a 3-fold stratified cross-validation on the training set, and the mean and
    standard deviation of those fold accuracies.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test feature matrix.
        y_test: Test labels.
        X_train: Training feature matrix used for cross-validation.
        y_train: Training labels used for cross-validation.

    Returns:
        None. All results are printed.
    """
    # Hard predictions drive the classification report.
    predicted_labels = model.predict(X_test)
    print(classification_report(y_test, predicted_labels))

    # AUC-ROC is computed from the predicted probability of class 1.
    positive_scores = model.predict_proba(X_test)[:, 1]
    print(f"AUC-ROC: {roc_auc_score(y_test, positive_scores):.2f}")

    # Cross-validate with accuracy (not AUC-ROC) over 3 stratified folds.
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=StratifiedKFold(n_splits=3),
        scoring='accuracy',
    )
    fold_mean = np.mean(fold_scores)
    fold_std = np.std(fold_scores)
    print(f'Cross-Validation Accuracy scores: {fold_scores}')
    print(f'Mean Accuracy: {fold_mean:.2f} ± {fold_std:.2f}')


In [None]:
# Load the data from the Excel workbook in the working directory
data = pd.read_excel('churn_data.xlsx')

# Preprocess the data into train/test features and labels
X_train, X_test, y_train, y_test = preprocess_data(data)

# Build and train the model (returns the best tuned estimator)
model = build_and_train_model(X_train, y_train)

# Evaluate the model; metrics are printed below the cell
evaluate_model(model, X_test, y_test, X_train, y_train)


              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

AUC-ROC: 1.00
Cross-Validation Accuracy scores: [0.33333333 0.66666667 0.33333333]
Mean Accuracy: 0.44 ± 0.16
