In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import numpy as np

def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves for an estimator.

    Opens a new matplotlib figure, runs scikit-learn's ``learning_curve``
    on ``(X, y)`` and plots, for each training-set size, the mean score as
    a line with a translucent band of one standard deviation around it.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Title shown on the figure.
        X: Feature data passed to ``learning_curve``.
        y: Target data passed to ``learning_curve``.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can call ``show()``
        or ``savefig()`` on the current figure.
    """
    from sklearn.model_selection import learning_curve

    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )
    plt.grid()

    # One entry per curve: (mean, std, colour, legend label). The statistics
    # are taken along axis 1, i.e. one value per training-set size.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "g", "Cross-validation score"),
    ]

    # Bands first, then lines, in the same order for both curves.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# Needed below to obtain the learning-curve scores numerically; the plotting
# helper only draws them and does not expose the score arrays.
from sklearn.model_selection import learning_curve

# --- Data loading and split -------------------------------------------------
data = pd.read_csv('HCV-Egy-Data.csv')

# Every column except the staging column is used as a feature.
X = data.drop(columns=['Baselinehistological staging'])
y = data['Baselinehistological staging']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Hyperparameter search --------------------------------------------------
dt_model = DecisionTreeClassifier(random_state=42)
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

dt_grid = GridSearchCV(estimator=dt_model, param_grid=dt_param_grid, cv=5, scoring='accuracy')
dt_grid.fit(X_train, y_train)

best_dt = dt_grid.best_estimator_
y_pred_dt = best_dt.predict(X_test)

# --- Evaluation on the held-out test split ----------------------------------
display(HTML("<h2>Decision Tree Classifier</h2>"))
display(HTML("<h3>Mejores Hiperparámetros</h3>"))
display(pd.DataFrame({'Mejores Hiperparámetros': [dt_grid.best_params_]}))

# Pass the label list explicitly so the matrix always has one row/column per
# class in ``y``; otherwise a class absent from the test split would shrink
# the matrix and make it disagree with the index/column labels.
class_labels = sorted(y.unique())
display(HTML("<h3>Matriz de Confusión</h3>"))
display(pd.DataFrame(confusion_matrix(y_test, y_pred_dt, labels=class_labels),
                     index=[f"Clase {i}" for i in class_labels],
                     columns=[f"Predicción {i}" for i in class_labels]))

report_dt = classification_report(y_test, y_pred_dt, output_dict=True)
display(HTML("<h3>Reporte de Clasificación</h3>"))
display(pd.DataFrame(report_dt).transpose())

# --- Learning curve ---------------------------------------------------------
plot_learning_curve(best_dt, "Curva de Aprendizaje: Decision Tree", X_train, y_train, cv=5)
plt.show()

# The score arrays inside plot_learning_curve are local to that function, so
# compute them here with the same estimator, data and cv setting.
_, train_scores, test_scores = learning_curve(best_dt, X_train, y_train, cv=5)

# Last row = scores at the largest training-set size.
train_mean = np.mean(train_scores[-1])
test_mean = np.mean(test_scores[-1])

if train_mean > test_mean + 0.1:
    diagnosis = "El modelo funciona mejor en entrenamiento que en validación."
elif test_mean < 0.6:
    diagnosis = "El modelo no generaliza bien."
else:
    diagnosis = "El modelo presenta un buen equilibrio entre entrenamiento y validación."
print(f"Diagnóstico: {diagnosis}")

# --- Conclusions ------------------------------------------------------------
# Report measured values instead of placeholders / fixed statements.
test_accuracy = best_dt.score(X_test, y_test)

print("\nConclusiones:")
print(f"1. El modelo de Decision Tree tiene una precisión de {test_accuracy:.2%} en los datos de prueba.")
# best_params_ holds only the values selected by the grid search.
print("2. Los hiperparámetros óptimos encontrados fueron:", dt_grid.best_params_)
print(f"3. Según la curva de aprendizaje: {diagnosis}")