**Objetivos:** 
- Implementar los modelos vistos en clase tanto de regresión como de clasificación sobre el data set de galaxias
- Realizar un análisis de los resultados obtenidos en cada una de las pruebas
- Comparar la performance y seleccionar el mejor modelo tanto para tabla como para imágenes

# Paquetes necesarios 

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
%matplotlib inline
plt.rcParams["figure.figsize"] = (10,6)

In [None]:
# Load the galaxy table, using the objID column as the row index.
dataset = pd.read_csv('galaxias_2.csv', index_col=['objID'])
# Quick look at the first two rows and at the (rows, columns) shape.
display(dataset.head(2))
display(dataset.shape)
display('distribución de las variables físicas de las galaxias')
# One histogram per numeric column of the table.
dataset.hist()
plt.tight_layout()

# Análisis de Datos

In [None]:
dataset.columns

In [None]:
# Keep only the first row for every objID (the index is compared as strings).
# The explicit .copy() makes data_cl an independent frame, so adding columns to
# it later does not raise pandas' SettingWithCopyWarning.
is_duplicate = dataset.index.astype(str).duplicated(keep="first")
data_cl = dataset.loc[~is_duplicate].copy()
data_cl.shape

In [None]:
def galaxy_type(row):
    """Map a row's morphology flags to a single-letter label.

    Returns "E" when ``row["elliptical"]`` is truthy, otherwise "S" when
    ``row["spiral"]`` is truthy, and "I" in every other case.
    """
    if row["elliptical"]:
        return "E"
    return "S" if row["spiral"] else "I"

In [None]:
data_cl["Type"] = data_cl.apply(galaxy_type, axis=1)

In [None]:
# Pass the column by keyword: recent seaborn releases interpret the first
# positional argument as `data`, not as `x`.
sns.countplot(x="Type", data=data_cl)

In [None]:
sns.heatmap(data_cl.isna(), yticklabels=False)

In [None]:
def distribution_per_type(df, col_name="", bins=20):
    """Overlay the distribution of ``col_name`` for each morphology flag.

    One ``sns.distplot`` is drawn on the current axes for the rows where the
    "elliptical", "spiral" and "uncertain" flag columns equal 1 (the last one
    is shown in the legend as "irregular").

    Parameters
    ----------
    df : DataFrame
        Table holding the three flag columns and ``col_name``.
    col_name : str
        Column whose distribution is plotted.
    bins : int
        Number of histogram bins forwarded to ``sns.distplot``.
    """
    plt.title(f"{col_name.capitalize()} Distribution")
    flag_to_label = (
        ("elliptical", "elliptical"),
        ("spiral", "spiral"),
        ("uncertain", "irregular"),
    )
    for flag_column, legend_label in flag_to_label:
        values = df.loc[df[flag_column] == 1, col_name]
        sns.distplot(values, label=legend_label, bins=bins)
    plt.legend()
    
def exploratory_plots(df, col_name=""):
    """Draw a three-row summary of ``col_name`` on the current figure.

    Row 1 shows the per-type distributions, row 2 a boxplot grouped by the
    "Type" column, and row 3 the same boxplot with the fliers hidden.
    """
    pretty_name = col_name.capitalize()

    plt.subplot(3, 1, 1)
    distribution_per_type(df, col_name)

    # (subplot position, title, extra keyword arguments for sns.boxplot)
    boxplot_panels = (
        (2, f"{pretty_name} Boxplot", {}),
        (3, f"{pretty_name} Boxplot w/o Outliers", {"showfliers": False}),
    )
    for position, heading, extra_kwargs in boxplot_panels:
        plt.subplot(3, 1, position)
        plt.title(heading)
        sns.boxplot(x="Type", y=col_name, data=df, **extra_kwargs)

    plt.tight_layout()

## Pair Plot

In [None]:
data_cl.columns

In [None]:
plot_cols = ['petroR90_r', 'Color', 'distancia_L', 'Mag_abs', 'Type']
sns.pairplot(data_cl[plot_cols], hue="Type")

## Color

In [None]:
exploratory_plots(data_cl, "Color")

## PetroR90

In [None]:
exploratory_plots(data_cl, "petroR90_r")

## Distancia L

In [None]:
exploratory_plots(data_cl, "distancia_L")

## Mag_abs

In [None]:
exploratory_plots(data_cl, "Mag_abs")

In [None]:
for mag in ['modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i','modelMag_z']:
    plt.figure()
    exploratory_plots(data_cl, mag)

# Regresión 


Existe una relación empírica entre el radio efectivo (petroR90_r) y la magnitud absoluta para galaxias (datos de Bender et al. 1992, ApJ., 399, 462)

Por lo tanto, el valor a predecir será **Mag_abs**, la cual está contenida en un intervalo real de tamaño ~9 mag; el atributo a usar será el logaritmo en base diez de la variable **petroR90_r**.

Detallar los pasos realizados (split, fit, métrica...) en el uso del algoritmo de Regresión lineal con y sin regularización.

Responda:
- ¿Qué significa el error cuadrático?
- ¿Qué unidades tiene?
- ¿Es necesaria la regularización?
- ¿Es bueno el valor obtenido en la métrica?
- ¿Qué pasa si se distingue por tipo de galaxia?

In [None]:
data_cl["log_petroR90"] = np.log10(data_cl["petroR90_r"])

In [None]:
f, axs = plt.subplots(1,2)
sns.distplot(data_cl["log_petroR90"], ax=axs[0])
sns.scatterplot(x="log_petroR90", y="Mag_abs", data=data_cl,hue="Type", ax=axs[1])

## Train / Validation / Test Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_squared_error, f1_score

In [None]:
X = data_cl["log_petroR90"]
y = data_cl["Mag_abs"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42421)
X_train, X_test = X_train.values.reshape(-1,1), X_test.values.reshape(-1,1)

In [None]:
sns.distplot(y)

In [None]:
y.describe()

## Modelo Lineal y Regularización

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

model_lasso = Lasso(alpha=0.01, random_state=4212)
model_lasso.fit(X_train, y_train)
#list(zip(X.columns, model.coef_))

In [None]:
y_pred = model.predict(X_test)
y_pred_lasso = model_lasso.predict(X_test)

In [None]:
# Report the same three test-set metrics for both fitted regressors.
separator = "-" * 80
for title, predictions in (
    ("Linear Regression", y_pred),
    ("Linear Regression with Lasso Regularization", y_pred_lasso),
):
    mse = mean_squared_error(y_test, predictions)
    print(separator)
    print(title)
    print("R2 Score: ", r2_score(y_test, predictions))
    print("MSE: ", mse)
    # "SE" is the summed squared error: the mean times the number of test rows.
    print("SE:  ", mse * y_test.shape[0])

In [None]:
sns.scatterplot(x=X_test.reshape(1,-1)[0], y=y_test, label="Test Target")
sns.scatterplot(x=X_test.reshape(1,-1)[0], y=y_pred, label="Predicted Target")
plt.xlabel("log_petroR90")
plt.legend()
plt.grid()

In [None]:
sns.scatterplot(x=y_pred, y=y_test)
plt.xlabel("Predicted Target")
plt.ylabel("Test Target")
plt.grid()

## Modelo por tipo de galaxia

In [None]:
# Build one train/test split per galaxy type; every dict is keyed by the label.
X_d, y_d = {}, {}
X_train_d, X_test_d, y_train_d, y_test_d = {}, {}, {}, {}

for ttype in data_cl["Type"].unique():
    mask = data_cl["Type"] == ttype
    rows_of_type = data_cl[mask]
    X_d[ttype] = rows_of_type["log_petroR90"]
    y_d[ttype] = rows_of_type["Mag_abs"]

    feat_train, feat_test, y_train_d[ttype], y_test_d[ttype] = train_test_split(
        X_d[ttype], y_d[ttype], test_size=0.2, random_state=42421
    )
    # Reshape the single feature into a column, i.e. an (n_samples, 1) matrix.
    X_train_d[ttype] = feat_train.values.reshape(-1, 1)
    X_test_d[ttype] = feat_test.values.reshape(-1, 1)

In [None]:
models = {}
y_pred_train_d = {}
y_pred_test_d  = {}
for ttype in data_cl["Type"].unique():
    models[ttype] = LinearRegression()
    models[ttype].fit(X_train_d[ttype], y_train_d[ttype])
    y_pred_train_d[ttype] = models[ttype].predict(X_train_d[ttype])
    y_pred_test_d[ttype] = models[ttype].predict(X_test_d[ttype])

for ttype in data_cl["Type"].unique():
    print("-"*80)
    print(ttype)
    print("-"*80)
    print("R2 Score: ", r2_score(y_test_d[ttype], y_pred_test_d[ttype]))
    print("MSE: ", mean_squared_error(y_test_d[ttype], y_pred_test_d[ttype]))
    print(models[ttype].coef_)

In [None]:
for ttype in data_cl["Type"].unique()[::-1]:
    sns.scatterplot(x=y_pred_test_d[ttype], y=y_test_d[ttype], alpha=0.8, label=ttype)
    plt.grid()
    plt.legend()

In [None]:
# One panel per galaxy type: observed and predicted Mag_abs vs. log_petroR90.
f, axs = plt.subplots(3, 1, figsize=(10, 10))
for idx, ttype in enumerate(data_cl["Type"].unique()):
    log_radius = X_test_d[ttype].reshape(1, -1)[0]

    # y_test_d holds the observed targets and y_pred_test_d the model output,
    # so the legend labels must follow that same order.
    sns.scatterplot(y=y_test_d[ttype], x=log_radius, ax=axs[idx],
                    label="Test Target")
    sns.scatterplot(y=y_pred_test_d[ttype], x=log_radius, ax=axs[idx],
                    label="Predicted Target")

    axs[idx].set_xlabel("log_petroR90")
    axs[idx].set_ylabel("Mag_Abs")
    axs[idx].legend()
    axs[idx].grid()
    axs[idx].set_title(ttype)
    # Same vertical range on every panel so the types can be compared by eye.
    axs[idx].set_ylim([-23, -16])

plt.tight_layout()

In [None]:
f, axs = plt.subplots(3,1, figsize=(10,10))
for idx, ttype in enumerate(data_cl["Type"].unique()):
    sns.scatterplot(x=y_pred_test_d[ttype], y=y_test_d[ttype], ax=axs[idx])
    axs[idx].grid()
    axs[idx].set_title(ttype)
    axs[idx].set_xlabel("Predicted Target")
    axs[idx].set_ylabel("Test Target")
plt.tight_layout()

## Respuestas

# Clasificación 

## Clasificación binaria 

Haga uso de los atributos petroR90_r, Color y Mag_abs para clasificar en  elípticas y espirales.

Use los siguientes modelos:

    - Perceptrón
    - Regresión logística
    - Vecinos más cercanos
    
Para cada uno de ellos muestre:

        - Matriz de confusión
        - Visualización de la frontera de decisión

In [None]:
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Binary problem: drop the rows labelled "I" and keep only "E" and "S".
data_clf = data_cl[data_cl["Type"] != "I"]
# Pass the column by keyword: recent seaborn releases interpret the first
# positional argument as `data`, not as `x`.
sns.countplot(x="Type", data=data_clf)

In [None]:
X_clf = data_clf[["petroR90_r", "Mag_abs", "Color"]]
y_clf = data_clf["Type"]

X_train, X_test, y_train, y_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42421)
#X_train, X_test = X_train.values.reshape(-1,1), X_test.values.reshape(-1,1)

In [None]:
std = StandardScaler()
X_train_std = std.fit_transform(X_train)
X_test_std   = std.transform(X_test)

In [None]:
def print_classification_report(y_real, y_pred):
    """Print accuracy and the per-class report, then plot the confusion matrix.

    Parameters
    ----------
    y_real : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_real``.

    The heatmap rows are the real classes and the columns the predicted ones;
    both axes are labelled with the class names.
    """
    print(f"Accuracy {accuracy_score(y_real, y_pred)}")
    print("-"*100)
    print(classification_report(y_real, y_pred))

    # Sorted union of the labels seen on either side, passed explicitly so the
    # matrix rows/columns and the tick labels are in the same order.
    class_labels = np.unique(np.concatenate([np.ravel(y_real), np.ravel(y_pred)]))

    plt.figure()
    sns.heatmap(confusion_matrix(y_real, y_pred, labels=class_labels),
                xticklabels=class_labels,
                yticklabels=class_labels,
                cmap="jet",
                annot=True,
                fmt="d",  # plain integer counts instead of the abbreviated default
               )
    plt.xlabel("Predicted Class")
    plt.ylabel("Real Class")
    plt.show()

In [None]:
perc    = Perceptron(random_state=421)
log_reg = LogisticRegression(random_state=421)
knn     = KNeighborsClassifier(n_neighbors=5)

### Hiper parametros por defecto

In [None]:
for model in [perc, log_reg, knn]:
    model.fit(X_train_std, y_train)

In [None]:
for model in [perc, log_reg, knn]:
    y_pred_train = model.predict(X_train_std)
    y_pred_test = model.predict(X_test_std)
    print("="*80)
    print(model.__class__.__name__)
    print("="*80)
    print("Accuracy Train: ", accuracy_score(y_train, y_pred_train))
    print("F1 score Train: ", f1_score(y_train, y_pred_train, pos_label = 'S'))
    print("Accuracy Test: ", accuracy_score(y_test, y_pred_test))
    print("F1 score Test: ", f1_score(y_test, y_pred_test, pos_label = 'S'))

In [None]:
for model in [log_reg, knn]:
    y_pred_train = model.predict(X_train_std)
    y_pred_test = model.predict(X_test_std)
    print("="*80)
    print(model.__class__.__name__)
    print("="*80)
    print("Train")
    print("-"*80)
    print_classification_report(y_train, y_pred_train)
    print("-"*80)
    print("Test")
    print("-"*80)
    print_classification_report(y_test, y_pred_test)

### Ajuste Hiper Parametros

Como no hay muchos hiperparámetros, vamos a usar validation curve

In [None]:
from sklearn.model_selection import validation_curve

In [None]:
# Candidate values of C, log-spaced between 1e-6 and 1e1.
x_logspace = np.logspace(-6, 1, 7)
# param_name / param_range are passed by keyword: recent scikit-learn releases
# no longer accept them positionally.
train_scores, valid_scores = validation_curve(
    LogisticRegression(solver="lbfgs"),
    X_train_std, y_train,
    param_name="C", param_range=x_logspace,
    cv=5,
)

In [None]:
# Mean score across the CV folds for every candidate value of C.
plt.semilogx(x_logspace, np.mean(train_scores,axis=1), "-ob", label="Train")
plt.semilogx(x_logspace, np.mean(valid_scores,axis=1), "-xr", label="Validation")
plt.grid()
# In LogisticRegression, C is the inverse of the regularization strength:
# smaller values mean stronger regularization.
plt.xlabel("C (inverse of regularization strength)")
plt.legend(loc=0)

In [None]:
# Candidate numbers of neighbours (1..14). The variable keeps its original name
# because the plotting cell below reads it, even though the grid is linear here.
x_logspace = range(1, 15)
# param_name / param_range are passed by keyword: recent scikit-learn releases
# no longer accept them positionally.
train_scores, valid_scores = validation_curve(
    KNeighborsClassifier(),
    X_train_std, y_train,
    param_name="n_neighbors", param_range=x_logspace,
    cv=5, n_jobs=-1,
)

In [None]:
plt.plot(x_logspace, np.mean(train_scores,axis=1), "-ob", label="Train")
plt.plot(x_logspace, np.mean(valid_scores,axis=1), "-xr", label="Validation")
plt.grid()
plt.legend(loc=0)

Veamos qué pasa con el test cuando usamos n_neighbors = 9

In [None]:
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_std, y_train)
y_pred_train = knn.predict(X_train_std)
y_pred_test = knn.predict(X_test_std)
print("="*80)
print(knn.__class__.__name__)
print("="*80)
print("Train")
print("-"*80)
print_classification_report(y_train, y_pred_train)
print("-"*80)
print("Test")
print("-"*80)
print_classification_report(y_test, y_pred_test)

### Curvas Precision/Recall y ROC

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder

y_scores = cross_val_predict(LogisticRegression(solver="lbfgs"),
                                               X_train_std, y_train ,cv=5,
                                               method="decision_function")


In [None]:
prec, rec, th = precision_recall_curve(LabelEncoder().fit_transform(y_train), y_scores)
plt.plot(th, prec[:-1], "--b", label="Precision")
plt.plot(th, rec[:-1], "--r", label="Recall")
plt.legend()
plt.grid()
plt.xlabel("Threshold")

In [None]:
# The string labels are integer-encoded before being handed to roc_curve.
fpr, tpr, th = roc_curve(LabelEncoder().fit_transform(y_train), y_scores)

plt.plot(fpr, tpr, label="ROC Log Regr")
# Diagonal reference line: a classifier that guesses at random.
plt.plot([0,1],[0,1], "--k")
plt.axis([0,1,0,1])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Without this call the label given to the ROC curve is never displayed.
plt.legend()
plt.grid()


### Fronteras

In [None]:
from matplotlib.colors import ListedColormap
from ml.visualization import plot_confusion_matrix, classifier_boundary

## Clasificación multiclase
Haga uso de los atributos petroR90_r, Color y Mag_abs para clasificar en elípticas, espirales e irregulares.

Use los siguientes modelos:

    - SGDClassifier con y sin Ajuste de Hiperparámetros
    - Árbol de Decisión con y sin Ajuste de Hiperparámetros
    
Para cada uno de ellos muestre:

        - Accuracy
        - Precision
        - Recall
        - F1
        - matriz de confusión

        

Responda: 
- ¿Qué métrica es la más apropiada a usar en este problema de clasificación?

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
data_clf = data_cl
X_clf = data_clf[["petroR90_r", "Mag_abs", "Color"]]
y_clf = data_clf["Type"]
X_train, X_test, y_train, y_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42421)

std = StandardScaler()
X_train_std = std.fit_transform(X_train)
X_test_std   = std.transform(X_test)

### Parámetros por defecto

In [None]:
sgd_clf    = SGDClassifier(random_state=421)
tree_clf = DecisionTreeClassifier(random_state=421)

In [None]:
for model in [sgd_clf,tree_clf]:
    model.fit(X_train_std, y_train)

In [None]:
# Accuracy and weighted F1 on both splits for each default-parameter model.
for model in [sgd_clf,tree_clf]:
    y_pred_train = model.predict(X_train_std)
    y_pred_test = model.predict(X_test_std)
    print("="*80)
    print(model.__class__.__name__)
    print("="*80)
    print("Accuracy Train: ", accuracy_score(y_train, y_pred_train))
    print("F1-score Train: ", f1_score(y_train, y_pred_train, average = 'weighted'))
    print("Accuracy Test: ", accuracy_score(y_test, y_pred_test))
    # The test-set F1 is computed from the test labels and test predictions.
    print("F1-score Test: ", f1_score(y_test, y_pred_test, average = 'weighted'))
    #print("Train")
    #print("-"*80)
    #print_classification_report(y_train, y_pred_train)
    #print("-"*80)
    #print("Test")
    #print("-"*80)
    #print_classification_report(y_test, y_pred_test)

### Ajuste de Hiper Parametros

In [None]:
from sklearn.model_selection import GridSearchCV

#### SGD

In [None]:
# Hyper-parameter grid explored for SGDClassifier (3 x 3 x 3 combinations).
# NOTE(review): newer scikit-learn releases spell the logistic loss "log_loss"
# instead of "log" — confirm against the installed version.
param_grid = {
    "loss":['hinge', 'log', "perceptron"],
    "penalty": ["l1", "l2", None],
    "alpha": [1e-5, 1e-4, 1e-3],
    }
sgd_clf    = SGDClassifier(random_state=2402)
# 5-fold cross-validated search that ranks candidates by accuracy.
grid_sgd = GridSearchCV(sgd_clf, param_grid=param_grid, cv=5, scoring="accuracy")

In [None]:
grid_sgd.fit(X_train_std, y_train)
means = grid_sgd.cv_results_['mean_test_score']
stds = grid_sgd.cv_results_['std_test_score']
# List every candidate from best to worst mean CV score. The loop variable is
# deliberately not called `std`: that name is bound to the fitted
# StandardScaler and must not be overwritten by a float.
for mean, std_dev, params in sorted(zip(means, stds, grid_sgd.cv_results_['params']),
                                    key=lambda data: data[0], reverse=True):
    print("%0.4f (+/-%0.04f) para %r" % (mean, std_dev * 2, params))

In [None]:
best_sgd = grid_sgd.best_estimator_
best_sgd.fit(X_train_std, y_train)

y_pred_train = best_sgd.predict(X_train_std)
y_pred_test = best_sgd.predict(X_test_std)
print("="*80)
print(best_sgd.__class__.__name__)
print("="*80)
print("Train: ", accuracy_score(y_train, y_pred_train))
print("Test: ", accuracy_score(y_test, y_pred_test))
print("Train")
print("-"*80)
print_classification_report(y_train, y_pred_train)
print("-"*80)
print("Test")
print("-"*80)
print_classification_report(y_test, y_pred_test)

#### Arbol de Decisión

In [None]:
param_grid = {
    "criterion":['gini', 'entropy'],
    "max_depth": [3, 4, 5, 7],
    "min_samples_leaf": [1, 2, 3, 4, 5, 6],
    }
tree_clf = DecisionTreeClassifier(random_state=421)
grid_tree = GridSearchCV(tree_clf, param_grid=param_grid, cv=5, scoring="accuracy")

In [None]:
grid_tree.fit(X_train_std, y_train)
means = grid_tree.cv_results_['mean_test_score']
stds = grid_tree.cv_results_['std_test_score']
# List every candidate from best to worst mean CV score. The loop variable is
# deliberately not called `std`: that name is bound to the fitted
# StandardScaler and must not be overwritten by a float.
for mean, std_dev, params in sorted(zip(means, stds, grid_tree.cv_results_['params']),
                                    key=lambda data: data[0], reverse=True):
    print("%0.4f (+/-%0.04f) para %r" % (mean, std_dev * 2, params))
print()

In [None]:
# Evaluate the best tree found by the grid search on both splits.
best_tree = grid_tree.best_estimator_
best_tree.fit(X_train_std, y_train)

y_pred_train = best_tree.predict(X_train_std)
y_pred_test = best_tree.predict(X_test_std)
print("="*80)
# Name the estimator being evaluated rather than the leftover loop variable
# `model` from an earlier cell.
print(best_tree.__class__.__name__)
print("="*80)
print("Train: ", accuracy_score(y_train, y_pred_train))
print("Test: ", accuracy_score(y_test, y_pred_test))
print("Train")
print("-"*80)
print_classification_report(y_train, y_pred_train)
print("-"*80)
print("Test")
print("-"*80)
print_classification_report(y_test, y_pred_test)

### Otras métricas

La matriz de confusión tiene la forma<br/>

| Predicho ↓ / Real → | Positives | Negatives |
| --- | --- | --- |
| **Positives** | True Positives  | False Positives |
| **Negatives** | False Negatives | True Negatives  |

Una métrica que nos dice el accuracy que tenemos sobre los valores predichos como positivos es **precision**. Nos dice cuántos de los valores que detectamos como positivos realmente lo son:

$precision = \frac{TP}{TP+FP}$

Otra métrica interesante es **recall**. Nos dice qué tan bueno fue el algoritmo para detectar a los positivos reales.

$recall = \frac{TP}{TP+FN}$

El **F1 score** es el promedio armónico de los 2 anteriores y solamente va a tener un valor alto cuando los otros 2 sean altos

$F1 = 2 \times \frac{precision \times recall}{precision + recall}$

Utilizamos F1 como score porque las clases se encuentran desbalanceadas en este caso y además no tenemos especial interés en ninguna clase.

##### Score: f1-score

In [None]:
# Same decision-tree grid as before, now ranked by weighted F1.
param_grid = {
    "criterion":['gini', 'entropy'],
    "max_depth": [3, 4, 5, 7],
    "min_samples_leaf": [1, 2, 3, 4, 5, 6],
    }
tree_clf = DecisionTreeClassifier(random_state=421)
grid_tree = GridSearchCV(tree_clf, param_grid=param_grid, cv=5, scoring="f1_weighted")
grid_tree.fit(X_train_std, y_train)
means = grid_tree.cv_results_['mean_test_score']
stds = grid_tree.cv_results_['std_test_score']
# List every candidate from best to worst mean CV score. The loop variable is
# deliberately not called `std`: that name is bound to the fitted
# StandardScaler and must not be overwritten by a float.
for mean, std_dev, params in sorted(zip(means, stds, grid_tree.cv_results_['params']),
                                    key=lambda data: data[0], reverse=True):
    print("%0.4f (+/-%0.04f) para %r" % (mean, std_dev * 2, params))

In [None]:
best_tree = grid_tree.best_estimator_
best_tree.fit(X_train_std, y_train)

y_pred_train = best_tree.predict(X_train_std)
y_pred_test = best_tree.predict(X_test_std)
print("="*80)
print(best_tree.__class__.__name__)
print("="*80)
print("Train: ", accuracy_score(y_train, y_pred_train))
print("Test: ", accuracy_score(y_test, y_pred_test))
print("Train")
print("-"*80)
print_classification_report(y_train, y_pred_train)
print("-"*80)
print("Test")
print("-"*80)
print_classification_report(y_test, y_pred_test)

## Veamos ahora que sucede al balancear las clases

In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
from sklearn.exceptions import DataConversionWarning

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# None means "never truncate cell contents" (pandas >= 1.0); the legacy -1
# sentinel is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
# NOTE: the selected model must expose a fit method; otherwise this function cannot be used with it.

def cv_and_smote(clf, x_train, y_train, x_test, rnd = 22):
    """Cross-validate ``clf`` with SMOTE oversampling applied inside each fold.

    The training data is split into 5 stratified folds. On every iteration
    SMOTE is fitted on the training part only (the validation part is left
    untouched), ``clf`` is refitted on the resampled data and then scored on
    the validation part.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold.
    x_train, y_train : array-like
        Training features and labels.
    x_test : array-like
        Features to predict once cross-validation has finished.
    rnd : int, default 22
        Seed used for both the fold shuffling and SMOTE.

    Returns
    -------
    tuple
        ``(mean_accuracy, mean_recall, mean_precision, mean_f1, y_pred_test)``.
        Recall, precision and F1 are macro-averaged over the classes and then
        averaged over the folds. ``y_pred_test`` is produced by ``clf`` as
        fitted on the LAST fold; it is not refitted on the whole training set.
    """
    # NOTE: this changes the process-wide warning filter, not just this call.
    warnings.filterwarnings('ignore')

    x_train_df = pd.DataFrame(x_train)
    # Keep the labels one-dimensional so they are not handed to the estimators
    # as a column vector.
    y_train_sr = pd.Series(np.ravel(y_train))

    # shuffle=True is required for random_state to have any effect; current
    # scikit-learn raises if random_state is set while shuffle is False.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rnd)

    # lists to append scores
    cross_val_f1_score_lst = []
    cross_val_accuracy_lst = []
    cross_val_recall_lst = []
    cross_val_precision_lst = []

    for train_index_ls, validation_index_ls in kf.split(x_train, y_train):

        # splitting on train/validation
        train, validation = x_train_df.iloc[train_index_ls], x_train_df.iloc[validation_index_ls]
        target_train, target_val = y_train_sr.iloc[train_index_ls], y_train_sr.iloc[validation_index_ls]

        sm = SMOTE(random_state=rnd)

        # fit_resample is the current imbalanced-learn name of the old fit_sample.
        X_train_res, y_train_res = sm.fit_resample(train, target_train)

        # model definition
        clf.fit(X_train_res, y_train_res)

        # testing on 1 fold of validation set
        validation_pred = clf.predict(validation)

        # appending scores of different metrics
        cross_val_recall_lst.append(recall_score(target_val, validation_pred, average = 'macro'))
        cross_val_accuracy_lst.append(accuracy_score(target_val, validation_pred))
        cross_val_precision_lst.append(precision_score(target_val, validation_pred, average = 'macro'))
        cross_val_f1_score_lst.append(f1_score(target_val, validation_pred, average = 'macro'))

    y_pred_test = clf.predict(x_test)

    return (np.mean(cross_val_accuracy_lst),
            np.mean(cross_val_recall_lst),
            np.mean(cross_val_precision_lst),
            np.mean(cross_val_f1_score_lst),
            y_pred_test)


# Imágenes

Por último,
- Armen un conjunto de 100 imágenes [300, 300, 5] (guarden ese conjunto de datos. img.formato)
- Elijan el modelo con la mejor performance y apliquen sobre el conjunto de imágenes. 

- Dejen un análisis de las performances que obtienen con ese modelo tanto para imágenes como para valores de tabla.
- ¿Está bien realizar este procedimiento en este tipo de problema? Es decir, tomar un modelo que clasifica bien al mismo conjunto de galaxias usando datos de tabla y aplicarlo a las imágenes.

## Conclusión