# Tarea 05: Optimización de hiperparámetros

En la carpeta de data https://github.com/scidatmath2020/ML_Py_23/tree/main/data   encontrarás la tabla datos_películas.csv. Aplica todos los modelos de clasificación muticlase que hemos visto, con score F1, para encontrar el mejor clasificador.

¿Cuál es el mejor modelo para clasificar con esta tabla?

In [1]:
# Importamos las librerías estandar para manipular nuestro dataframe
import pandas as pd
import numpy as np
from siuba import *
from siuba.dply.vector import * 
from plotnine import *
import time

In [2]:
######### Importar datos ##########

# Importamos los datos de las películas desde Github
mi_data = pd.read_csv('https://raw.githubusercontent.com/scidatmath2020/ML_Py_23/main/data/datos_peliculas.csv')
# Visualizamos las primeras 5 filas
mi_data.head()

Unnamed: 0,pelicula,año,ratings,genero,ventas,presupuesto,secuela,vistas_youtube,positivos_youtube,negativos_youtube,comentarios,seguidores_agregados
0,13 Sins,2014,6.3,8,9130,4000000.0,1,3280543,4632,425,636,1120000.0
1,22 Jump Street,2014,7.1,1,192000000,50000000.0,2,583289,3465,61,186,12350000.0
2,3 Days to Kill,2014,6.2,1,30700000,28000000.0,1,304861,328,34,47,483000.0
3,300: Rise of an Empire,2014,6.3,1,106000000,110000000.0,2,452917,2429,132,590,568000.0
4,A Haunted House 2,2014,4.7,8,17300000,3500000.0,2,3145573,12163,610,1082,1923800.0


In [3]:
# Nuestra columna objetivo es 'genero', el cual nos describe el genero de la película. Al entrenar los
# modelos queremos predecir de qué genero es una película basado en las columnas numéricas.
# Checamos que clases tenemos en 'genero'
mi_data.genero.unique()

array([ 8,  1,  3, 10, 15, 12,  9,  2,  7,  6,  4], dtype=int64)

In [4]:
# Ahora chequemos si la columna 'genero' tiene clases balanceadas o desbalanceadas
mi_data >> group_by(_.genero) >> summarize(total = n(_))

Unnamed: 0,genero,total
0,1,65
1,2,12
2,3,46
3,4,1
4,6,3
5,7,2
6,8,54
7,9,13
8,10,12
9,12,13


Como vemos hay clases desbalanceadas en este dataset. Por ejemplo, los géneros '4', '6' o '7'. Será importante tomar esto en cuenta en lo
subsiguiente.

In [5]:
######### Modelos de clasificación multiclase ##########

# Importamos las librerías de sklearn que nos permiten clasificar datos multiclase
# Estos son: K-vecinos, árboles de decisión y máquinas de soporte vectorial
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

In [6]:
# Definimos como X al dataframe con los valores independientes. Para esto vamos a considerar todas los valores numéricos
# Como usaremos modelos que se basan en distancias, vamos a omitir las columnas 'pelicula', 'año' y 'secuela'
# Estos son todas las columnas excepto 'Species'.
X = mi_data >> select(-_.pelicula, -_.año, -_.secuela, -_.genero)

# Definimos como y al dataframe con los valores objetivo, los cuales se encuentran en la columna 'Species'
y =  mi_data >> select(_.genero)

In [7]:
# Las siguiente funciones son las vistas en clase. Aquí cambiamos
# 'f1_micro' que erea una medida para clases balanceadas, por 'f1_weighted'
# para clases desbalanceadas
def evaluar_modelo(estimador, X, y):
    resultados_estimador = cross_validate(estimador, X, y,
                                            scoring="f1_weighted", n_jobs=-1, cv=5)
    return resultados_estimador

def ver_resultados():
    resultados_df  = pd.DataFrame(resultados).T
    resultados_cols = resultados_df.columns
    for col in resultados_cols:
        resultados_df[col] = resultados_df[col].apply(np.mean)
        resultados_df[col+"_idx"] = resultados_df[col] / resultados_df[col].max()
    return resultados_df >> arrange(-_.test_score,_.fit_time)

resultados = {}

In [8]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [9]:
# Usamos GridSearchCV para modelos de árboles de decisión
model_tree = tree.DecisionTreeClassifier()
# Estos son los hiperparámetros que vamos a evaluar
parametros_tree = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [2, 3, 4, 5, 6, 7, 8]
}

tree_grid = GridSearchCV(estimator = model_tree, 
                    param_grid = parametros_tree,
                    scoring = "f1_weighted", n_jobs = -1)

start_time = time.time()
tree_grid.fit(X, y.values.ravel())
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

print(tree_grid.best_score_)
print(tree_grid.best_estimator_.get_params())

resultados["tree_gridsearch"] = evaluar_modelo(tree_grid.best_estimator_,
                                             X,
                                             y.values.ravel())

ver_resultados()



Elapsed time: 4.979994058609009 seconds
0.34793560145312036
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}




Unnamed: 0,fit_time,score_time,test_score,fit_time_idx,score_time_idx,test_score_idx
tree_gridsearch,0.006725,0.004017,0.347936,1.0,1.0,1.0


In [11]:
# Usamos RandomizedSearchCV para modelos de árboles de decisión

tree_random = RandomizedSearchCV(estimator = model_tree, 
                     param_distributions = parametros_tree,
                    scoring = "f1_weighted", n_jobs = -1, n_iter=10)

start_time = time.time()
tree_random.fit(X, y.values.ravel())
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

print(tree_random.best_score_)
print(tree_random.best_estimator_.get_params())

resultados["tree_randomsearch"] = evaluar_modelo(tree_random.best_estimator_,
                                             X,
                                             y.values.ravel())

ver_resultados()

Elapsed time: 0.1630854606628418 seconds
0.34793560145312036
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}




Unnamed: 0,fit_time,score_time,test_score,fit_time_idx,score_time_idx,test_score_idx
tree_gridsearch,0.006725,0.004017,0.347936,0.737313,0.917,1.0
tree_randomsearch,0.009121,0.004381,0.344648,1.0,1.0,0.990552


In [12]:
# Usamos GridSearchCV para modelos de K-vecinos
model_knn = KNeighborsClassifier()
parametros_knn = {
    "n_neighbors": [1, 10, 20, 30, 40, 50],
    "p": [1, 2, 3],
    "weights": ["uniform", "distance"]
}

knn_grid = GridSearchCV(estimator = model_knn, 
                    param_grid = parametros_knn,
                    scoring = "f1_weighted", n_jobs = -1)

start_time = time.time()
knn_grid.fit(X, y.values.ravel())
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

print(knn_grid.best_score_)
print(knn_grid.best_estimator_.get_params())

resultados["knn_gridsearch"] = evaluar_modelo(knn_grid.best_estimator_,
                                             X,
                                             y.values.ravel())

ver_resultados()



Elapsed time: 0.4005906581878662 seconds
0.299490184708353
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 20, 'p': 1, 'weights': 'distance'}




Unnamed: 0,fit_time,score_time,test_score,fit_time_idx,score_time_idx,test_score_idx
tree_gridsearch,0.006725,0.004017,0.347936,0.737313,0.873365,1.0
tree_randomsearch,0.009121,0.004381,0.344648,1.0,0.952415,0.990552
knn_gridsearch,0.004223,0.004599,0.29949,0.462952,1.0,0.860763


In [13]:
# Usamos RandomizedSearchCV para modelos de K-vecinos
knn_random = RandomizedSearchCV(estimator = model_knn, 
                    param_distributions = parametros_knn,
                    scoring = "f1_weighted", n_jobs = -1, n_iter=10)

start_time = time.time()
knn_random.fit(X, y.values.ravel())
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

print(knn_random.best_score_)
print(knn_random.best_estimator_.get_params())

resultados["knn_randomsearch"] = evaluar_modelo(knn_random.best_estimator_,
                                             X,
                                             y.values.ravel())

ver_resultados()



Elapsed time: 0.18040108680725098 seconds
0.29442116230665394
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 30, 'p': 1, 'weights': 'distance'}


Unnamed: 0,fit_time,score_time,test_score,fit_time_idx,score_time_idx,test_score_idx
tree_gridsearch,0.006725,0.004017,0.347936,0.737313,0.873365,1.0
tree_randomsearch,0.009121,0.004381,0.344648,1.0,0.952415,0.990552
knn_gridsearch,0.004223,0.004599,0.29949,0.462952,1.0,0.860763
knn_randomsearch,0.005229,0.003928,0.294421,0.573314,0.854061,0.846194


In [14]:
# Usamos GridSearchCV para modelos con máquinas de soporte vectorial
# Primero vamos a evaluar los kernels polinomiales
model_svm = SVC()

parametros_svm_pol = {
    "degree": [1, 2, 3, 4],
    "kernel": ["poly"]
}

svm_grid_pol = GridSearchCV(estimator=model_svm, 
                        param_grid=parametros_svm_pol,
                        scoring="f1_weighted", n_jobs=-1)

start_time = time.time()
svm_grid_pol.fit(X, y.values.ravel())
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

print(svm_grid_pol.best_score_)
print(svm_grid_pol.best_estimator_.get_params())

resultados["svm_gridsearch_pol"] = evaluar_modelo(svm_grid_pol.best_estimator_,
                                             X,
                                             y.values.ravel())

ver_resultados()



Elapsed time: 0.21175861358642578 seconds
0.2536284213835907
{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 1, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}




Unnamed: 0,fit_time,score_time,test_score,fit_time_idx,score_time_idx,test_score_idx
tree_gridsearch,0.006725,0.004017,0.347936,0.570044,0.806755,1.0
tree_randomsearch,0.009121,0.004381,0.344648,0.773137,0.879776,0.990552
knn_gridsearch,0.004223,0.004599,0.29949,0.357926,0.923733,0.860763
knn_randomsearch,0.005229,0.003928,0.294421,0.44325,0.788924,0.846194
svm_gridsearch_pol,0.011797,0.004979,0.253628,1.0,1.0,0.728952


In [15]:
# Usamos RandomizedSearchCV para modelos con máquinas de soporte vectorial
# con los kernels polinomiales
svm_random_pol = RandomizedSearchCV(estimator=model_svm, 
                        param_distributions=parametros_svm_pol,
                        scoring="f1_weighted", n_jobs=-1, n_iter=10)

start_time = time.time()
svm_random_pol.fit(X, y.values.ravel())
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

print(svm_random_pol.best_score_)
print(svm_random_pol.best_estimator_.get_params())

resultados["svm_randomsearch_pol"] = evaluar_modelo(svm_random_pol.best_estimator_,
                                             X,
                                             y.values.ravel())

ver_resultados()



Elapsed time: 0.3202941417694092 seconds
0.2536284213835907
{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 1, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}




Unnamed: 0,fit_time,score_time,test_score,fit_time_idx,score_time_idx,test_score_idx
tree_gridsearch,0.006725,0.004017,0.347936,0.556202,0.801139,1.0
tree_randomsearch,0.009121,0.004381,0.344648,0.754364,0.873652,0.990552
knn_gridsearch,0.004223,0.004599,0.29949,0.349235,0.917302,0.860763
knn_randomsearch,0.005229,0.003928,0.294421,0.432487,0.783432,0.846194
svm_gridsearch_pol,0.011797,0.004979,0.253628,0.975718,0.993039,0.728952
svm_randomsearch_pol,0.012091,0.005014,0.253628,1.0,1.0,0.728952


In [16]:
# Usamos GridSearchCV para modelos con máquinas de soporte vectorial
# Ahora vamos a evaluar los kernel gaussianos

parametros_svm_rbf = {
    "gamma": [0.1, 1.0, 10],
    "kernel": ["rbf"]
}

svm_grid_rbf = GridSearchCV(estimator=model_svm, 
                        param_grid=parametros_svm_rbf,
                        scoring="f1_weighted", n_jobs=-1)

start_time = time.time()
svm_grid_rbf.fit(X, y.values.ravel())
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

print(svm_grid_rbf.best_score_)
print(svm_grid_rbf.best_estimator_.get_params())

resultados["svm_gridsearch_rbf"] = evaluar_modelo(svm_grid_rbf.best_estimator_,
                                             X,
                                             y.values.ravel())

ver_resultados()



Elapsed time: 0.13820385932922363 seconds
0.12360317136779608
{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Unnamed: 0,fit_time,score_time,test_score,fit_time_idx,score_time_idx,test_score_idx
tree_gridsearch,0.006725,0.004017,0.347936,0.268023,0.490378,1.0
tree_randomsearch,0.009121,0.004381,0.344648,0.363513,0.534763,0.990552
knn_gridsearch,0.004223,0.004599,0.29949,0.168289,0.561481,0.860763
knn_randomsearch,0.005229,0.003928,0.294421,0.208407,0.479539,0.846194
svm_gridsearch_pol,0.011797,0.004979,0.253628,0.47018,0.60784,0.728952
svm_randomsearch_pol,0.012091,0.005014,0.253628,0.48188,0.612101,0.728952
svm_gridsearch_rbf,0.025091,0.008192,0.123603,1.0,1.0,0.355247


In [17]:
# Usamos RandomizedSearchCV para modelos con máquinas de soporte vectorial
# con los kernel gaussianos

svm_random_rbf = RandomizedSearchCV(estimator=model_svm, 
                        param_distributions=parametros_svm_rbf,
                        scoring="f1_weighted", n_jobs=-1, n_iter=10)

start_time = time.time()
svm_random_rbf.fit(X, y.values.ravel())
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

print(svm_random_rbf.best_score_)
print(svm_random_rbf.best_estimator_.get_params())

resultados["svm_randomsearch_rbf"] = evaluar_modelo(svm_random_rbf.best_estimator_,
                                             X,
                                             y.values.ravel())

ver_resultados()



Elapsed time: 0.14735865592956543 seconds
0.12360317136779608
{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Unnamed: 0,fit_time,score_time,test_score,fit_time_idx,score_time_idx,test_score_idx
tree_gridsearch,0.006725,0.004017,0.347936,0.268023,0.490378,1.0
tree_randomsearch,0.009121,0.004381,0.344648,0.363513,0.534763,0.990552
knn_gridsearch,0.004223,0.004599,0.29949,0.168289,0.561481,0.860763
knn_randomsearch,0.005229,0.003928,0.294421,0.208407,0.479539,0.846194
svm_gridsearch_pol,0.011797,0.004979,0.253628,0.47018,0.60784,0.728952
svm_randomsearch_pol,0.012091,0.005014,0.253628,0.48188,0.612101,0.728952
svm_randomsearch_rbf,0.021033,0.006698,0.123603,0.838275,0.817646,0.355247
svm_gridsearch_rbf,0.025091,0.008192,0.123603,1.0,1.0,0.355247


De este ejercicio se concluye que el mejor modelo para ajustar datos_peliculas.csv es árboles de decisión usando como hiperparámetros:

'criterion': 'entropy' 

'max_depth': 3