In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
import pickle



from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import F1Score
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.regularizers import L2
from tensorflow import multiply
from tensorflow.keras.callbacks import EarlyStopping
from itertools import product
from tensorflow.random import set_seed


Realizaremos una búsqueda aleatoria de los mejores hiperparámetros, ya que una búsqueda exhaustiva sería demasiado ineficiente para un dataset tan grande.

In [2]:
def regresionLogistica(X, y, n, v_cruzada, seed):
    """Tune a logistic regression with a randomized search and re-score the winner.

    Parameters
    ----------
    X : feature matrix handed to scikit-learn's ``fit``.
    y : target labels handed to scikit-learn's ``fit``.
    n : int
        Number of parameter settings sampled by the search (``n_iter``).
    v_cruzada : int or cross-validation splitter
        Passed as ``cv`` to both the search and the final scoring.
    seed : int
        Random state for the search and for the estimator.

    Returns
    -------
    float
        Mean weighted F1 of the best configuration across the folds.
    """
    # Settings shared by the searched estimator and the re-scored one, so the
    # winner is evaluated under exactly the configuration that was tuned
    # (same iteration budget, same random state).
    fijos = {'max_iter': 500, 'random_state': seed}

    log_reg = LogisticRegression(**fijos)
    parametros_log_reg = {'C': np.logspace(-4, 4, 20), 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}
    busqueda_log_reg = RandomizedSearchCV(estimator=log_reg, param_distributions=parametros_log_reg, n_iter=n,
                                          scoring='f1_weighted', cv=v_cruzada, verbose=1, random_state=seed, n_jobs=-1)

    busqueda_log_reg.fit(X, y)
    parametros = busqueda_log_reg.best_params_
    print(f'Los mejores parámetros son: {parametros} con una puntuación de {busqueda_log_reg.best_score_}')

    modelo = LogisticRegression(**fijos, **parametros)
    scores = cross_val_score(modelo, X, y, cv=v_cruzada, scoring='f1_weighted')
    return scores.mean()


def SGDC(X, y, n, v_cruzada, seed):
    """Tune an SGD linear classifier with a randomized search and re-score the winner.

    Parameters
    ----------
    X : feature matrix handed to scikit-learn's ``fit``.
    y : target labels handed to scikit-learn's ``fit``.
    n : int
        Number of parameter settings sampled by the search (``n_iter``).
    v_cruzada : int or cross-validation splitter
        Passed as ``cv`` to both the search and the final scoring.
    seed : int
        Random state for the search and for the estimator.

    Returns
    -------
    float
        Mean weighted F1 of the best configuration across the folds.
    """
    # SGD shuffles the training data, so the estimator needs the seed too;
    # without it the re-scored value is not reproducible and drifts away from
    # the score reported by the search.
    sgd = SGDClassifier(random_state=seed)
    parametros_sgdc = {'loss': ['hinge', 'squared_error', 'log_loss'], 'alpha': np.logspace(-5, 0, 10),
                       'max_iter': [100, 200], 'tol': [1e-3, 1e-4]}
    busqueda_sgdc = RandomizedSearchCV(estimator=sgd, param_distributions=parametros_sgdc, n_iter=n,
                                       scoring='f1_weighted', cv=v_cruzada, verbose=1, random_state=seed, n_jobs=-1)

    busqueda_sgdc.fit(X, y)
    parametros = busqueda_sgdc.best_params_
    print(f'Los mejores parámetros son: {parametros} con una puntuación de {busqueda_sgdc.best_score_}')

    modelo = SGDClassifier(random_state=seed, **parametros)
    scores = cross_val_score(modelo, X, y, cv=v_cruzada, scoring='f1_weighted')
    return scores.mean()

def bosqueAleatorio(X, y, n, v_cruzada, seed):
    """Tune a random forest with a randomized search and re-score the winner.

    Parameters
    ----------
    X : feature matrix handed to scikit-learn's ``fit``.
    y : target labels handed to scikit-learn's ``fit``.
    n : int
        Number of parameter settings sampled by the search (``n_iter``).
    v_cruzada : int or cross-validation splitter
        Passed as ``cv`` to both the search and the final scoring.
    seed : int
        Random state for the search and for the estimator.

    Returns
    -------
    float
        Mean weighted F1 of the best configuration across the folds.
    """
    # The forest is stochastic (bootstrap samples, feature subsets), so the
    # estimator itself must be seeded for the scores to be reproducible.
    rf = RandomForestClassifier(random_state=seed)
    parametros_rf = {'n_estimators': np.arange(100, 200, 20), 'max_features': ['sqrt', 'log2'] + list(range(1, 10, 3)),
                     'max_depth': list(np.arange(5, 20, 5)), 'min_samples_split': np.arange(5, 20, 5)}
    busqueda_rf = RandomizedSearchCV(estimator=rf, param_distributions=parametros_rf, n_iter=n, scoring='f1_weighted',
                                     cv=v_cruzada, verbose=1, random_state=seed, n_jobs=-1)

    busqueda_rf.fit(X, y)
    parametros = busqueda_rf.best_params_
    print(f'Los mejores parámetros son: {parametros} con una puntuación de {busqueda_rf.best_score_}')

    modelo = RandomForestClassifier(random_state=seed, **parametros)
    scores = cross_val_score(modelo, X, y, cv=v_cruzada, scoring='f1_weighted')
    return scores.mean()


def arbolDecision(X, y, n, v_cruzada, seed):
    """Tune a decision tree with a randomized search and re-score the winner.

    Parameters
    ----------
    X : feature matrix handed to scikit-learn's ``fit``.
    y : target labels handed to scikit-learn's ``fit``.
    n : int
        Number of parameter settings sampled by the search (``n_iter``).
    v_cruzada : int or cross-validation splitter
        Passed as ``cv`` to both the search and the final scoring.
    seed : int
        Random state for the search and for the estimator.

    Returns
    -------
    float
        Mean weighted F1 of the best configuration across the folds.
    """
    # Seed the tree as well: split selection involves randomness, so an
    # unseeded tree yields a re-scored value that differs from the search's.
    dtc = DecisionTreeClassifier(random_state=seed)
    parametros_dtc = {'max_depth': list(np.arange(20, 50, 2)), 'min_samples_split': np.arange(10, 20, 2),
                      'criterion': ['gini', 'entropy']}
    busqueda_dtc = RandomizedSearchCV(estimator=dtc, param_distributions=parametros_dtc, n_iter=n, scoring='f1_weighted',
                                      cv=v_cruzada, verbose=1, random_state=seed, n_jobs=-1)

    busqueda_dtc.fit(X, y)
    parametros = busqueda_dtc.best_params_
    print(f'Los mejores parámetros son: {parametros} con una puntuación de {busqueda_dtc.best_score_}')

    modelo = DecisionTreeClassifier(random_state=seed, **parametros)
    scores = cross_val_score(modelo, X, y, cv=v_cruzada, scoring='f1_weighted')
    return scores.mean()

def redNeuronal(X, y, n, v_cruzada, seed):
    """Tune and cross-validate a small feed-forward binary classifier.

    A grid of learning rates and batch sizes is evaluated on a stratified 20 %
    hold-out; the best pair is then scored with stratified k-fold
    cross-validation.  Scores are scikit-learn's weighted F1 so that the
    returned number is comparable with the ``'f1_weighted'`` scoring used by
    the other tuning functions in this notebook.

    Parameters
    ----------
    X : feature matrix (DataFrame or array); converted to a float32 array.
    y : binary target; assumes labels are encoded as 0/1 -- TODO confirm.
    n : int
        Unused here; kept so the signature matches the other tuning functions.
    v_cruzada : int
        Number of cross-validation folds.
    seed : int
        Seed for TensorFlow and for the hold-out split.

    Returns
    -------
    float
        Mean weighted F1 of the best configuration across the folds.
    """
    # Local imports: scikit-learn is already a dependency of this notebook,
    # but these names are not part of its top import cell.
    from sklearn.metrics import f1_score
    from sklearn.model_selection import StratifiedKFold, train_test_split

    set_seed(seed)

    def design_model(n_features, lr):
        """Build and compile the network for `n_features` inputs."""
        input_ = Input(shape=(n_features,))
        layer1 = Dropout(0.3)(input_)
        # NOTE(review): the hidden Dense layers have no activation, so the
        # stack is a composition of linear maps -- confirm this is intended.
        # The regularizer and the loss are passed as instances, not classes.
        layer2 = Dense(128, kernel_regularizer=L2())(layer1)
        layer3 = Dropout(0.3)(layer2)
        layer4 = Dense(128, kernel_regularizer=L2())(layer3)
        output = Dense(1, activation="sigmoid")(layer4)

        model = Model(inputs=input_, outputs=output)
        model.compile(loss=BinaryCrossentropy(), optimizer=Adam(learning_rate=lr))
        return model

    def fit_and_score(lr, batch_size, X_train, y_train, X_val, y_val):
        """Train a fresh model and return its weighted F1 on the validation data."""
        # A new callback and a new model per fit: nothing learned on one
        # split/fold can leak into the next one.
        stop = EarlyStopping(monitor='val_loss', mode='min', patience=50, restore_best_weights=True)
        model = design_model(X_train.shape[-1], lr=lr)
        model.fit(X_train, y_train.reshape(-1, 1), epochs=500, batch_size=batch_size,
                  validation_data=(X_val, y_val.reshape(-1, 1)), callbacks=[stop], verbose=0)
        # Threshold the sigmoid output at 0.5 to obtain hard class predictions.
        predicted = (model.predict(X_val, verbose=0).ravel() >= 0.5).astype(int)
        return f1_score(y_val.astype(int), predicted, average='weighted')

    X_arr = np.asarray(X, dtype='float32')
    y_arr = np.asarray(y, dtype='float32').ravel()

    # --- Hyperparameter selection on a stratified hold-out -----------------
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X_arr, y_arr, test_size=0.2, stratify=y_arr, random_state=seed)

    results = [
        (lr, batch_size, fit_and_score(lr, batch_size, X_fit, y_fit, X_hold, y_hold))
        for lr, batch_size in product((0.1, 0.01, 0.001), (16, 64, 256))
    ]

    # Select by score.  A bare max() over the tuples would compare them
    # lexicographically and always return the largest learning rate.
    best_param = max(results, key=lambda r: r[2])
    print(f'Los mejores parámetros son: learning_rate: {best_param[0]}, num_batches: {best_param[1]} con una puntuación de {best_param[2]}')

    # --- Cross-validated score of the selected configuration ---------------
    folds = StratifiedKFold(n_splits=v_cruzada)
    scores = np.array([
        fit_and_score(best_param[0], best_param[1],
                      X_arr[train_idx], y_arr[train_idx],
                      X_arr[val_idx], y_arr[val_idx])
        for train_idx, val_idx in folds.split(X_arr, y_arr)
    ])

    return scores.mean()


def KNN(X, y, n, v_cruzada, seed):
    """Randomized search over k-nearest-neighbours settings, then re-score the winner.

    Samples `n` settings (neighbour count, weighting, distance metric), prints
    the best one with its search score, and returns the mean weighted F1 of a
    fresh classifier built with those settings under the same `v_cruzada`
    cross-validation.
    """
    espacio = {
        'n_neighbors': np.arange(1, 15, 3),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }
    buscador = RandomizedSearchCV(
        estimator=KNeighborsClassifier(),
        param_distributions=espacio,
        n_iter=n,
        scoring='f1_weighted',
        cv=v_cruzada,
        verbose=1,
        random_state=seed,
        n_jobs=-1,
    )
    buscador.fit(X, y)

    mejores = buscador.best_params_
    print(f'Los mejores parámetros son: {mejores} con una puntuación de {buscador.best_score_}')

    # Re-evaluate an unfitted classifier configured with the winning settings.
    puntuaciones = cross_val_score(
        KNeighborsClassifier(**mejores), X, y, cv=v_cruzada, scoring='f1_weighted'
    )
    return puntuaciones.mean()



In [6]:
import tensorflow as tf

# Report whether TensorFlow can see any GPU on this machine.
gpus = tf.config.experimental.list_physical_devices('GPU')
print(f"Available GPUs: {gpus}" if gpus else "No GPU detected.")

No GPU detected.


In [3]:
# Shared experiment settings: CV folds, random seed, and search iterations (n_iter).
v_cruzada, seed, n = 5, 34, 30

# Work on fixed-size, seeded random samples of each dataset.
df = (
    pd.read_csv('df.csv', index_col=0)
    .sample(n=50000, random_state=seed)
)
df_over = (
    pd.read_csv('df_over.csv', index_col=0)
    .sample(n=50000, random_state=seed)
)

def read_pickle(path):
    """Return the object stored in the pickle file at `path`.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def hyperparameter_tuning_general(path, datos=None):
    """Tune and score every model on the column selection pickled at `path`.

    Parameters
    ----------
    path : str
        Pickle file whose content is used to index the frame's columns; the
        selection must include 'label', which is used as the target.
    datos : pandas.DataFrame, optional
        Frame to take the columns from.  Defaults to the module-level `df`;
        pass `df_over` to tune on that sample instead.

    Prints the score returned by each tuning function, in order.
    """
    origen = df if datos is None else datos
    df_rdx = origen[read_pickle(path)]
    X, y = df_rdx.drop('label', axis=1), df_rdx['label']
    for modelo in (regresionLogistica, SGDC, bosqueAleatorio, arbolDecision, redNeuronal, KNN):
        print(modelo(X, y, n, v_cruzada, seed))

In [None]:
def hyperparameter_tuning_general(path):
    """Debug variant: tune and score only the neural network.

    Replaces the earlier definition of the same name; the other models are
    deliberately skipped here.
    """
    columnas = read_pickle(path)
    df_rdx = df[columnas]
    X = df_rdx.drop('label', axis=1)
    y = df_rdx['label']
    puntuacion = redNeuronal(X, y, n, v_cruzada, seed)
    print(puntuacion)
              
# Run the tuning on the column selection stored in 'df1_columns.pkl'.
# The commented lines that follow appear to be output pasted from earlier runs.
hyperparameter_tuning_general('df1_columns.pkl')
# Fitting 5 folds for each of 30 candidates, totalling 150 fits
# Los mejores parámetros son: {'solver': 'liblinear', 'penalty': 'l2', 'C': np.float64(1438.44988828766)} con una puntuación de 0.8836523703898143
# 0.8836523703898143
# Fitting 5 folds for each of 30 candidates, totalling 150 fits
# Los mejores parámetros son: {'tol': 0.001, 'max_iter': 100, 'loss': 'hinge', 'alpha': np.float64(1e-05)} con una puntuación de 0.8830713771651768
# 0.8705891820233645
# Fitting 5 folds for each of 30 candidates, totalling 150 fits
# c:\Users\\AppData\Local\Programs\Python\Python312\Lib\site-packages\numpy\ma\core.py:2846: RuntimeWarning: invalid value encountered in cast
#   _data = np.array(data, dtype=dtype, copy=copy,
# Los mejores parámetros son: {'n_estimators': np.int64(100), 'min_samples_split': np.int64(15), 'max_features': 'log2', 'max_depth': np.int64(15)} con una puntuación de 0.8822279091007633
# 0.8802700901684221
# Fitting 5 folds for each of 30 candidates, totalling 150 fits
# Los mejores parámetros son: {'min_samples_split': np.int64(18), 'max_depth': np.int64(22), 'criterion': 'gini'} con una puntuación de 0.8297647106742028
# 0.8292204390533506
# WARNING:tensorflow:From c:\Users\\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\backend\tensorflow\core.py:204: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

# TODO: fix the tuning -- it currently gives an advantage to whichever configuration converges faster.

# Fitting 5 folds for each of 20 candidates, totalling 100 fits
# c:\Users\\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_search.py:320: UserWarning: The total space of parameters 20 is smaller than n_iter=30. Running 20 iterations. For exhaustive searches, use GridSearchCV.
#   warnings.warn(
# Los mejores parámetros son: {'weights': 'uniform', 'n_neighbors': np.int64(13), 'metric': 'manhattan'} con una puntuación de 0.8381402494954733
# 0.8381402494954733

In [None]:
def hyperparameter_tuning_general(path, datos=None):
    """Tune and score every model on the column selection pickled at `path`.

    Parameters
    ----------
    path : str
        Pickle file whose content is used to index the frame's columns; the
        selection must include 'label', which is used as the target.
    datos : pandas.DataFrame, optional
        Frame to take the columns from.  Defaults to the module-level `df`;
        pass `df_over` to tune on that sample instead.

    Prints the score returned by each tuning function, in order.
    """
    origen = df if datos is None else datos
    df_rdx = origen[read_pickle(path)]
    X, y = df_rdx.drop('label', axis=1), df_rdx['label']
    for modelo in (regresionLogistica, SGDC, bosqueAleatorio, arbolDecision, redNeuronal, KNN):
        print(modelo(X, y, n, v_cruzada, seed))

In [None]:
# NOTE(review): hyperparameter_tuning_general indexes the global `df`; `df_over` is loaded
# but never used -- confirm whether the "_over" column sets should be applied to `df_over`.
hyperparameter_tuning_general('df1_over_columns.pkl')

In [None]:
# Tune on the column selection stored in 'df21_columns.pkl'.
hyperparameter_tuning_general('df21_columns.pkl')

In [None]:
# NOTE(review): hyperparameter_tuning_general indexes the global `df`; `df_over` is loaded
# but never used -- confirm whether the "_over" column sets should be applied to `df_over`.
hyperparameter_tuning_general('df21_over_columns.pkl')

In [None]:
# Tune on the column selection stored in 'df22_columns.pkl'.
hyperparameter_tuning_general('df22_columns.pkl')

In [None]:
# NOTE(review): hyperparameter_tuning_general indexes the global `df`; `df_over` is loaded
# but never used -- confirm whether the "_over" column sets should be applied to `df_over`.
hyperparameter_tuning_general('df22_over_columns.pkl')

In [None]:
# Tuning functions defined earlier in the notebook.  The neural-network entry
# must use the defined name `redNeuronal`; `red_neuronal` does not exist and
# raised a NameError.
modelos = (regresionLogistica, SGDC, bosqueAleatorio, arbolDecision, redNeuronal)
# Concatenated column selections; as the last expression it is shown as the cell output.
read_pickle('df3_columns.pkl') + read_pickle('df_NN_columns.pkl')


In [None]:
# read_pickle('df3_over_columns.pkl') + read_pickle('df_NN_over_columns.pkl')