In [1]:
# librerías default
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from datetime import datetime
import time

# modelos

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb # type: ignore

# metricas

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# getión de train-test

from sklearn.model_selection import train_test_split

# transformaciones

from sklearn.preprocessing import MinMaxScaler

In [2]:
dataset = pd.read_csv('VehiculeData_Features.csv')
dataset.head()

Unnamed: 0,Model Year,Make,Model,Electric Range,Electric Vehicle Type
0,2019,4,66,153,0
1,2013,34,82,208,0
2,2018,34,82,249,0
3,2014,4,70,14,1
4,2018,34,83,238,0


In [3]:
dataset.columns

Index(['Model Year', 'Make', 'Model', 'Electric Range',
       'Electric Vehicle Type'],
      dtype='object')

## Selección de Target y Features

In [4]:
X = dataset.drop('Electric Vehicle Type', axis=1) # features = son las variables que me sirven para predecir
y = dataset['Electric Vehicle Type'] # target = lo que voy a predecir

In [5]:
X

Unnamed: 0,Model Year,Make,Model,Electric Range
0,2019,4,66,153
1,2013,34,82,208
2,2018,34,82,249
3,2014,4,70,14
4,2018,34,83,238
...,...,...,...,...
186466,2021,36,107,42
186467,2023,34,84,0
186468,2019,11,54,26
186469,2023,34,84,0


In [6]:
## split para entrenamiento y prueba

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state= 2025, shuffle=True)

In [7]:
## creamos y aplicamos escalado de datos

scaler = MinMaxScaler()

scaler.fit(X_train) # Calculamos el scaler

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. - Construcción de Modelos

#### a) Naive Bayes

In [8]:
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_scaled, y_train)

nb_predicts = nb_classifier.predict(X_test_scaled)

nb_classifier_acc = roc_auc_score(nb_predicts, y_test)
print('Roc_Auc_Score:', nb_classifier_acc)

Roc_Auc_Score: 0.8665487786410436


#### b) LDA

In [9]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_scaled, y_train)

lda_predicts = lda.predict(X_test_scaled)

lda_acc = roc_auc_score(y_test, lda_predicts)
print('LDA Roc_Auc_Score:', lda_acc)

LDA Roc_Auc_Score: 0.7267451538564371


#### c) Regresión Logística

In [10]:
logit = LogisticRegression()
logit.fit(X_train_scaled, y_train) # Entrenamos modelo

logit_predicts = logit.predict(X_test_scaled)

logit_acc = roc_auc_score(logit_predicts, y_test)
print('Roc_Auc_Score:', logit_acc)

Roc_Auc_Score: 0.8162519893520679


#### d) SVM

In [11]:
svm = SVC()
svm.fit(X_train_scaled, y_train)

svm_predicts = svm.predict(X_test_scaled)

svm_acc = roc_auc_score(y_test, svm_predicts)
print('SVM Roc_Auc_Score:', svm_acc)

SVM Roc_Auc_Score: 0.9978465531770074


#### e) Árboles de decisión

In [12]:
dt = DecisionTreeClassifier()
dt.fit(X_train_scaled, y_train)

dt_predicts = dt.predict(X_test_scaled)

dt_acc = roc_auc_score(y_test, dt_predicts)
print('Decision Tree Roc_Auc_Score:', dt_acc)

Decision Tree Roc_Auc_Score: 0.9999885646326959


#### f) RandomForest

In [13]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_scaled, y_train)

rf_predicts = rf_classifier.predict(X_test_scaled)

rf_acc = roc_auc_score(rf_predicts, y_test)
print('Roc_Auc_Score:', rf_acc)

Roc_Auc_Score: 0.9999590801211229


#### g) Análisis de discriminante cuadrático

In [14]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train_scaled, y_train)

qda_predicts = qda.predict(X_test_scaled)

qda_acc = roc_auc_score(y_test, qda_predicts)
print('QDA Roc_Auc_Score:', qda_acc)

QDA Roc_Auc_Score: 0.8724418639703364


#### h) AdaBoost 

In [15]:
ada = AdaBoostClassifier()
ada.fit(X_train_scaled, y_train)

ada_predicts = ada.predict(X_test_scaled)

ada_acc = roc_auc_score(y_test, ada_predicts)
print('AdaBoost Roc_Auc_Score:', ada_acc)

AdaBoost Roc_Auc_Score: 0.9999476414046717


#### i) Gradient Boosting 

In [16]:
gb = GradientBoostingClassifier()
gb.fit(X_train_scaled, y_train)

gb_predicts = gb.predict(X_test_scaled)

gb_acc = roc_auc_score(y_test, gb_predicts)
print('Gradient Boosting Roc_Auc_Score:', gb_acc)

Gradient Boosting Roc_Auc_Score: 0.9999885646326959


#### j) XGBoost 

In [17]:
xgboost = xgb.XGBClassifier()
xgboost.fit(X_train_scaled, y_train)

xgboost_predicts = xgboost.predict(X_test_scaled)

xgboost_acc = roc_auc_score(y_test, xgboost_predicts)
print('XGBoost Roc_Auc_Score:', xgboost_acc)

XGBoost Roc_Auc_Score: 0.9999885646326959


#### k) LGBM 

In [18]:
lgbm = lgb.LGBMClassifier()
lgbm.fit(X_train_scaled, y_train)

lgbm_predicts = lgbm.predict(X_test_scaled)

lgbm_acc = roc_auc_score(y_test, lgbm_predicts)
print('LGBM Roc_Auc_Score:', lgbm_acc)

[LightGBM] [Info] Number of positive: 28232, number of negative: 102297
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000381 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 291
[LightGBM] [Info] Number of data points in the train set: 130529, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216289 -> initscore=-1.287424
[LightGBM] [Info] Start training from score -1.287424
LGBM Roc_Auc_Score: 0.9999885646326959


# 2. - Optimización de hyper - parámetros

In [19]:
from sklearn.model_selection import GridSearchCV

#### b) LDA

In [20]:
# hyper-parámetros
lda_params = {'solver': ['svd', 'lsqr', 'eigen']}

# configuración de optimización de hyper-parámetros
lda_grid = GridSearchCV(lda, lda_params, scoring='roc_auc', cv=10)

# entrenamiento del modelo
lda_grid.fit(X_train_scaled, y_train)


print('LDA Best Score:', lda_grid.best_score_)
print('LDA Best Params:',lda_grid.best_params_)

# Resultados de optimización de hyper-parámetros

lda_results = pd.DataFrame(lda_grid.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
lda_results

LDA Best Score: 0.8605542006951407
LDA Best Params: {'solver': 'svd'}


Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
0,1,{'solver': 'svd'},0.860554,0.004474
1,1,{'solver': 'lsqr'},0.860554,0.004474
2,1,{'solver': 'eigen'},0.860554,0.004474


#### c) Regresión Logística

In [21]:
# hyper-parámetros
lr_params = {'C': [0.1, 1, 10, 100], 'penalty': ['l2'], 'solver': ['lbfgs', 'liblinear']}

# configuración de optimización de hyper-parámetros
lr_grid = GridSearchCV(logit, lr_params, scoring='roc_auc', cv=10)

# entrenamiento del modelo
lr_grid.fit(X_train_scaled, y_train)


print('Regresión Logística Best Score:', lr_grid.best_score_)
print('Regresión Logística Best Parameters:', lr_grid.best_params_)
lr_results = pd.DataFrame(lr_grid.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
lr_results

Regresión Logística Best Score: 0.8629295988957022
Regresión Logística Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}


Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
1,1,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.86293,0.004162
0,2,"{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}",0.862705,0.004043
2,3,"{'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}",0.862001,0.003974
3,4,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",0.86197,0.003983
7,5,"{'C': 100, 'penalty': 'l2', 'solver': 'libline...",0.861936,0.00394
4,6,"{'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}",0.861934,0.00394
5,7,"{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}",0.861931,0.003944
6,8,"{'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}",0.86193,0.003941


#### d) SVM

In [22]:
# No se realiza optimización para este modelo debido a los grandes tiempos que lleva ejecutar el codigo
# Estuvo corriendo por más de 30 min y no finalizaba

#### e) Árboles de decisión

In [23]:
# hyper-parámetros
dt_params = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}

# configuración de optimización de hyper-parámetros
dt_grid = GridSearchCV(dt, dt_params, scoring='roc_auc', cv=10)

# entrenamiento del modelo
dt_grid.fit(X_train_scaled, y_train)


print('Decision Tree Best Score:', dt_grid.best_score_)
print('Decision Tree Best Parameters:', dt_grid.best_params_)
dt_results = pd.DataFrame(dt_grid.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
dt_results

Decision Tree Best Score: 0.9999950431637943
Decision Tree Best Parameters: {'max_depth': None, 'min_samples_split': 2}


Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
0,1,"{'max_depth': None, 'min_samples_split': 2}",0.999995,1.5e-05
1,1,"{'max_depth': None, 'min_samples_split': 5}",0.999995,1.5e-05
2,1,"{'max_depth': None, 'min_samples_split': 10}",0.999995,1.5e-05
3,1,"{'max_depth': 10, 'min_samples_split': 2}",0.999995,1.5e-05
4,1,"{'max_depth': 10, 'min_samples_split': 5}",0.999995,1.5e-05
5,1,"{'max_depth': 10, 'min_samples_split': 10}",0.999995,1.5e-05
6,1,"{'max_depth': 20, 'min_samples_split': 2}",0.999995,1.5e-05
7,1,"{'max_depth': 20, 'min_samples_split': 5}",0.999995,1.5e-05
8,1,"{'max_depth': 20, 'min_samples_split': 10}",0.999995,1.5e-05
9,1,"{'max_depth': 30, 'min_samples_split': 2}",0.999995,1.5e-05


#### f) RandomForest

In [24]:
# hyper-parámetros
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]}

# configuración de optimización de hyper-parámetros
rf_grid = GridSearchCV(rf_classifier, rf_params, scoring='roc_auc', cv=10)

# entrenamiento del modelo
rf_grid.fit(X_train_scaled, y_train)


print('Random Forest Best Score:', rf_grid.best_score_)
print('Random Forest Best Parameters:', rf_grid.best_params_)
rf_results = pd.DataFrame(rf_grid.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
rf_results

Random Forest Best Score: 0.9999999446000831
Random Forest Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}


Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
3,1,"{'max_depth': None, 'min_samples_split': 5, 'n...",1.0,6.184528e-08
11,2,"{'max_depth': 20, 'min_samples_split': 5, 'n_e...",1.0,6.184528e-08
9,3,"{'max_depth': 20, 'min_samples_split': 2, 'n_e...",1.0,6.347199e-08
0,4,"{'max_depth': None, 'min_samples_split': 2, 'n...",1.0,6.533374e-08
1,4,"{'max_depth': None, 'min_samples_split': 2, 'n...",1.0,6.347199e-08
2,4,"{'max_depth': None, 'min_samples_split': 5, 'n...",1.0,6.347199e-08
10,7,"{'max_depth': 20, 'min_samples_split': 5, 'n_e...",1.0,6.394268e-08
5,8,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",1.0,6.69654e-08
4,9,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",1.0,1.06728e-07
8,10,"{'max_depth': 20, 'min_samples_split': 2, 'n_e...",1.0,8.821465e-08


#### g) Análisis de discriminante cuadrático

In [25]:
# hyper-parámetros
qda_params = {'reg_param': [0.0, 0.1, 0.2, 0.3]}

# configuración de optimización de hyper-parámetros
qda_grid = GridSearchCV(qda, qda_params, scoring='roc_auc', cv=10)

# entrenamiento del modelo
qda_grid.fit(X_train_scaled, y_train)


print('QDA Best Score:', qda_grid.best_score_)
print('QDA Best Parameters:', qda_grid.best_params_)
qda_results = pd.DataFrame(qda_grid.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
qda_results

QDA Best Score: 0.9667428214883016
QDA Best Parameters: {'reg_param': 0.0}


Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
0,1,{'reg_param': 0.0},0.966743,0.001803
1,2,{'reg_param': 0.1},0.885653,0.002375
2,3,{'reg_param': 0.2},0.86251,0.002438
3,4,{'reg_param': 0.3},0.843165,0.002681


#### h) AdaBoost 

In [26]:
# hyper-parámetros
ada_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.1, 0.5, 1.0]}

# configuración de optimización de hyper-parámetros
ada_grid = GridSearchCV(ada, ada_params, scoring='roc_auc', cv=10)

# entrenamiento del modelo
ada_grid.fit(X_train_scaled, y_train)


print('AdaBoost Best Score:', ada_grid.best_score_)
print('AdaBoost Best Parameters:', ada_grid.best_params_)
ada_results = pd.DataFrame(ada_grid.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
ada_results

AdaBoost Best Score: 0.999999944600083
AdaBoost Best Parameters: {'learning_rate': 1.0, 'n_estimators': 150}


Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
8,1,"{'learning_rate': 1.0, 'n_estimators': 150}",1.0,6.184528e-08
7,2,"{'learning_rate': 1.0, 'n_estimators': 100}",1.0,6.394268e-08
5,3,"{'learning_rate': 0.5, 'n_estimators': 150}",1.0,7.788984e-08
6,4,"{'learning_rate': 1.0, 'n_estimators': 50}",0.999999,1.590229e-06
4,5,"{'learning_rate': 0.5, 'n_estimators': 100}",0.999999,1.515772e-06
3,6,"{'learning_rate': 0.5, 'n_estimators': 50}",0.999996,2.473584e-06
2,7,"{'learning_rate': 0.1, 'n_estimators': 150}",0.999987,7.639055e-06
1,8,"{'learning_rate': 0.1, 'n_estimators': 100}",0.999971,1.413029e-05
0,9,"{'learning_rate': 0.1, 'n_estimators': 50}",0.999897,3.22458e-05


#### i) Gradient Boosting 

In [27]:
# hyper-parámetros
gb_params = {'n_estimators': [100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01]}

# configuración de optimización de hyper-parámetros
gb_grid = GridSearchCV(gb, gb_params, scoring='roc_auc', cv=10)

# entrenamiento del modelo
gb_grid.fit(X_train_scaled, y_train)


print('Gradient Boosting Best Score:', gb_grid.best_score_)
print('Gradient Boosting Best Parameters:', gb_grid.best_params_)
gb_results = pd.DataFrame(gb_grid.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
gb_results

Gradient Boosting Best Score: 0.9999995308118539
Gradient Boosting Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
1,1,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",1.0,1e-06
3,2,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.999999,2e-06
2,3,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.999997,9e-06
10,4,"{'learning_rate': 0.01, 'max_depth': 7, 'n_est...",0.999994,7e-06
0,5,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.999994,1.1e-05
11,6,"{'learning_rate': 0.01, 'max_depth': 7, 'n_est...",0.999993,1.3e-05
5,7,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.999992,2.4e-05
4,8,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.999991,2.6e-05
9,9,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.999977,1.4e-05
8,10,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.999937,1e-05


#### j) XGBoost 

In [28]:
# hyper-parámetros
xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01]}

# configuración de optimización de hyper-parámetros
xgb_grid = GridSearchCV(xgboost, xgb_params, scoring='roc_auc', cv=10)

# entrenamiento del modelo
xgb_grid.fit(X_train_scaled, y_train)


print('XGBoost Best Score:', xgb_grid.best_score_)
print('XGBoost Best Parameters:', xgb_grid.best_params_)
xgb_results = pd.DataFrame(xgb_grid.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
xgb_results

XGBoost Best Score: 0.999999501381217
XGBoost Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
1,1,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",1.0,1e-06
2,2,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.999999,2e-06
3,3,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.999999,3e-06
4,4,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.999999,3e-06
5,5,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.999999,3e-06
0,6,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.999995,5e-06
11,7,"{'learning_rate': 0.01, 'max_depth': 7, 'n_est...",0.999992,1.9e-05
9,8,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.999991,1.9e-05
10,9,"{'learning_rate': 0.01, 'max_depth': 7, 'n_est...",0.999991,1.8e-05
8,10,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.999983,1.8e-05


#### k) LGBM 

In [29]:
# hyper-parámetros
lgbm_params = {'num_leaves': [31, 40], 'learning_rate': [0.1, 0.01], 'n_estimators': [100, 200]}

# configuración de optimización de hyper-parámetros
lgbm_grid = GridSearchCV(lgbm, lgbm_params, scoring='roc_auc', cv=10)

# entrenamiento del modelo
lgbm_grid.fit(X_train_scaled, y_train)


print('LGBM Best Score:', lgbm_grid.best_score_)
print('LGBM Best Parameters:', lgbm_grid.best_params_)
lgbm_results = pd.DataFrame(lgbm_grid.cv_results_).sort_values("rank_test_score", ascending=True)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']]
lgbm_results

[LightGBM] [Info] Number of positive: 25409, number of negative: 92067
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 290
[LightGBM] [Info] Number of data points in the train set: 117476, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216291 -> initscore=-1.287413
[LightGBM] [Info] Start training from score -1.287413
[LightGBM] [Info] Number of positive: 25409, number of negative: 92067
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 288
[LightGBM] [Info] Number of data points in the train set: 117476, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216291 -> initscore=-1.287413
[Lig

Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
4,1,"{'learning_rate': 0.01, 'n_estimators': 100, '...",1.0,1.075141e-07
5,2,"{'learning_rate': 0.01, 'n_estimators': 100, '...",1.0,1.075141e-07
6,3,"{'learning_rate': 0.01, 'n_estimators': 200, '...",1.0,3.296855e-07
7,3,"{'learning_rate': 0.01, 'n_estimators': 200, '...",1.0,3.296855e-07
0,5,"{'learning_rate': 0.1, 'n_estimators': 100, 'n...",1.0,9.701968e-07
1,5,"{'learning_rate': 0.1, 'n_estimators': 100, 'n...",1.0,9.701968e-07
3,7,"{'learning_rate': 0.1, 'n_estimators': 200, 'n...",0.999999,2.641538e-06
2,8,"{'learning_rate': 0.1, 'n_estimators': 200, 'n...",0.999999,2.641538e-06


# 3. - Registro de los modelos

In [30]:
# Inicialización de modelos con hiperparámetros específicos
modelos = {
    'LDA': LinearDiscriminantAnalysis(solver='svd'),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(max_depth=None, min_samples_split=2),
    'QDA': QuadraticDiscriminantAnalysis(reg_param=0),
    'AdaBoost': AdaBoostClassifier(n_estimators=150, learning_rate=1.0),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1),
    'XGBoost': xgb.XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1),
    'LGBM': lgb.LGBMClassifier(num_leaves=31, learning_rate=0.1, n_estimators=100),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(C=0.1, penalty='l2', solver='liblinear'),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=2)
}

# Entrenamiento y evaluación de cada modelo
resultados = []

for name, model in modelos.items():
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = datetime.now()
    elapsed_time = time.time() - start_time
    predictions = model.predict(X_test)
    score = roc_auc_score(y_test, predictions)
    hyperparameters = model.get_params()
    resultados.append({
        'Model': name,
        'Hyperparameters': hyperparameters,
        'ROC AUC Score': score,
        'Training Time (s)': elapsed_time,
        'End Time': end_time
    })

# Creación del DataFrame y ordenamiento
resultados_modelos = pd.DataFrame(resultados)
resultados_modelos = resultados_modelos.sort_values(by='ROC AUC Score', ascending=False)

# Mostrar los resultados ordenados
resultados_modelos

# Generar archivo excel con los resultados
resultados_modelos.to_excel('resultados_modelos.xlsx', index=True)

[LightGBM] [Info] Number of positive: 28232, number of negative: 102297
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 291
[LightGBM] [Info] Number of data points in the train set: 130529, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216289 -> initscore=-1.287424
[LightGBM] [Info] Start training from score -1.287424


In [31]:
resultados_modelos

Unnamed: 0,Model,Hyperparameters,ROC AUC Score,Training Time (s),End Time
2,Decision Tree,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.999989,0.048853,2024-06-30 17:16:45.498383
4,AdaBoost,"{'algorithm': 'SAMME.R', 'base_estimator': 'de...",0.999989,5.223813,2024-06-30 17:16:50.758696
5,Gradient Boosting,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.999989,6.432497,2024-06-30 17:16:57.748430
6,XGBoost,"{'objective': 'binary:logistic', 'base_score':...",0.999989,0.185863,2024-06-30 17:16:58.049386
7,LGBM,"{'boosting_type': 'gbdt', 'class_weight': None...",0.999989,0.144693,2024-06-30 17:16:58.220392
10,Random Forest,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999989,2.324382,2024-06-30 17:17:00.756732
3,QDA,"{'priors': None, 'reg_param': 0, 'store_covari...",0.872442,0.015363,2024-06-30 17:16:45.523294
8,Naive Bayes,"{'priors': None, 'var_smoothing': 1e-09}",0.847359,0.022678,2024-06-30 17:16:58.270293
1,SVM,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.732823,483.230079,2024-06-30 17:14:09.115874
0,LDA,"{'covariance_estimator': None, 'n_components':...",0.726745,0.056631,2024-06-30 17:06:05.873620
