In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import joblib
import os

In [None]:
# Cargar datos procesados
df = pd.read_csv("../../data/processed/SPLV_clean.csv", parse_dates=["Date"])

In [45]:
# Dividir en entrenamiento y prueba por fecha
train_df = df[df['Date'] <= '2025-02-28']
test_df = df[df['Date'] >= '2025-03-01']

In [46]:
# Features y target
X_train = train_df.drop(columns=['target', 'Close', 'Open', 'High', 'Low', 'Date'])
y_train = train_df['target']
X_test = test_df.drop(columns=['target', 'Close', 'Open', 'High', 'Low', 'Date'])
y_test = test_df['target']

In [47]:
from collections import Counter

# Calcular ratio de clases
class_counts = Counter(y_train)
scale_pos_weight = class_counts[0] / class_counts[1]
print("scale_pos_weight para XGBoost:", scale_pos_weight)

scale_pos_weight para XGBoost: 0.8726937269372693


In [48]:
# Crear carpeta si no existe
os.makedirs("../../models", exist_ok=True)

# Modelos base
models = {
    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        max_depth=5,
        class_weight='balanced'
    ),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        max_depth=10,
        n_estimators=100,
        class_weight='balanced'
    ),
    "AdaBoost": AdaBoostClassifier(
        random_state=42,
        n_estimators=100
    ),
    "GradientBoosting": GradientBoostingClassifier(
        random_state=42,
        learning_rate=0.05,
        n_estimators=100
    ),
    "LightGBM": lgb.LGBMClassifier(
    random_state=42,
    learning_rate=0.03,            # Más bajo para evitar sobreajuste
    n_estimators=200,              # Más árboles = mejor generalización
    num_leaves=15,                 # Reduce complejidad
    min_child_samples=10,          # Previene overfitting
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    importance_type='gain'
),
    "XGBoost": XGBClassifier(
        random_state=42,
        max_depth=5,
        learning_rate=0.05,
        n_estimators=150,
        scale_pos_weight=scale_pos_weight * 0.5,  # ⚠️ suaviza penalización
        use_label_encoder=False,
        eval_metric='aucpr',  # mejor para clases desbalanceadas
        subsample=0.8,
        colsample_bytree=0.8
)
}

# Función para entrenar y evaluar
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, threshold=0.55):
    model.fit(X_train, y_train)

    # Predicción con threshold si existe predict_proba
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        y_pred = (y_proba > threshold).astype(int)
    else:
        y_pred = model.predict(X_test)

    # Evaluaciones
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro')
    rec = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"\n===== {model_name} (Threshold: {threshold}) =====")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision (macro): {prec:.4f}")
    print(f"Recall (macro): {rec:.4f}")
    print(f"F1 Score (macro): {f1:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Mostrar importancia de variables si el modelo la tiene
    if hasattr(model, 'feature_importances_'):
        print("Feature Importances:")
        for name, val in zip(X_train.columns, model.feature_importances_):
            print(f"{name}: {val:.4f}")

    # Guardar modelo
    joblib.dump(model, f"../../models/SPLV_{model_name}.pkl")

In [49]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'learning_rate': [0.01, 0.03, 0.05],
    'n_estimators': [150, 200],
    'num_leaves': [5, 10],
    'min_child_samples': [15, 20],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

search = RandomizedSearchCV(
    estimator=lgb.LGBMClassifier(random_state=42, class_weight='balanced'),
    param_distributions=param_grid,
    n_iter=10,
    scoring='f1_macro',
    cv=3,
    verbose=1,
    random_state=42
)

search.fit(X_train, y_train)
best_model = search.best_estimator_
models["LightGBM"] = best_model  # Reemplaza el modelo por el mejor encontrado

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Number of positive: 361, number of negative: 315
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000337 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5647
[LightGBM] [Info] Number of data points in the train set: 676, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 361, number of negative: 316
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5652
[LightGBM] [Info] Number of data points in the train set: 677, number of used features: 33
[LightGBM] [Info] [binary:BoostFr

In [50]:
# Entrenamiento de todos los modelos
for model_name, model in models.items():
    train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, threshold=0.55)


===== DecisionTree (Threshold: 0.55) =====
Accuracy: 0.6429
Precision (macro): 0.6257
Recall (macro): 0.6257
F1 Score (macro): 0.6257
Confusion Matrix:
[[12  5]
 [ 5  6]]
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.71      0.71        17
           1       0.55      0.55      0.55        11

    accuracy                           0.64        28
   macro avg       0.63      0.63      0.63        28
weighted avg       0.64      0.64      0.64        28

Feature Importances:
day_of_week: 0.0429
month: 0.0000
is_month_end: 0.0000
price_diff: 0.0000
pct_diff: 0.0000
return_daily: 0.0557
return_lag_1: 0.0000
return_lag_2: 0.0000
return_lag_3: 0.0215
return_lag_4: 0.0000
return_lag_5: 0.0517
sma_5: 0.1026
sma_10: 0.0000
rolling_std_return_5: 0.0629
RSI_5: 0.0985
MACD: 0.0000
MACD_signal: 0.0000
bb_middle: 0.0000
bb_upper: 0.0000
bb_lower: 0.0000
volume_outlier: 0.0000
price_above_SMA50: 0.0000
RSI_overbought: 0.0000
MACD_above_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



===== AdaBoost (Threshold: 0.55) =====
Accuracy: 0.6071
Precision (macro): 0.3036
Recall (macro): 0.5000
F1 Score (macro): 0.3778
Confusion Matrix:
[[17  0]
 [11  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        17
           1       0.00      0.00      0.00        11

    accuracy                           0.61        28
   macro avg       0.30      0.50      0.38        28
weighted avg       0.37      0.61      0.46        28

Feature Importances:
day_of_week: 0.0000
month: 0.0000
is_month_end: 0.0000
price_diff: 0.0000
pct_diff: 0.0000
return_daily: 0.1083
return_lag_1: 0.0000
return_lag_2: 0.0000
return_lag_3: 0.0000
return_lag_4: 0.0000
return_lag_5: 0.0000
sma_5: 0.0000
sma_10: 0.3515
rolling_std_return_5: 0.0364
RSI_5: 0.0000
MACD: 0.0900
MACD_signal: 0.0193
bb_middle: 0.0000
bb_upper: 0.0000
bb_lower: 0.0000
volume_outlier: 0.0000
price_above_SMA50: 0.0000
RSI_overbought: 0.0000
MACD_above_sign

Parameters: { "use_label_encoder" } are not used.




===== XGBoost (Threshold: 0.55) =====
Accuracy: 0.6786
Precision (macro): 0.7083
Recall (macro): 0.6070
F1 Score (macro): 0.5902
Confusion Matrix:
[[16  1]
 [ 8  3]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.94      0.78        17
           1       0.75      0.27      0.40        11

    accuracy                           0.68        28
   macro avg       0.71      0.61      0.59        28
weighted avg       0.70      0.68      0.63        28

Feature Importances:
day_of_week: 0.0263
month: 0.0271
is_month_end: 0.0445
price_diff: 0.0281
pct_diff: 0.0292
return_daily: 0.0278
return_lag_1: 0.0300
return_lag_2: 0.0300
return_lag_3: 0.0298
return_lag_4: 0.0270
return_lag_5: 0.0316
sma_5: 0.0237
sma_10: 0.0271
rolling_std_return_5: 0.0319
RSI_5: 0.0321
MACD: 0.0327
MACD_signal: 0.0273
bb_middle: 0.0300
bb_upper: 0.0297
bb_lower: 0.0320
volume_outlier: 0.0166
price_above_SMA50: 0.0321
RSI_overbought: 0.0506
MACD_above_signa