In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import joblib
import os

# Cargar los datos
df = pd.read_csv('../../data/processed/JNJ_clean.csv')

# Crear las variables de entrenamiento y prueba, asegurándose de que los datos de entrenamiento sean hasta febrero de 2025
train_df = df[df['Date'] <= '2025-02-28']
test_df = df[df['Date'] >= '2025-03-01']

# Características (X) y objetivo (y)
X_train = train_df.drop(columns=['target', 'Close', 'Open', 'High','Low', 'Date'])
y_train = train_df['target']

X_test = test_df.drop(columns=['target', 'Close', 'Open', 'High','Low','Date'])
y_test = test_df['target']

# Función para entrenar y evaluar modelos
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, is_classification=True):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    if is_classification:
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{model_name} Accuracy: {accuracy:.4f}')
        print(f'Confusion Matrix for {model_name}:\n {confusion_matrix(y_test, y_pred)}')
        print(f'Classification Report for {model_name}:\n {classification_report(y_test, y_pred)}')
    else:
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        print(f'{model_name} RMSE: {rmse:.4f}')
        
    # Guardar el modelo entrenado
    joblib.dump(model, f'../../models/{model_name}.pkl')
    return model

# Ajuste de hiperparámetros y validación cruzada
def hyperparameter_tuning(model, X_train, y_train, param_grid, is_classification=True):
    if is_classification:
        scoring = 'accuracy'
    else:
        scoring = 'neg_mean_squared_error'  # Usamos MSE para regresión

    # Uso de validación cruzada para evaluar modelos con diferentes parámetros
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=5, cv=cv, random_state=42, n_jobs=-1, scoring=scoring)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

# Modelos base
models = {
    "DecisionTree": DecisionTreeClassifier(random_state=42, max_depth=5),  # Regularización añadida
    "RandomForest": RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100),  # Regularización añadida
    "AdaBoost": AdaBoostClassifier(random_state=42, n_estimators=100),  # Ajustado
    "GradientBoosting": GradientBoostingClassifier(random_state=42, learning_rate=0.1, n_estimators=100),  # Ajustado
    "LightGBM": lgb.LGBMClassifier(random_state=42, n_estimators=100, learning_rate=0.05, num_leaves=31),  
    "XGBoost": XGBClassifier(random_state=42, max_depth=5, learning_rate=0.05, n_estimators=100),  # Ajustado
    "CatBoost": CatBoostClassifier(random_state=42, iterations=500, learning_rate=0.1, depth=6, verbose=0)  
}

# Parámetros para la búsqueda de hiperparámetros
param_grids = {
    "DecisionTree": {"max_depth": [3, 5, 10, 20], "min_samples_split": [2, 10], "min_samples_leaf": [1, 5]},
    "RandomForest": {"n_estimators": [50, 100, 200], "max_depth": [5, 10, 20], "min_samples_split": [2, 10]},
    "AdaBoost": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1, 1]},
    "GradientBoosting": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1, 1]},
    "LightGBM": {"num_leaves": [31, 50], "learning_rate": [0.05, 0.1], "n_estimators": [50, 100]},
    "XGBoost": {"max_depth": [3, 5, 10], "learning_rate": [0.01, 0.1], "n_estimators": [100, 200]},
    "CatBoost": {"iterations": [500, 1000], "learning_rate": [0.01, 0.1], "depth": [6, 10, 12]}
}

# Entrenamiento y evaluación de cada modelo
best_model = None
best_score = -np.inf  # Para maximizar el score (por ejemplo, accuracy o Sharpe ratio)

for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Ajuste de hiperparámetros con validación cruzada
    model_tuned = hyperparameter_tuning(model, X_train, y_train, param_grids[model_name], is_classification=True)
    
    # Evaluar el modelo
    trained_model = train_and_evaluate(model_tuned, X_train, X_test, y_train, y_test, model_name, is_classification=True)
    
    # Comparar rendimiento
    score = accuracy_score(y_test, trained_model.predict(X_test))
    
    print(f"Test Accuracy for {model_name}: {score:.4f}")
    
    if score > best_score:
        best_score = score
        best_model = trained_model

print(f"Best Model: {best_model} with accuracy {best_score:.4f}")


Training DecisionTree...
Best parameters for DecisionTreeClassifier: {'min_samples_split': 2, 'min_samples_leaf': 5, 'max_depth': 20}
DecisionTree Accuracy: 0.3462
Confusion Matrix for DecisionTree:
 [[ 2 10]
 [ 7  7]]
Classification Report for DecisionTree:
               precision    recall  f1-score   support

           0       0.22      0.17      0.19        12
           1       0.41      0.50      0.45        14

    accuracy                           0.35        26
   macro avg       0.32      0.33      0.32        26
weighted avg       0.32      0.35      0.33        26

Test Accuracy for DecisionTree: 0.3462
Training RandomForest...
Best parameters for RandomForestClassifier: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 5}
RandomForest Accuracy: 0.5385
Confusion Matrix for RandomForest:
 [[ 3  9]
 [ 3 11]]
Classification Report for RandomForest:
               precision    recall  f1-score   support

           0       0.50      0.25      0.33        12
       



Best parameters for CatBoostClassifier: {'learning_rate': 0.01, 'iterations': 500, 'depth': 12}
CatBoost Accuracy: 0.4615
Confusion Matrix for CatBoost:
 [[7 5]
 [9 5]]
Classification Report for CatBoost:
               precision    recall  f1-score   support

           0       0.44      0.58      0.50        12
           1       0.50      0.36      0.42        14

    accuracy                           0.46        26
   macro avg       0.47      0.47      0.46        26
weighted avg       0.47      0.46      0.46        26

Test Accuracy for CatBoost: 0.4615
Best Model: LGBMClassifier(num_leaves=50, random_state=42) with accuracy 0.5769


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import joblib
import os

# Cargar los datos
df = pd.read_csv('../../data/processed/JNJ_clean.csv')


# Crear las variables de entrenamiento y prueba, asegurándose de que los datos de entrenamiento sean hasta febrero de 2025
train_df = df[df['Date'] <= '2025-02-28']
test_df = df[df['Date'] >= '2025-03-01']

# Características (X) y objetivo (y)
X_train = train_df.drop(columns=['target', 'Close', 'Open', 'Date'])
y_train = train_df['target']

X_test = test_df.drop(columns=['target', 'Close', 'Open', 'Date'])
y_test = test_df['target']


from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV

# Función para ajustar hiperparámetros
def hyperparameter_tuning(model, X_train, y_train, param_grid, is_classification=True):
    if is_classification:
        scoring = 'accuracy'
    else:
        scoring = 'neg_mean_squared_error'  # Usamos MSE para regresión

    # Uso de validación cruzada para evaluar modelos con diferentes parámetros
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=10, cv=cv, random_state=42, n_jobs=-1, scoring=scoring)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

# Modelo base con regularización añadida
models = {
    "DecisionTree": DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=10, min_samples_leaf=4),
    "RandomForest": RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100, min_samples_split=10),
    "AdaBoost": AdaBoostClassifier(random_state=42, n_estimators=100),
    "GradientBoosting": GradientBoostingClassifier(random_state=42, learning_rate=0.05, n_estimators=100, max_depth=5),
    "LightGBM": lgb.LGBMClassifier(random_state=42, n_estimators=100, learning_rate=0.05, num_leaves=31),
    "XGBoost": XGBClassifier(random_state=42, max_depth=5, learning_rate=0.05, n_estimators=100),
    "CatBoost": CatBoostClassifier(random_state=42, iterations=500, learning_rate=0.1, depth=6, verbose=0)
}

# Parámetros de búsqueda de hiperparámetros más ajustados
param_grids = {
    "DecisionTree": {"max_depth": [3, 5, 10], "min_samples_split": [10], "min_samples_leaf": [4]},
    "RandomForest": {"n_estimators": [100, 200], "max_depth": [5, 10], "min_samples_split": [10]},
    "AdaBoost": {"n_estimators": [50, 100], "learning_rate": [0.1, 0.05]},
    "GradientBoosting": {"n_estimators": [50, 100], "learning_rate": [0.05, 0.1], "max_depth": [3, 5]},
    "LightGBM": {"num_leaves": [31, 50], "learning_rate": [0.05], "n_estimators": [50, 100]},
    "XGBoost": {"max_depth": [3, 5], "learning_rate": [0.05], "n_estimators": [100]},
    "CatBoost": {"iterations": [500], "learning_rate": [0.05, 0.1], "depth": [6, 10]}
}

# Ajuste de hiperparámetros y evaluación
best_model = None
best_score = -np.inf  # Para maximizar el score (por ejemplo, accuracy)

for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Ajuste de hiperparámetros con validación cruzada
    model_tuned = hyperparameter_tuning(model, X_train, y_train, param_grids[model_name], is_classification=True)
    
    # Evaluar el modelo con validación cruzada
    trained_model = train_and_evaluate(model_tuned, X_train, y_train, X_test, y_test, model_name, is_classification=True)
    
    # Comparar rendimiento
    score = accuracy_score(y_test, trained_model.predict(X_test))
    
    print(f"Test Accuracy for {model_name}: {score:.4f}")
    
    if score > best_score:
        best_score = score
        best_model = trained_model

print(f"Best Model: {best_model} with accuracy {best_score:.4f}")


Training DecisionTree...




Best parameters for DecisionTreeClassifier: {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 10}


ValueError: Unknown label type: continuous-multioutput. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.