In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import joblib
import os

# Cargar los datos
df = pd.read_csv('../../data/processed/JNJ_clean.csv')

# Crear las variables de entrenamiento y prueba, asegurándose de que los datos de entrenamiento sean hasta febrero de 2025
train_df = df[df['Date'] <= '2025-02-28']
test_df = df[df['Date'] >= '2025-03-01']

# Características (X) y objetivo (y)
X_train = train_df.drop(columns=['target', 'Close', 'Open', 'High','Low', 'Date'])
y_train = train_df['target']

X_test = test_df.drop(columns=['target', 'Close', 'Open', 'High','Low','Date'])
y_test = test_df['target']



In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np
import joblib
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Function to train and evaluate models
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, is_classification=True):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    if is_classification:
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        
        print(f'{model_name} Accuracy: {accuracy:.4f}')
        print(f'{model_name} Precision (Macro): {precision:.4f}')
        print(f'{model_name} Recall (Macro): {recall:.4f}')
        print(f'Confusion Matrix for {model_name}:\n {confusion_matrix(y_test, y_pred)}')
        print(f'Classification Report for {model_name}:\n {classification_report(y_test, y_pred)}')
        
        # Feature importance
        if hasattr(model, 'feature_importances_'):
            feature_importances = model.feature_importances_
            feature_names = X_train.columns  # Assuming X_train is a DataFrame with feature names
            print(f'Feature Importance for {model_name}:')
            for feature, importance in zip(feature_names, feature_importances):
                print(f'{feature}: {importance:.4f}')
        else:
            print(f'{model_name} does not have feature importance available.')
        
    else:
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        print(f'{model_name} RMSE: {rmse:.4f}')
        
    # Save the trained model
    joblib.dump(model, f'../../models/{model_name}.pkl')
    return model

# Hyperparameter tuning and cross-validation
def hyperparameter_tuning(model, X_train, y_train, param_grid, is_classification=True):
    if is_classification:
        scoring = 'accuracy'
    else:
        scoring = 'neg_mean_squared_error'  # Use MSE for regression

    # Cross-validation to evaluate models with different parameters
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=5, cv=cv, random_state=42, n_jobs=-1, scoring=scoring)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

# Base models
models = {
    "DecisionTree": DecisionTreeClassifier(random_state=42, max_depth=5),
    "RandomForest": RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100),
    "AdaBoost": AdaBoostClassifier(random_state=42, n_estimators=100),
    "GradientBoosting": GradientBoostingClassifier(random_state=42, learning_rate=0.1, n_estimators=100),
    "LightGBM": lgb.LGBMClassifier(random_state=42, n_estimators=100, learning_rate=0.05, num_leaves=31),
    "XGBoost": XGBClassifier(random_state=42, max_depth=5, learning_rate=0.05, n_estimators=100),
    "CatBoost": CatBoostClassifier(random_state=42, iterations=500, learning_rate=0.1, depth=6, verbose=0)
}

# Hyperparameter grids for tuning
param_grids = {
    "DecisionTree": {"max_depth": [3, 5, 10, 20], "min_samples_split": [2, 10], "min_samples_leaf": [1, 5]},
    "RandomForest": {"n_estimators": [50, 100, 200], "max_depth": [5, 10, 20], "min_samples_split": [2, 10]},
    "AdaBoost": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1, 1]},
    "GradientBoosting": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1, 1]},
    "LightGBM": {"num_leaves": [31, 50], "learning_rate": [0.05, 0.1], "n_estimators": [50, 100]},
    "XGBoost": {"max_depth": [3, 5, 10], "learning_rate": [0.01, 0.1], "n_estimators": [100, 200]},
    "CatBoost": {"iterations": [500, 1000], "learning_rate": [0.01, 0.1], "depth": [6, 10, 12]}
}

# Training and evaluation of each model
best_model = None
best_score = -np.inf  # To maximize the score (e.g., accuracy or Sharpe ratio)

for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Hyperparameter tuning with cross-validation
    model_tuned = hyperparameter_tuning(model, X_train, y_train, param_grids[model_name], is_classification=True)
    
    # Evaluate the model
    trained_model = train_and_evaluate(model_tuned, X_train, X_test, y_train, y_test, model_name, is_classification=True)
    
    # Compare performance
    score = accuracy_score(y_test, trained_model.predict(X_test))
    
    print(f"Test Accuracy for {model_name}: {score:.4f}")
    
    if score > best_score:
        best_score = score
        best_model = trained_model

print(f"Best Model: {best_model} with accuracy {best_score:.4f}")


Training DecisionTree...
Best parameters for DecisionTreeClassifier: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 5}
DecisionTree Accuracy: 0.5185
DecisionTree Precision (Macro): 0.5278
DecisionTree Recall (Macro): 0.5247
Confusion Matrix for DecisionTree:
 [[9 4]
 [9 5]]
Classification Report for DecisionTree:
               precision    recall  f1-score   support

           0       0.50      0.69      0.58        13
           1       0.56      0.36      0.43        14

    accuracy                           0.52        27
   macro avg       0.53      0.52      0.51        27
weighted avg       0.53      0.52      0.51        27

Feature Importance for DecisionTree:
log_vol: 0.0907
year: 0.0000
month: 0.0568
day: 0.0463
day_of_week: 0.0490
is_month_end: 0.0000
price_diff: 0.0416
pct_diff: 0.0556
return_daily: 0.0000
return_lag_1: 0.0729
return_lag_2: 0.0677
return_lag_3: 0.0820
return_lag_4: 0.0000
return_lag_5: 0.0392
sma_5: 0.0000
rolling_std_return_5: 0.0425
RSI_



Best parameters for CatBoostClassifier: {'learning_rate': 0.01, 'iterations': 500, 'depth': 6}
CatBoost Accuracy: 0.3704
CatBoost Precision (Macro): 0.3709
CatBoost Recall (Macro): 0.3709
Confusion Matrix for CatBoost:
 [[5 8]
 [9 5]]
Classification Report for CatBoost:
               precision    recall  f1-score   support

           0       0.36      0.38      0.37        13
           1       0.38      0.36      0.37        14

    accuracy                           0.37        27
   macro avg       0.37      0.37      0.37        27
weighted avg       0.37      0.37      0.37        27

Feature Importance for CatBoost:
log_vol: 5.1804
year: 1.9507
month: 4.3276
day: 3.6005
day_of_week: 4.6322
is_month_end: 0.1524
price_diff: 3.2883
pct_diff: 3.2674
return_daily: 3.8524
return_lag_1: 4.7284
return_lag_2: 5.2827
return_lag_3: 6.7112
return_lag_4: 3.8587
return_lag_5: 3.6280
sma_5: 3.7992
rolling_std_return_5: 6.3288
RSI_5: 4.5226
MACD: 4.4006
MACD_signal: 4.3861
bb_middle: 3.9167
bb

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import numpy as np
import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel

# Function to perform feature selection (using SelectKBest and/or RFE)
def feature_selection(X_train, y_train, model_name, model, num_features=None):
    print(f"Selecting features for {model_name}...")

    if model_name in ['RandomForest', 'XGBoost', 'LightGBM', 'CatBoost']:  # For models that support feature importance
        # Use SelectFromModel based on feature importance
        model.fit(X_train, y_train)
        selector = SelectFromModel(model, threshold="mean", max_features=num_features)
        X_train_selected = selector.transform(X_train)
        print(f"Selected {X_train_selected.shape[1]} features for {model_name} based on feature importance.")
    else:
        # Use SelectKBest or RFE for other models
        if num_features is None:
            num_features = X_train.shape[1] // 2  # Select half of the features as a default
        selector = SelectKBest(f_classif, k=num_features)
        X_train_selected = selector.fit_transform(X_train, y_train)
        print(f"Selected {X_train_selected.shape[1]} features for {model_name} using SelectKBest.")
        
    return X_train_selected, selector

# Function to train and evaluate models
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, is_classification=True):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    if is_classification:
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        
        print(f'{model_name} Accuracy: {accuracy:.4f}')
        print(f'{model_name} Precision (Macro): {precision:.4f}')
        print(f'{model_name} Recall (Macro): {recall:.4f}')
        print(f'Confusion Matrix for {model_name}:\n {confusion_matrix(y_test, y_pred)}')
        print(f'Classification Report for {model_name}:\n {classification_report(y_test, y_pred)}')
        
        # Feature importance
        if hasattr(model, 'feature_importances_'):
            feature_importances = model.feature_importances_
            feature_names = X_train.columns  # Assuming X_train is a DataFrame with feature names
            print(f'Feature Importance for {model_name}:')
            for feature, importance in zip(feature_names, feature_importances):
                print(f'{feature}: {importance:.4f}')
        else:
            print(f'{model_name} does not have feature importance available.')
        
    else:
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        print(f'{model_name} RMSE: {rmse:.4f}')
        
    # Save the trained model
    joblib.dump(model, f'../../models/{model_name}.pkl')
    return model

# Hyperparameter tuning and cross-validation
def hyperparameter_tuning(model, X_train, y_train, param_grid, is_classification=True):
    if is_classification:
        scoring = 'accuracy'
    else:
        scoring = 'neg_mean_squared_error'  # Use MSE for regression

    # Cross-validation to evaluate models with different parameters
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    grid_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=20, cv=cv, random_state=42, n_jobs=-1, scoring=scoring)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

# Base models
models = {
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0)
}

# Hyperparameter grids for tuning
param_grids = {
    "DecisionTree": {"max_depth": [3, 5, 10, 20, None], "min_samples_split": [2, 5, 10], "min_samples_leaf": [1, 2, 5], "max_features": ['auto', 'sqrt', 'log2']},
    "RandomForest": {"n_estimators": [50, 100, 200, 300, 500], "max_depth": [5, 10, 20, 30, None], "min_samples_split": [2, 5, 10], "min_samples_leaf": [1, 2, 4, 10], "max_features": ['auto', 'sqrt', 'log2'], "bootstrap": [True, False]},
    "AdaBoost": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 1], "base_estimator": [DecisionTreeClassifier(max_depth=3), DecisionTreeClassifier(max_depth=5)]},
    "GradientBoosting": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.5], "max_depth": [3, 5, 7], "min_samples_split": [2, 5, 10], "min_samples_leaf": [1, 2], "subsample": [0.8, 1.0], "max_features": ['auto', 'sqrt', 'log2']},
    "LightGBM": {"num_leaves": [31, 50, 100], "learning_rate": [0.01, 0.05, 0.1], "n_estimators": [50, 100, 200], "max_depth": [5, 10, 15], "boosting_type": ['gbdt', 'dart'], "max_bin": [255, 500]},
    "XGBoost": {"max_depth": [3, 5, 7, 10], "learning_rate": [0.01, 0.05, 0.1], "n_estimators": [50, 100, 200], "subsample": [0.8, 1.0], "colsample_bytree": [0.8, 1.0], "gamma": [0, 0.1, 0.2]},
    "CatBoost": {"iterations": [500, 1000], "learning_rate": [0.01, 0.05, 0.1], "depth": [6, 8, 10], "bagging_temperature": [0, 1, 5], "leaf_estimation_iterations": [1, 5, 10]}
}

# Training and evaluation of each model
best_model = None
best_score = -np.inf  # To maximize the score (e.g., accuracy or Sharpe ratio)

for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Feature selection (automatically choosing the most relevant features for each model)
    X_train_selected, feature_selector = feature_selection(X_train, y_train, model_name, model, num_features=10)
    
    # Hyperparameter tuning with cross-validation
    model_tuned = hyperparameter_tuning(model, X_train_selected, y_train, param_grids[model_name], is_classification=True)
    
    # Evaluate the model
    trained_model = train_and_evaluate(model_tuned, X_train_selected, X_test, y_train, y_test, model_name, is_classification=True)
    
    # Compare performance
    score = accuracy_score(y_test, trained_model.predict(X_test))
    
    print(f"Test Accuracy for {model_name}: {score:.4f}")
    
    if score > best_score:
        best_score = score
        best_model = trained_model

print(f"Best Model: {best_model} with accuracy {best_score:.4f}")


Training DecisionTree...
Selecting features for DecisionTree...
Selected 10 features for DecisionTree using SelectKBest.


70 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/bolsa_ml/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/bolsa_ml/lib/python3.10/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/envs/bolsa_ml/lib/python3.10/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/envs/bolsa_ml/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 98,

Best parameters for DecisionTreeClassifier: {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 3}


ValueError: X has 28 features, but DecisionTreeClassifier is expecting 10 features as input.