In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import joblib
import os

# Load the processed JNJ dataset.
df = pd.read_csv('../../data/processed/JNJ_clean.csv')

# Chronological hold-out split around a single cutoff: rows strictly before
# TEST_START are used for training, rows on or after it for testing.
# One boundary (instead of `<= '2025-02-28'` plus `>= '2025-03-01'`) ensures a
# value such as '2025-02-28 00:00:00', which sorts between those two bounds
# when compared as text, cannot silently drop out of both sets.
# NOTE(review): 'Date' is not parsed on load, so this presumably compares
# ISO-8601 strings lexicographically - confirm the column format.
TEST_START = '2025-03-01'
train_df = df[df['Date'] < TEST_START]
test_df = df[df['Date'] >= TEST_START]

# Columns removed from the feature matrix: the label, the four price columns
# and the date itself.
NON_FEATURE_COLUMNS = ['target', 'Close', 'Open', 'High', 'Low', 'Date']

# Features (X) and target (y)
X_train = train_df.drop(columns=NON_FEATURE_COLUMNS)
y_train = train_df['target']

X_test = test_df.drop(columns=NON_FEATURE_COLUMNS)
y_test = test_df['target']

In [15]:
#Modelo general
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np
import joblib
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Function to train and evaluate models
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, is_classification=True):
    """Fit ``model``, print hold-out metrics and persist the fitted estimator.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit`` and ``predict``.
    X_train, y_train :
        Data passed to ``model.fit``.
    X_test, y_test :
        Data the printed metrics are computed on.
    model_name : str
        Label used in the printed report and as the stem of the pickle file
        written to ``../../models/``.
    is_classification : bool, default True
        When True, prints accuracy, macro precision/recall, the confusion
        matrix, the classification report and (if the estimator exposes
        ``feature_importances_``) one importance per feature. When False,
        prints the RMSE instead.

    Returns
    -------
    object
        The same ``model`` instance, after fitting.
    """
    import os

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if is_classification:
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')

        print(f'{model_name} Accuracy: {accuracy:.4f}')
        print(f'{model_name} Precision (Macro): {precision:.4f}')
        print(f'{model_name} Recall (Macro): {recall:.4f}')
        print(f'Confusion Matrix for {model_name}:\n {confusion_matrix(y_test, y_pred)}')
        print(f'Classification Report for {model_name}:\n {classification_report(y_test, y_pred)}')

        # Feature importance
        if hasattr(model, 'feature_importances_'):
            feature_importances = model.feature_importances_
            # Use the DataFrame column names when available; fall back to
            # positional labels so array inputs do not raise AttributeError.
            feature_names = getattr(X_train, 'columns', None)
            if feature_names is None:
                feature_names = [f'feature_{i}' for i in range(len(feature_importances))]
            print(f'Feature Importance for {model_name}:')
            for feature, importance in zip(feature_names, feature_importances):
                print(f'{feature}: {importance:.4f}')
        else:
            print(f'{model_name} does not have feature importance available.')

    else:
        # Imported here because this cell's own import list does not bring in
        # mean_squared_error; without it the regression path depends on
        # another cell having been executed first.
        from sklearn.metrics import mean_squared_error

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        print(f'{model_name} RMSE: {rmse:.4f}')

    # Save the trained model, creating the target directory on first use so
    # joblib.dump does not fail on a fresh checkout.
    model_dir = '../../models'
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, f'{model_dir}/{model_name}.pkl')
    return model

# Hyperparameter tuning and cross-validation
def hyperparameter_tuning(model, X_train, y_train, param_grid, is_classification=True, n_iter=5, cv=None):
    """Run a randomized hyperparameter search and return the best estimator.

    Parameters
    ----------
    model : object
        Estimator handed to ``RandomizedSearchCV``.
    X_train, y_train :
        Data the search is fitted on.
    param_grid : dict
        Passed as ``param_distributions`` to the search.
    is_classification : bool, default True
        Selects the scoring metric ('accuracy' vs 'neg_mean_squared_error')
        and the default splitter.
    n_iter : int, default 5
        Number of parameter settings sampled by the search.
    cv : cross-validation splitter, optional
        Overrides the default 5-fold shuffled splitter.
        NOTE(review): the notebook splits train/test by date, so the rows look
        time-ordered; shuffled folds may let later rows inform earlier ones.
        Consider passing ``TimeSeriesSplit`` here - confirm with the author.

    Returns
    -------
    object
        ``best_estimator_`` of the fitted search.
    """
    if is_classification:
        scoring = 'accuracy'
    else:
        scoring = 'neg_mean_squared_error'  # Use MSE for regression

    if cv is None:
        if is_classification:
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        else:
            # StratifiedKFold only accepts binary/multiclass targets, so the
            # regression path needs a plain KFold to be usable at all.
            from sklearn.model_selection import KFold

            cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Cross-validation to evaluate models with different parameters
    grid_search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=cv,
        random_state=42,
        n_jobs=-1,
        scoring=scoring,
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

# Baseline estimators, keyed by the name used when iterating over them.
# Every estimator is seeded with 42.
models = dict(
    DecisionTree=DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForest=RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
    ),
    AdaBoost=AdaBoostClassifier(n_estimators=100, random_state=42),
    GradientBoosting=GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
    ),
    LightGBM=lgb.LGBMClassifier(
        n_estimators=100,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
    ),
    XGBoost=XGBClassifier(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=5,
        random_state=42,
    ),
    CatBoost=CatBoostClassifier(
        iterations=500,
        learning_rate=0.1,
        depth=6,
        verbose=0,
        random_state=42,
    ),
)

# Candidate hyperparameter values per model; the keys mirror those of `models`.
param_grids = {
    "DecisionTree": dict(
        max_depth=[3, 5, 10, 20],
        min_samples_split=[2, 10],
        min_samples_leaf=[1, 5],
    ),
    "RandomForest": dict(
        n_estimators=[50, 100, 200],
        max_depth=[5, 10, 20],
        min_samples_split=[2, 10],
    ),
    "AdaBoost": dict(
        n_estimators=[50, 100],
        learning_rate=[0.01, 0.1, 1],
    ),
    "GradientBoosting": dict(
        n_estimators=[50, 100],
        learning_rate=[0.01, 0.1, 1],
    ),
    "LightGBM": dict(
        num_leaves=[31, 50],
        learning_rate=[0.05, 0.1],
        n_estimators=[50, 100],
    ),
    "XGBoost": dict(
        max_depth=[3, 5, 10],
        learning_rate=[0.01, 0.1],
        n_estimators=[100, 200],
    ),
    "CatBoost": dict(
        iterations=[500, 1000],
        learning_rate=[0.01, 0.1],
        depth=[6, 10, 12],
    ),
}

# Training and evaluation of each model.
# NOTE(review): the winner is chosen by accuracy on (X_test, y_test), the same
# data the metrics are reported on, so the reported best accuracy is likely
# optimistic - confirm whether a separate validation set is wanted.
best_model = None
best_model_name = None  # Key in `models` of the current best estimator
best_score = -np.inf  # Highest test accuracy seen so far

for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Hyperparameter tuning with cross-validation
    model_tuned = hyperparameter_tuning(model, X_train, y_train, param_grids[model_name], is_classification=True)

    # Evaluate the model (this also prints the report and pickles the estimator)
    trained_model = train_and_evaluate(model_tuned, X_train, X_test, y_train, y_test, model_name, is_classification=True)

    # Compare performance
    score = accuracy_score(y_test, trained_model.predict(X_test))

    print(f"Test Accuracy for {model_name}: {score:.4f}")

    if score > best_score:
        best_score = score
        best_model = trained_model
        best_model_name = model_name

# The estimator repr alone does not say which entry of `models` won, so the
# name is reported as well; the original summary line is kept unchanged.
print(f"Best model name: {best_model_name}")
print(f"Best Model: {best_model} with accuracy {best_score:.4f}")


Training DecisionTree...
Best parameters for DecisionTreeClassifier: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 5}
DecisionTree Accuracy: 0.5185
DecisionTree Precision (Macro): 0.5278
DecisionTree Recall (Macro): 0.5247
Confusion Matrix for DecisionTree:
 [[9 4]
 [9 5]]
Classification Report for DecisionTree:
               precision    recall  f1-score   support

           0       0.50      0.69      0.58        13
           1       0.56      0.36      0.43        14

    accuracy                           0.52        27
   macro avg       0.53      0.52      0.51        27
weighted avg       0.53      0.52      0.51        27

Feature Importance for DecisionTree:
log_vol: 0.0907
year: 0.0000
month: 0.0568
day: 0.0463
day_of_week: 0.0490
is_month_end: 0.0000
price_diff: 0.0416
pct_diff: 0.0556
return_daily: 0.0000
return_lag_1: 0.0729
return_lag_2: 0.0677
return_lag_3: 0.0820
return_lag_4: 0.0000
return_lag_5: 0.0392
sma_5: 0.0000
rolling_std_return_5: 0.0425
RSI_



Best parameters for CatBoostClassifier: {'learning_rate': 0.01, 'iterations': 500, 'depth': 6}
CatBoost Accuracy: 0.4074
CatBoost Precision (Macro): 0.3980
CatBoost Recall (Macro): 0.4148
Confusion Matrix for CatBoost:
 [[ 8  5]
 [11  3]]
Classification Report for CatBoost:
               precision    recall  f1-score   support

           0       0.42      0.62      0.50        13
           1       0.38      0.21      0.27        14

    accuracy                           0.41        27
   macro avg       0.40      0.41      0.39        27
weighted avg       0.40      0.41      0.38        27

Feature Importance for CatBoost:
log_vol: 5.1056
year: 2.0623
month: 4.7058
day: 3.7564
day_of_week: 4.2465
is_month_end: 0.1351
price_diff: 3.5121
pct_diff: 3.5265
return_daily: 3.3215
return_lag_1: 4.5288
return_lag_2: 4.6594
return_lag_3: 6.4889
return_lag_4: 4.0520
return_lag_5: 3.7558
sma_5: 3.9304
rolling_std_return_5: 6.3134
RSI_5: 4.4257
MACD: 3.9119
MACD_signal: 4.4378
bb_middle: 4.162

In [22]:
#Modelo tuneado
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np
import joblib
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Training and evaluation function
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, is_classification=True):
    """Fit ``model``, print hold-out metrics and persist the fitted estimator.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit`` and ``predict``.
    X_train, y_train :
        Data passed to ``model.fit``.
    X_test, y_test :
        Data the printed metrics are computed on.
    model_name : str
        Label used in the printed report and as the stem of the pickle file
        written to ``../../models/``.
    is_classification : bool, default True
        When True, prints accuracy, macro precision/recall, the confusion
        matrix, the classification report and (if the estimator exposes
        ``feature_importances_``) one importance per feature. When False,
        prints the RMSE instead.

    Returns
    -------
    object
        The same ``model`` instance, after fitting.
    """
    import os

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if is_classification:
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')

        print(f'{model_name} Accuracy: {accuracy:.4f}')
        print(f'{model_name} Precision (Macro): {precision:.4f}')
        print(f'{model_name} Recall (Macro): {recall:.4f}')
        print(f'Matriz de Confusión para {model_name}:\n{confusion_matrix(y_test, y_pred)}')
        print(f'Reporte de Clasificación para {model_name}:\n{classification_report(y_test, y_pred)}')

        # Feature importances (when the estimator provides them)
        if hasattr(model, 'feature_importances_'):
            feature_importances = model.feature_importances_
            # Use the DataFrame column names when available; fall back to
            # positional labels so array inputs do not raise AttributeError.
            feature_names = getattr(X_train, 'columns', None)
            if feature_names is None:
                feature_names = [f'feature_{i}' for i in range(len(feature_importances))]
            print(f'Importancia de Características para {model_name}:')
            for feature, importance in zip(feature_names, feature_importances):
                print(f'{feature}: {importance:.4f}')
        else:
            print(f'{model_name} no tiene importancias disponibles.')
    else:
        # Regression path. mean_squared_error is imported here because this
        # cell's own import list does not bring it in; without it the branch
        # depends on another cell having been executed first.
        from sklearn.metrics import mean_squared_error

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        print(f'{model_name} RMSE: {rmse:.4f}')

    # Save the trained model, creating the target directory on first use so
    # joblib.dump does not fail on a fresh checkout.
    model_dir = '../../models'
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, f'{model_dir}/{model_name}.pkl')
    return model

# Hyperparameter search with cross-validation
def hyperparameter_tuning(model, X_train, y_train, param_grid, is_classification=True, n_iter=10, cv=None):
    """Run a randomized hyperparameter search and return the best estimator.

    Parameters
    ----------
    model : object
        Estimator handed to ``RandomizedSearchCV``.
    X_train, y_train :
        Data the search is fitted on.
    param_grid : dict
        Passed as ``param_distributions`` to the search.
    is_classification : bool, default True
        Selects the scoring metric ('precision_macro' vs
        'neg_mean_squared_error') and the default splitter.
    n_iter : int, default 10
        Number of parameter settings sampled by the search.
    cv : cross-validation splitter, optional
        Overrides the default 5-fold shuffled splitter.
        NOTE(review): the notebook splits train/test by date, so the rows look
        time-ordered; shuffled folds may let later rows inform earlier ones.
        Consider passing ``TimeSeriesSplit`` here - confirm with the author.

    Returns
    -------
    object
        ``best_estimator_`` of the fitted search.
    """
    # Macro precision for classification, negated MSE for regression
    scoring = 'precision_macro' if is_classification else 'neg_mean_squared_error'

    if cv is None:
        if is_classification:
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        else:
            # StratifiedKFold only accepts binary/multiclass targets, so the
            # regression path needs a plain KFold to be usable at all.
            from sklearn.model_selection import KFold

            cv = KFold(n_splits=5, shuffle=True, random_state=42)

    grid_search = RandomizedSearchCV(model,
                                     param_distributions=param_grid,
                                     n_iter=n_iter,
                                     cv=cv,
                                     random_state=42,
                                     n_jobs=-1,
                                     scoring=scoring)
    grid_search.fit(X_train, y_train)

    print(f"Mejores parámetros para {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

# Base estimators, keyed by the name used when iterating over them.
# Every estimator is seeded with 42; LightGBM and CatBoost also get their
# verbosity arguments set.
models = dict(
    DecisionTree=DecisionTreeClassifier(random_state=42),
    RandomForest=RandomForestClassifier(random_state=42),
    AdaBoost=AdaBoostClassifier(random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
    LightGBM=lgb.LGBMClassifier(verbose=-1, random_state=42),
    XGBoost=XGBClassifier(random_state=42),
    CatBoost=CatBoostClassifier(verbose=0, random_state=42),
)

# Widened hyperparameter ranges; the keys mirror those of `models`.
param_grids = {
    "DecisionTree": dict(
        max_depth=[3, 5, 10, 20, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 5, 10],
    ),
    "RandomForest": dict(
        n_estimators=[100, 300, 500],
        max_depth=[None, 10, 20],
        max_features=['sqrt', 'log2'],
        min_samples_split=[2, 4, 6],
    ),
    "AdaBoost": dict(
        n_estimators=[50, 100, 150],
        learning_rate=[0.01, 0.1, 1],
    ),
    "GradientBoosting": dict(
        n_estimators=[50, 100, 150],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[3, 5, 10],
    ),
    "LightGBM": dict(
        n_estimators=[50, 100, 200, 500],
        learning_rate=[0.001, 0.01, 0.05, 0.1],
        num_leaves=[31, 50, 70, 100],
        min_child_samples=[5, 10, 20],
        max_depth=[-1, 3, 5, 7, 10],
        feature_fraction=[0.7, 0.8, 0.9, 1.0],
        bagging_fraction=[0.7, 0.8, 0.9, 1.0],
        bagging_freq=[0, 1, 5],
        min_gain_to_split=[0, 0.01, 0.1],
        lambda_l1=[0, 0.1, 1],
        lambda_l2=[0, 0.1, 1],
    ),
    "XGBoost": dict(
        n_estimators=[100, 200, 500],
        learning_rate=[0.001, 0.01, 0.05, 0.1],
        max_depth=[3, 5, 7, 10],
        subsample=[0.5, 0.7, 1.0],
        colsample_bytree=[0.5, 0.7, 1.0],
    ),
    "CatBoost": dict(
        iterations=[500, 1000, 2000],
        learning_rate=[0.001, 0.01, 0.05, 0.1],
        depth=[4, 6, 8, 10],
    ),
}

# Training and evaluation loop.
# NOTE(review): the winner is chosen by macro precision on (X_test, y_test),
# the same data the metrics are reported on, so the reported best score is
# likely optimistic - confirm whether a separate validation set is wanted.
best_model = None
best_model_name = None  # Key in `models` of the current best estimator
best_score = -np.inf  # Highest macro precision seen so far

for model_name, model in models.items():
    print(f"\nEntrenando {model_name}...")

    # Hyperparameter tuning with cross-validation
    model_tuned = hyperparameter_tuning(model, X_train, y_train, param_grids[model_name], is_classification=True)

    # Train and evaluate (this also prints the report and pickles the estimator)
    trained_model = train_and_evaluate(model_tuned, X_train, X_test, y_train, y_test, model_name, is_classification=True)

    # Models are compared on macro precision
    score = precision_score(y_test, trained_model.predict(X_test), average='macro')
    print(f"Precisión (Macro) en Test para {model_name}: {score:.4f}")

    if score > best_score:
        best_score = score
        best_model = trained_model
        best_model_name = model_name

# The estimator repr alone does not say which entry of `models` won, so the
# name is reported as well; the original summary line is kept unchanged.
print(f"\nMejor Modelo: {best_model} con precisión (macro) {best_score:.4f}")
print(f"Nombre del mejor modelo: {best_model_name}")



Entrenando DecisionTree...
Mejores parámetros para DecisionTreeClassifier: {'min_samples_split': 2, 'min_samples_leaf': 5, 'max_depth': 5}
DecisionTree Accuracy: 0.4815
DecisionTree Precision (Macro): 0.4853
DecisionTree Recall (Macro): 0.4863
Matriz de Confusión para DecisionTree:
[[8 5]
 [9 5]]
Reporte de Clasificación para DecisionTree:
              precision    recall  f1-score   support

           0       0.47      0.62      0.53        13
           1       0.50      0.36      0.42        14

    accuracy                           0.48        27
   macro avg       0.49      0.49      0.47        27
weighted avg       0.49      0.48      0.47        27

Importancia de Características para DecisionTree:
log_vol: 0.0937
year: 0.0000
month: 0.0272
day: 0.0447
day_of_week: 0.0246
is_month_end: 0.0000
price_diff: 0.0643
pct_diff: 0.0574
return_daily: 0.0216
return_lag_1: 0.0752
return_lag_2: 0.0928
return_lag_3: 0.0846
return_lag_4: 0.0000
return_lag_5: 0.0526
sma_5: 0.0000
rolling_



Mejores parámetros para AdaBoostClassifier: {'n_estimators': 50, 'learning_rate': 1}
AdaBoost Accuracy: 0.5185
AdaBoost Precision (Macro): 0.5167
AdaBoost Recall (Macro): 0.5165
Matriz de Confusión para AdaBoost:
[[6 7]
 [6 8]]
Reporte de Clasificación para AdaBoost:
              precision    recall  f1-score   support

           0       0.50      0.46      0.48        13
           1       0.53      0.57      0.55        14

    accuracy                           0.52        27
   macro avg       0.52      0.52      0.52        27
weighted avg       0.52      0.52      0.52        27

Importancia de Características para AdaBoost:
log_vol: 0.0000
year: 0.0000
month: 0.0317
day: 0.0338
day_of_week: 0.0000
is_month_end: 0.0000
price_diff: 0.0000
pct_diff: 0.0000
return_daily: 0.0000
return_lag_1: 0.0000
return_lag_2: 0.0213
return_lag_3: 0.0494
return_lag_4: 0.0000
return_lag_5: 0.0000
sma_5: 0.0779
rolling_std_return_5: 0.3735
RSI_5: 0.0387
MACD: 0.0000
MACD_signal: 0.1288
bb_middle: 