# TradPal - Machine Learning Training Guide

This notebook demonstrates how to train machine learning models for signal enhancement in TradPal.

## Overview
- Load and prepare trading data
- Train machine learning models
- Evaluate model performance
- Integrate models into trading signals

## Requirements
- TradPal with ML dependencies installed
- Jupyter notebook environment
- Sufficient computational resources for ML training

In [None]:
# Import required libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import warnings
# Keep notebook output readable by suppressing every warning category.
warnings.filterwarnings(action='ignore')

# Make the repository root (the parent of the working directory) importable
# so that the `src.*` imports below resolve.
project_root = os.path.abspath(os.pardir)
sys.path.append(project_root)

# Import TradPal ML modules
from src.ml_predictor import LSTMSignalPredictor, is_lstm_available, is_shap_available
from src.ml_ensemble import EnsembleSignalPredictor
from src.data_fetcher import fetch_historical_data
from src.indicators import calculate_indicators
from src.signal_generator import generate_signals

# Plot appearance: matplotlib defaults first, then seaborn's "husl" palette.
# The order matters: the palette must be applied after the style is loaded.
plt.style.use('default')
sns.set_palette("husl")

# Startup report: confirm the imports worked and show which optional
# ML backends TradPal reports as available.
print("✅ Libraries imported successfully")
environment_probes = {
    "📊 Working directory": os.getcwd,
    "🔧 LSTM available": is_lstm_available,
    "🔧 SHAP available": is_shap_available,
}
for probe_label, probe in environment_probes.items():
    print(f"{probe_label}: {probe()}")

## Step 1: Configure ML Training Parameters

Set up the parameters for machine learning model training.

In [None]:
# Central configuration for this training run. Later cells read their
# settings from this dict via ML_CONFIG[...].
ML_CONFIG = {
    # --- Market data to download ---
    "symbol": "BTC/USDT",
    "exchange": "kraken",
    "timeframe": "1h",
    "start_date": "2023-01-01",
    "end_date": "2024-01-01",  # 1 year of training data

    # --- Train/test partitioning ---
    "test_size": 0.2,  # 20% for testing
    # NOTE(review): no cell visible in this notebook reads 'validation_size'.
    "validation_size": 0.2,  # 20% of training data for validation
    "random_state": 42,

    # --- Which models to fit and how to cross-validate them ---
    "models_to_train": ["random_forest", "xgboost", "lstm"],
    "cv_folds": 5,  # Cross-validation folds

    # --- Feature engineering switches ---
    # NOTE(review): the feature-engineering cell shown here hard-codes its
    # lags and feature groups and does not appear to read these four keys.
    "lookback_periods": [5, 10, 20],  # Periods for feature creation
    "include_price_features": True,
    "include_volume_features": True,
    "include_time_features": True,

    # --- Training limits ---
    # NOTE(review): neither key is referenced by the training cell shown here.
    "early_stopping_patience": 10,
    "max_training_time": 300,  # Max training time in seconds
}

# Echo the configuration so the run is self-describing in the notebook output.
print("⚙️ ML Training configuration:")
print("\n".join(f"  {name}: {setting}" for name, setting in ML_CONFIG.items()))

## Step 2: Prepare Training Data

Fetch historical data, compute technical indicators, and derive the classification labels (buy / sell / hold) from TradPal's generated signals.

In [None]:
print(f"📥 Fetching training data for {ML_CONFIG['symbol']}...")
print(f"   Period: {ML_CONFIG['start_date']} to {ML_CONFIG['end_date']}")
print(f"   Timeframe: {ML_CONFIG['timeframe']}")

# Download the candles described by ML_CONFIG. Only the fetch itself sits in
# the try block, so the error message below is printed solely for failures
# raised by fetch_historical_data; the exception is re-raised so the notebook
# stops instead of continuing without data.
try:
    raw_data = fetch_historical_data(
        symbol=ML_CONFIG['symbol'],
        exchange=ML_CONFIG['exchange'],
        timeframe=ML_CONFIG['timeframe'],
        start_date=ML_CONFIG['start_date'],
        end_date=ML_CONFIG['end_date']
    )
except Exception as e:
    print(f"❌ Error fetching training data: {e}")
    raise

# An empty result would otherwise fail on raw_data.index[0] with an opaque
# IndexError; report the real problem instead.
if raw_data is None or len(raw_data) == 0:
    raise ValueError(
        f"No candles returned for {ML_CONFIG['symbol']} on {ML_CONFIG['exchange']} "
        f"({ML_CONFIG['start_date']} to {ML_CONFIG['end_date']})"
    )

print(f"✅ Training data fetched: {len(raw_data)} candles")
print(f"   Date range: {raw_data.index[0]} to {raw_data.index[-1]}")

In [None]:
print("🔧 Preparing features and labels...")

# Indicator set passed to TradPal's calculate_indicators.
indicator_config = {
    'ema': {'enabled': True, 'periods': [9, 21, 50]},
    'rsi': {'enabled': True, 'period': 14},
    'bb': {'enabled': True, 'period': 20, 'std_dev': 2.0},
    'atr': {'enabled': True, 'period': 14},
    'adx': {'enabled': True, 'period': 14},
    'macd': {'enabled': True}
}

data_with_indicators = calculate_indicators(raw_data, config=indicator_config)
print(f"✅ Indicators calculated: {len(data_with_indicators.columns)} features")

# Generate signals (these will be our labels)
data_with_signals = generate_signals(data_with_indicators)
print("✅ Signals generated for labels")

# Create target variable (1 for buy, -1 for sell, 0 for hold).
# The sell assignment runs last, so a row flagged as both buy and sell
# ends up labelled as sell.
data_with_signals['target'] = 0
data_with_signals.loc[data_with_signals['Buy_Signal'] == 1, 'target'] = 1
data_with_signals.loc[data_with_signals['Sell_Signal'] == 1, 'target'] = -1

# Remove rows with NaN values
data_clean = data_with_signals.dropna()
print(f"✅ Data cleaned: {len(data_clean)} samples")

# Without this guard the percentage computation below would fail with a
# ZeroDivisionError that hides the actual cause.
total_samples = len(data_clean)
if total_samples == 0:
    raise ValueError("No samples left after dropping rows with NaN values")

# Show class distribution
class_counts = data_clean['target'].value_counts().sort_index()
print("\n📊 Class distribution:")
for class_name, class_code in (('Hold', 0), ('Buy', 1), ('Sell', -1)):
    class_count = class_counts.get(class_code, 0)
    print(f"   {class_name} ({class_code}): {class_count} samples ({class_count / total_samples * 100:.1f}%)")

## Step 3: Feature Engineering

Create additional features for better model performance.

In [None]:
print("🔧 Creating additional features...")

# Create lagged features
def create_lagged_features(df: pd.DataFrame, columns, lags=(1, 2, 3)) -> pd.DataFrame:
    """Add lagged copies of the given columns to ``df``.

    For every column ``c`` in ``columns`` and every lag ``k`` in ``lags`` a
    new column named ``f"{c}_lag_{k}"`` is added, holding ``df[c].shift(k)``.
    The first ``k`` rows of such a column are therefore NaN.

    Args:
        df: Frame to extend. It is modified in place; pass a copy to keep
            the original untouched.
        columns: Names of existing columns to lag.
        lags: Shift distances (in rows). Any iterable is accepted.

    Returns:
        The same ``df`` object, with the lag columns appended.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    # Materialise once: `lags` is walked again for every column, so a
    # one-shot iterator would otherwise be exhausted after the first column.
    lags = tuple(lags)
    for col in columns:
        for lag in lags:
            df[f'{col}_lag_{lag}'] = df[col].shift(lag)
    return df

# Columns to lag: raw price/volume first (lags 1, 2, 5), then the technical
# indicators (lags 1, 2). Work on a copy so data_clean stays untouched.
price_columns = ['close', 'high', 'low', 'volume']
indicator_columns = ['EMA9', 'EMA21', 'EMA50', 'RSI', 'BB_middle', 'BB_upper', 'BB_lower', 'ATR', 'ADX']

data_featured = data_clean.copy()
data_featured = create_lagged_features(data_featured, price_columns, lags=[1, 2, 5])
data_featured = create_lagged_features(data_featured, indicator_columns, lags=[1, 2])

# Binary regime flags (stored as 0/1): short EMA above the longer one, RSI
# beyond the 70/30 thresholds, close outside the Bollinger bands.
binary_flags = {
    'ema_trend': data_featured['EMA9'] > data_featured['EMA21'],
    'rsi_overbought': data_featured['RSI'] > 70,
    'rsi_oversold': data_featured['RSI'] < 30,
    'price_above_bb': data_featured['close'] > data_featured['BB_upper'],
    'price_below_bb': data_featured['close'] < data_featured['BB_lower'],
}
for flag_name, flag_mask in binary_flags.items():
    data_featured[flag_name] = flag_mask.astype(int)

# Calendar features taken from the frame's index (requires a datetime index).
candle_times = data_featured.index
data_featured['hour'] = candle_times.hour
data_featured['day_of_week'] = candle_times.dayofweek
data_featured['month'] = candle_times.month

# The lag columns start with NaN rows; drop them.
data_featured = data_featured.dropna()

print(f"✅ Feature engineering completed: {len(data_featured.columns)} total features")
print(f"   Samples: {len(data_featured)}")

# Everything except raw OHLCV, the signal columns and the target is a model input.
# NOTE(review): any other column added by generate_signals is also kept as a
# feature here — confirm none of them encode the label.
non_feature_cols = {'open', 'high', 'low', 'close', 'volume', 'Buy_Signal', 'Sell_Signal', 'target'}
feature_cols = [col for col in data_featured.columns if col not in non_feature_cols]
print(f"   Feature columns: {len(feature_cols)}")
print(f"   Sample features: {feature_cols[:10]}...")

## Step 4: Train Machine Learning Models

Train and evaluate multiple ML models for signal prediction.

In [None]:
print("🤖 Training machine learning models...")

# Prepare features and target in chronological order, so that the split
# below separates "past" from "future".
ordered_data = data_featured.sort_index(kind='stable')
X = ordered_data[feature_cols]
y = ordered_data['target']

# Convert target to classification (0=hold, 1=buy, 2=sell)
y_class = y.replace({-1: 2})  # -1 (sell) -> 2

# Chronological hold-out: the last `test_size` fraction of the candles is the
# test set. A shuffled split would place test candles between training
# candles; because the features include lagged values of neighbouring rows,
# that leaks test-period information into training and inflates the scores.
# It would also scramble the row order needed for sequence windows.
# (stratify cannot be combined with shuffle=False, so it is not used.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_class, test_size=ML_CONFIG['test_size'], shuffle=False
)

print("📊 Data split:")
print(f"   Training: {len(X_train)} samples")
print(f"   Testing: {len(X_test)} samples")
print(f"   Features: {X.shape[1]}")

# Without stratification a rare class may be missing from the test period.
absent_in_test = sorted(set(y_class.unique()) - set(y_test.unique()))
if absent_in_test:
    print(f"   ⚠️ Test period contains no samples of class(es): {absent_in_test}")

# Scale features for neural networks. The scaler is fitted on the training
# split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Containers filled by the model-specific sections below:
# model name -> fitted model, and model name -> score dict.
models = {}
model_scores = {}

# Random Forest: cross-validate on the training split, then fit on the whole
# training split and score once on the held-out test split.
if 'random_forest' in ML_CONFIG['models_to_train']:
    print("\n🌲 Training Random Forest...")
    from sklearn.ensemble import RandomForestClassifier

    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=ML_CONFIG['random_state'],
        n_jobs=-1
    )

    # The CV call is used only for its per-fold scores; the model that is
    # kept is the one fitted on the full training split below.
    rf_scores = cross_val_score(rf_model, X_train, y_train, cv=ML_CONFIG['cv_folds'])
    rf_model.fit(X_train, y_train)

    # Score the test set a single time and reuse the value for both the
    # stored result and the printout.
    rf_test_score = rf_model.score(X_test, y_test)

    models['random_forest'] = rf_model
    model_scores['random_forest'] = {
        'cv_mean': rf_scores.mean(),
        'cv_std': rf_scores.std(),
        'test_score': rf_test_score
    }

    print(f"   CV Score: {rf_scores.mean():.3f} (+/- {rf_scores.std() * 2:.3f})")
    print(f"   Test Score: {rf_test_score:.3f}")

# XGBoost (optional dependency): same procedure as for the Random Forest.
if 'xgboost' in ML_CONFIG['models_to_train']:
    print("\n🚀 Training XGBoost...")
    # Only the import is guarded: an ImportError raised later, during
    # training, is a real failure and must not be reported as "not available".
    try:
        import xgboost as xgb
    except ImportError:
        print("   ⚠️ XGBoost not available, skipping")
    else:
        xgb_model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=ML_CONFIG['random_state'],
            n_jobs=-1
        )

        # Per-fold CV scores on the training split, then the final fit.
        xgb_scores = cross_val_score(xgb_model, X_train, y_train, cv=ML_CONFIG['cv_folds'])
        xgb_model.fit(X_train, y_train)

        # Score the test set once and reuse the value.
        xgb_test_score = xgb_model.score(X_test, y_test)

        models['xgboost'] = xgb_model
        model_scores['xgboost'] = {
            'cv_mean': xgb_scores.mean(),
            'cv_std': xgb_scores.std(),
            'test_score': xgb_test_score
        }

        print(f"   CV Score: {xgb_scores.mean():.3f} (+/- {xgb_scores.std() * 2:.3f})")
        print(f"   Test Score: {xgb_test_score:.3f}")

# LSTM: trained on sliding windows of consecutive (scaled) feature rows.
if 'lstm' in ML_CONFIG['models_to_train'] and is_lstm_available():
    print("\n🧠 Training LSTM...")

    # Number of consecutive rows per input window.
    sequence_length = 20

    # A window is only meaningful if its rows are consecutive in time, and the
    # training rows are not guaranteed to be in time order (a shuffled split
    # scrambles them). Restore chronological order via the timestamps in
    # X_train's index; this is a no-op when the rows are already ordered.
    time_order = np.argsort(X_train.index.values, kind='stable')
    X_ordered = X_train_scaled[time_order]
    y_ordered = y_train.to_numpy()[time_order]

    if len(X_ordered) <= sequence_length:
        raise ValueError(
            f"Need more than {sequence_length} training rows to build LSTM "
            f"sequences, got {len(X_ordered)}"
        )

    # Window i holds rows [i - sequence_length, i) and is labelled with the
    # class of row i, i.e. the row right after the window.
    X_seq = np.array([
        X_ordered[end - sequence_length:end]
        for end in range(sequence_length, len(X_ordered))
    ])
    y_seq = y_ordered[sequence_length:]

    print(f"   Sequential data: {X_seq.shape[0]} samples, {X_seq.shape[1]} timesteps, {X_seq.shape[2]} features")

    # Use TradPal's LSTM predictor
    lstm_predictor = LSTMSignalPredictor(
        input_size=X_seq.shape[2],
        hidden_size=64,
        num_layers=2,
        output_size=3  # 3 classes: hold, buy, sell
    )

    # Train LSTM (simplified)
    lstm_predictor.train(X_seq, y_seq, epochs=10, batch_size=32)

    models['lstm'] = lstm_predictor
    # Placeholders only: the LSTM is not cross-validated or evaluated here,
    # so these zeros are not measured scores.
    model_scores['lstm'] = {
        'cv_mean': 0.0,  # Would need proper CV implementation
        'cv_std': 0.0,
        'test_score': 0.0  # Would need evaluation
    }

    print("   LSTM training completed (basic implementation)")

print(f"\n✅ Model training completed: {len(models)} models trained")

## Step 5: Model Evaluation and Comparison

Evaluate model performance and compare results.

In [None]:
print("📊 Evaluating model performance...")

# One row per trained model, one column per score, rounded for display.
results_df = pd.DataFrame.from_dict(model_scores, orient='index').round(4)

print("\n🏆 Model Performance Comparison:")
print("=" * 50)
print(results_df)

# Side-by-side bar charts: cross-validation accuracy (with the CV standard
# deviation as error bars) on the left, test accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
bar_colors = ['skyblue', 'lightgreen', 'salmon']
chart_panels = (
    (axes[0], 'cv_mean', 'Cross-Validation Scores', {'yerr': results_df['cv_std'], 'capsize': 5}),
    (axes[1], 'test_score', 'Test Set Scores', {}),
)
for panel_ax, score_column, panel_title, bar_options in chart_panels:
    panel_ax.bar(results_df.index, results_df[score_column], color=bar_colors, **bar_options)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('Accuracy')
    panel_ax.set_ylim(0, 1)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Detailed evaluation for the model with the highest test-set score.
# NOTE(review): choosing the winner on the test set makes its reported test
# accuracy optimistic; a separate validation split would avoid that.
best_model_name = results_df['test_score'].idxmax()
best_model = models[best_model_name]

print(f"\n🎯 Best performing model: {best_model_name.upper()}")
print(f"   Test Accuracy: {results_df.loc[best_model_name, 'test_score']:.3f}")

# Generate predictions for detailed evaluation
if best_model_name != 'lstm':  # Skip LSTM for now
    y_pred = best_model.predict(X_test)

    # Pin the label set explicitly. Without `labels`, a class that happens to
    # be absent from y_test and y_pred makes classification_report raise
    # (3 names for fewer classes) and shrinks the confusion matrix so it no
    # longer matches the 3 tick labels.
    class_ids = [0, 1, 2]
    class_names = ['Hold', 'Buy', 'Sell']

    print("\n📋 Classification Report:")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

    # Confusion Matrix (rows: true class, columns: predicted class)
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'Confusion Matrix - {best_model_name.upper()}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

## Step 6: Feature Importance Analysis

Analyze which features are most important for predictions.

In [None]:
print("🔍 Analyzing feature importance...")

# Impurity-based importances of the fitted Random Forest, if one was trained.
if 'random_forest' in models:
    rf_model = models['random_forest']

    # Pair every input column with its importance, most important first.
    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)

    # Horizontal bar chart of the 20 strongest features.
    top_features = feature_importance.head(20)
    plt.figure(figsize=(12, 8))
    sns.barplot(data=top_features, x='importance', y='feature')
    plt.title('Top 20 Feature Importance - Random Forest')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

    # Text listing of the 10 strongest features.
    print("\n🔝 Top 10 Most Important Features:")
    top_ten = feature_importance.head(10)
    for feature_name, feature_weight in zip(top_ten['feature'], top_ten['importance']):
        print(f"   {feature_name}: {feature_weight:.4f}")

# XGBoost's built-in importance plot, if an XGBoost model was trained.
if 'xgboost' in models:
    xgb_model = models['xgboost']

    # Plotting is best-effort: report the reason and carry on. Catching
    # Exception (not a bare except) lets KeyboardInterrupt through and keeps
    # the cause visible.
    try:
        # Imported here so this step does not depend on the `xgb` name
        # having been bound by an earlier cell.
        import xgboost as xgb

        xgb.plot_importance(xgb_model, max_num_features=20)
        plt.title('Feature Importance - XGBoost')
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"\n⚠️ Could not plot XGBoost feature importance: {e}")

# SHAP explanation of the Random Forest; skipped when TradPal reports SHAP
# as unavailable or when no Random Forest was trained.
shap_possible = is_shap_available() and 'random_forest' in models
if not shap_possible:
    print("\n⚠️ SHAP not available or no suitable model for analysis")
else:
    print("\n🔮 Performing SHAP analysis...")

    # Best-effort: any failure is reported and the notebook continues.
    try:
        import shap

        # Tree explainer for the fitted forest.
        explainer = shap.TreeExplainer(models['random_forest'])

        # Explain only the first 100 test rows to keep this step fast.
        sample_data = X_test.head(100)
        shap_values = explainer.shap_values(sample_data)

        # Summary plot limited to the 10 most influential features;
        # show=False so the title can be set before rendering.
        plt.figure(figsize=(10, 6))
        shap.summary_plot(shap_values, sample_data, max_display=10, show=False)
        plt.title('SHAP Feature Importance Summary')
        plt.tight_layout()
        plt.show()

        print("✅ SHAP analysis completed")

    except Exception as e:
        print(f"⚠️ SHAP analysis failed: {e}")

## Step 7: Save Trained Models

Save the trained models for later use in signal prediction.

In [None]:
print("💾 Saving trained models...")

import joblib
import os
from datetime import datetime

# Create models directory if it doesn't exist
models_dir = os.path.join(project_root, 'cache', 'ml_models')
os.makedirs(models_dir, exist_ok=True)

# All artifacts of this run share one timestamp so they can be matched up later.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
saved_models = {}  # model name -> path of the file written for it

for model_name, model in models.items():
    if model_name == 'lstm':
        # Saving the LSTM is not implemented in this demo, so nothing is
        # written and no path is recorded for it.
        print("   ⚠️ LSTM model saving not implemented in this demo")
        continue

    # Remaining models are serialized with joblib (pickle-based: only load
    # such files from sources you trust).
    model_path = os.path.join(models_dir, f'{model_name}_model_{timestamp}.pkl')
    try:
        joblib.dump(model, model_path)
    except Exception as e:
        # Best-effort: one failed save must not prevent saving the others.
        print(f"   ❌ Failed to save {model_name}: {e}")
    else:
        saved_models[model_name] = model_path
        print(f"   ✅ Saved {model_name} to {os.path.basename(model_path)}")

# Save the fitted feature scaler alongside the models.
scaler_path = os.path.join(models_dir, f'scaler_{timestamp}.pkl')
joblib.dump(scaler, scaler_path)
print(f"   ✅ Saved feature scaler to {os.path.basename(scaler_path)}")

# Save metadata describing this training run next to the model files.
import json

metadata = {
    'timestamp': timestamp,
    'symbol': ML_CONFIG['symbol'],
    'timeframe': ML_CONFIG['timeframe'],
    'training_period': f"{ML_CONFIG['start_date']} to {ML_CONFIG['end_date']}",
    'models_trained': list(models.keys()),
    'best_model': best_model_name,
    'feature_columns': feature_cols,
    'model_scores': model_scores,
    'scaler_path': scaler_path,
    # Paths of the model files actually written, so a consumer of this file
    # can locate them without guessing the naming scheme.
    'model_paths': saved_models
}

metadata_path = os.path.join(models_dir, f'metadata_{timestamp}.json')
# default=str is a fallback that stringifies any value json cannot encode.
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2, default=str)

print(f"   ✅ Saved training metadata to {os.path.basename(metadata_path)}")
print(f"\n📁 Models saved to: {models_dir}")
print("\n💡 To use these models in TradPal, update the configuration in config/settings.py")

*TradPal v2.5.0 - Machine Learning for Educational Purposes Only*