# In [1]:
# Cell 1: Import required libraries and functions
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd
import numpy as np

# Cell 2: ML Model Training Functions
def create_ensemble_models_adaptive(data_size):
    """Build the unfitted classifiers appropriate for ``data_size`` samples.

    Three tiers are used:

    * fewer than 50 samples: one strongly regularised logistic regression;
    * 50 to 99 samples: a shallow random forest plus logistic regression;
    * 100 or more: random forest, gradient boosting, logistic regression
      and, if it can be constructed, an XGBoost classifier.

    Args:
        data_size: Number of samples the models will be trained on.

    Returns:
        dict mapping model name to an unfitted estimator, in the order the
        models were added.
    """
    def _logistic(iterations, inverse_reg):
        # Every tier shares the seed and solver; only the iteration cap and
        # the regularisation strength (C) differ.
        return LogisticRegression(
            random_state=42,
            max_iter=iterations,
            solver='liblinear',
            C=inverse_reg,
        )

    # Tier 1: very small dataset -> a single simple model.
    if data_size < 50:
        ensemble = {'logistic_regression': _logistic(100, 0.1)}
        print(f"    Using simple model for small dataset ({data_size} samples)")
        return ensemble

    # Tier 2: small dataset -> shallow, heavily constrained models.
    if data_size < 100:
        ensemble = {
            'random_forest': RandomForestClassifier(
                n_estimators=20,
                max_depth=5,
                min_samples_split=10,
                min_samples_leaf=5,
                random_state=42,
            ),
            'logistic_regression': _logistic(200, 0.1),
        }
        print(f"    Using regularized models for medium dataset ({data_size} samples)")
        return ensemble

    # Tier 3: enough data for the full ensemble.
    ensemble = {
        'random_forest': RandomForestClassifier(
            n_estimators=50,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
        ),
        'gradient_boosting': GradientBoostingClassifier(
            n_estimators=50,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
        ),
        'logistic_regression': _logistic(500, 1.0),
    }

    # XGBoost is optional: a construction failure is reported, not raised.
    try:
        ensemble['xgboost'] = xgb.XGBClassifier(
            n_estimators=50,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss',
            verbosity=0,
        )
    except Exception as e:
        print(f"    XGBoost not available: {e}")

    print(f"    Using full ensemble for large dataset ({data_size} samples)")
    return ensemble

def prepare_ml_data_robust(data):
    """Build a clean numeric feature matrix and target vector from ``data``.

    Steps, in order:

    1. Keep numeric columns other than 'Target', 'Target_Return',
       'Next_Close' and 'Tech_Sentiment'.
    2. Treat +/-inf as missing, so that imputation medians are computed from
       finite values only.
    3. Drop columns with more than 50% missing values.
    4. Fill the remaining gaps with the column median (0 if no median exists).
    5. Drop columns that hold a single distinct value (zero variance).
    6. Drop rows whose 'Target' is missing.

    Args:
        data: DataFrame with a 'Target' column and candidate feature columns.

    Returns:
        ``(X, y, selected_features)`` where ``X`` is the cleaned feature
        DataFrame, ``y`` the matching target Series and ``selected_features``
        the list of column names in ``X``. Returns ``(None, None, None)`` if
        preparation fails or no usable feature remains.
    """
    try:
        print(f"  Preparing ML data from {len(data)} rows...")

        # Targets and columns handled elsewhere must not be used as features.
        exclude_cols = {
            'Target', 'Target_Return', 'Next_Close',
            'Tech_Sentiment',  # Used separately as External_Sentiment
        }

        numeric_cols = [
            col for col in data.columns
            if col not in exclude_cols
            and pd.api.types.is_numeric_dtype(data[col])
        ]
        print(f"    Found {len(numeric_cols)} numeric features")

        y = data['Target'].copy()
        # Converting inf up front keeps it out of the medians used below;
        # replace() returns a new frame, so `data` itself is not modified.
        X = data[numeric_cols].replace([np.inf, -np.inf], np.nan)

        print(f"    Handling missing values...")

        # Remove columns with too many missing values (>50% missing)
        missing_counts = X.isna().sum()
        threshold = len(X) * 0.5
        kept_cols = []
        for col in X.columns:
            if missing_counts[col] <= threshold:
                kept_cols.append(col)
            else:
                print(f"      Removing {col}: {missing_counts[col]}/{len(X)} missing values")
        X = X[kept_cols].copy()

        # Fill remaining gaps with the column median. Assigning the result
        # back (instead of a chained inplace fillna) also works under pandas
        # Copy-on-Write.
        for col in X.columns:
            n_filled = X[col].isna().sum()
            if n_filled == 0:
                continue
            median_val = X[col].median()
            if pd.isna(median_val):  # No finite value to take a median of
                median_val = 0
            X[col] = X[col].fillna(median_val)
            print(f"      Filled {n_filled} NaN values in {col} with {median_val:.3f}")

        # Remove features with zero variance: a column with one distinct
        # value carries no information.
        n_before_variance_filter = X.shape[1]
        informative = (X.nunique() > 1).to_numpy()
        X = X.loc[:, informative]
        selected_features = X.columns.tolist()

        if not selected_features:
            print("    Error preparing ML data: no feature with non-zero variance")
            return None, None, None

        # Count only columns removed by this step, not those dropped earlier
        # for missingness.
        removed = n_before_variance_filter - len(selected_features)
        if removed > 0:
            print(f"      Removed {removed} zero-variance features")

        # Ensure we have valid target values
        valid_mask = y.notna()
        X = X[valid_mask]
        y = y[valid_mask]

        print(f"    Final dataset: {X.shape[0]} samples, {X.shape[1]} features")

        target_counts = y.value_counts()
        print(f"    Target distribution: {target_counts.to_dict()}")

        return X, y, selected_features

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with
        # a None triple rather than propagating.
        print(f"    Error preparing ML data: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None

def train_and_predict_robust(data):
    """Train the adaptive ensemble on ``data`` and forecast its newest row.

    The function cleans the features with ``prepare_ml_data_robust``, checks
    that at least two classes are present with at least 3 samples each,
    splits into train/hold-out sets (stratified when possible), standardises
    the features with a scaler fitted on the training split, fits every model
    from ``create_ensemble_models_adaptive`` and scores it on the hold-out
    split. Each model then predicts the most recent row of ``data``; those
    per-model outputs are averaged with weights derived from hold-out
    accuracy and the train/hold-out accuracy gap.

    Args:
        data: DataFrame containing a 'Target' column and feature columns.
            At least 20 rows are required.

    Returns:
        dict with keys 'prediction', 'probability', 'accuracy',
        'model_results', 'feature_importance', 'successful_models',
        'best_model' and 'data_size', or ``None`` when training is not
        possible or fails.
    """
    if data is None:
        print("  No data provided for ML training")
        return None

    print(f"  Starting ML training with {len(data)} data points...")

    if len(data) < 20:
        print(f"  Insufficient data for ML training (need at least 20, got {len(data)})")
        return None

    try:
        X, y, feature_cols = prepare_ml_data_robust(data)

        if X is None or len(X) < 15:
            print("  Failed to prepare sufficient data for ML")
            return None

        # Check class balance
        class_counts = y.value_counts()
        if len(class_counts) < 2:
            print(f"  Only one class present: {class_counts.to_dict()}")
            return None

        min_class_count = class_counts.min()
        if min_class_count < 3:
            print(f"  Insufficient class balance for ML: {class_counts.to_dict()}")
            print("  (Need at least 3 samples of each class)")
            return None

        print(f"    Class balance OK: {class_counts.to_dict()}")

        # Smaller datasets keep a larger share of rows for training.
        if len(X) < 30:
            test_size = 0.2
        elif len(X) < 60:
            test_size = 0.25
        else:
            test_size = 0.3

        # Never let the hold-out set fall below a handful of rows.
        min_test_samples = max(3, min_class_count // 2)
        actual_test_size = max(test_size, min_test_samples / len(X))

        print(f"    Using test size: {actual_test_size:.2f}")

        # NOTE(review): train_test_split shuffles rows. If `data` is a time
        # series this lets later rows inform predictions on earlier ones --
        # confirm whether a chronological split is wanted here.
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                test_size=actual_test_size,
                stratify=y,
                random_state=42
            )
        except ValueError as e:
            print(f"    Stratified split failed ({e}), using random split")
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                test_size=actual_test_size,
                random_state=42
            )

        print(f"    Train: {len(X_train)} samples, Test: {len(X_test)} samples")

        # Fit the scaler on the training split only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # The reported prediction must describe the newest observation, not
        # whichever row happens to be last in the shuffled hold-out split.
        # Take the last row of `data` (it may have been dropped from X for
        # lacking a Target) and clean it the same way as the training data.
        latest_row = data[feature_cols].iloc[[-1]].replace([np.inf, -np.inf], np.nan)
        if latest_row.isna().to_numpy().any():
            latest_row = latest_row.fillna(X_train.median()).fillna(0)
        latest_scaled = scaler.transform(latest_row)

        models = create_ensemble_models_adaptive(len(X_train))
        model_results = {}
        successful_models = 0

        for name, model in models.items():
            try:
                print(f"      Training {name}...")
                model.fit(X_train_scaled, y_train)

                train_pred = model.predict(X_train_scaled)
                train_accuracy = accuracy_score(y_train, train_pred)

                if len(X_test) > 0:
                    test_pred = model.predict(X_test_scaled)
                    test_accuracy = accuracy_score(y_test, test_pred)
                else:
                    # No hold-out rows: fall back to training accuracy
                    # (not ideal but better than failure).
                    test_accuracy = train_accuracy
                    print(f"        Warning: Using training set for evaluation")

                # Forecast for the newest row; column 1 of predict_proba is
                # the second entry of model.classes_.
                latest_pred = model.predict(latest_scaled)[0]
                latest_proba = model.predict_proba(latest_scaled)[0, 1]

                # Gap between train and hold-out accuracy.
                overfitting = train_accuracy - test_accuracy

                model_results[name] = {
                    'model': model,
                    'train_accuracy': train_accuracy,
                    'test_accuracy': test_accuracy,
                    'prediction': latest_pred,
                    'probability': latest_proba,
                    'overfitting': overfitting
                }

                successful_models += 1
                print(f"        ✓ Train={train_accuracy:.3f}, Test={test_accuracy:.3f}")

                if overfitting > 0.3:
                    print(f"        ⚠ High overfitting: {overfitting:.3f}")

            except Exception as e:
                # One failing model must not abort the whole ensemble.
                print(f"        ❌ {name} failed: {e}")
                continue

        if successful_models == 0:
            print("  All ML models failed to train")
            return None

        # Ensemble prediction
        predictions = []
        probabilities = []
        weights = []

        for name, result in model_results.items():
            # Weight by test accuracy, penalize overfitting
            weight = result['test_accuracy'] * (1 - min(result['overfitting'], 0.5))
            weight = max(weight, 0.1)  # Minimum weight
            weights.append(weight)
            predictions.append(result['prediction'])
            probabilities.append(result['probability'])

        # Weighted ensemble
        weights = np.array(weights)
        weights = weights / weights.sum()

        ensemble_prediction = np.average(predictions, weights=weights)
        ensemble_probability = np.average(probabilities, weights=weights)
        average_accuracy = np.average([r['test_accuracy'] for r in model_results.values()])

        # Feature importance comes from the model with the best hold-out accuracy.
        best_model_name = max(model_results.keys(), key=lambda k: model_results[k]['test_accuracy'])
        best_model = model_results[best_model_name]['model']

        importance_dict = {}
        if hasattr(best_model, 'feature_importances_'):
            importance_dict[best_model_name] = dict(zip(feature_cols, best_model.feature_importances_))
        elif hasattr(best_model, 'coef_'):
            importance_dict[best_model_name] = dict(zip(feature_cols, np.abs(best_model.coef_[0])))

        result = {
            'prediction': ensemble_prediction,
            'probability': ensemble_probability,
            'accuracy': average_accuracy,
            'model_results': model_results,
            'feature_importance': importance_dict,
            'successful_models': successful_models,
            'best_model': best_model_name,
            'data_size': len(X)
        }

        print(f"  ✓ ML training complete: {successful_models} models, accuracy={average_accuracy:.3f}")
        return result

    except Exception as e:
        # Best-effort contract: report and return None rather than raise.
        print(f"  ML training error: {e}")
        import traceback
        traceback.print_exc()
        return None

def train_and_predict(data):
    """Public entry point for ML training.

    Delegates to ``train_and_predict_robust`` and returns whatever it
    returns, unchanged.
    """
    outcome = train_and_predict_robust(data)
    return outcome

# Recommendation tiers as (cutoff, buy_label, sell_label), strongest first.
# A score above +cutoff yields buy_label; below -cutoff yields sell_label.
_LOW_CONFIDENCE_TIERS = (
    (0.6, "WEAK_BUY", "WEAK_SELL"),
)
_MEDIUM_CONFIDENCE_TIERS = (
    (0.4, "BUY", "SELL"),
    (0.15, "WEAK_BUY", "WEAK_SELL"),
)
_HIGH_CONFIDENCE_TIERS = (
    (0.25, "STRONG_BUY", "STRONG_SELL"),
    (0.1, "BUY", "SELL"),
    (0.05, "WEAK_BUY", "WEAK_SELL"),
)


def _indicator(latest, name):
    """Return ``latest[name]``, or None when the field is absent or NaN."""
    if name not in latest.index:
        return None
    reading = latest[name]
    return reading if pd.notna(reading) else None


def _technical_score(latest):
    """Return the raw (unclamped) technical score for one row of indicators."""
    score = 0

    close = _indicator(latest, 'Close')
    sma20 = _indicator(latest, 'SMA20')
    sma50 = _indicator(latest, 'SMA50')
    sma200 = _indicator(latest, 'SMA200')

    # Moving averages: price above an average is bullish, with more weight
    # for longer averages. Skipped when there is no usable close, so a
    # missing price is not silently scored as "below the average".
    ma_signals = 0
    ma_count = 0
    if close is not None:
        for average, weight in ((sma20, 1), (sma50, 1.5), (sma200, 2)):
            if average is None:
                continue
            ma_signals += weight if close > average else -weight
            ma_count += 1

    # SMA20 vs SMA50 crossover.
    if sma20 is not None and sma50 is not None:
        ma_signals += 1 if sma20 > sma50 else -1
        ma_count += 1

    if ma_count > 0:
        score += ma_signals / ma_count * 3  # Scale up the MA influence

    # RSI: oversold readings add to the score, overbought ones subtract;
    # the 35-65 zone is neutral.
    rsi = _indicator(latest, 'RSI')
    if rsi is not None:
        if rsi < 25:
            score += 3
        elif rsi < 35:
            score += 2
        elif rsi > 75:
            score -= 3
        elif rsi > 65:
            score -= 2

    # MACD relative to its signal line.
    macd = _indicator(latest, 'MACD')
    signal_line = _indicator(latest, 'Signal_Line')
    if macd is not None and signal_line is not None:
        score += 2 if macd > signal_line else -2

    # ADX above 40 marks a strong trend; its direction is taken from the
    # close relative to SMA20.
    adx = _indicator(latest, 'ADX')
    if adx is not None and adx > 40 and close is not None and sma20 is not None:
        score += 1 if close > sma20 else -1

    # 10-period rate of change as recent price momentum.
    roc = _indicator(latest, 'ROC10')
    if roc is not None:
        if roc > 5:
            score += 2
        elif roc > 2:
            score += 1
        elif roc < -5:
            score -= 2
        elif roc < -2:
            score -= 1

    return score


def _recommendation_label(final_score, ml_confidence):
    """Map a combined score to a label using confidence-dependent cutoffs."""
    if ml_confidence < 0.5:
        tiers = _LOW_CONFIDENCE_TIERS
    elif ml_confidence < 0.65:
        tiers = _MEDIUM_CONFIDENCE_TIERS
    else:
        tiers = _HIGH_CONFIDENCE_TIERS

    for cutoff, buy_label, _ in tiers:
        if final_score > cutoff:
            return buy_label
    for cutoff, _, sell_label in tiers:
        if final_score < -cutoff:
            return sell_label
    return "HOLD"


def generate_enhanced_recommendation(technical_data, sentiment_score, ml_result):
    """Combine technical, ML and sentiment signals into a recommendation.

    Args:
        technical_data: DataFrame of price/indicator rows; only the last row
            is scored. Recognised columns are 'Close', 'SMA20', 'SMA50',
            'SMA200', 'RSI', 'MACD', 'Signal_Line', 'ADX' and 'ROC10';
            absent or NaN fields are skipped.
        sentiment_score: Sentiment value, used as-is in the weighted sum
            (the weights assume a -1..1 scale). ``None`` or NaN is treated
            as neutral (0.0).
        ml_result: dict with a 'probability' entry and an optional
            'accuracy' entry, or a falsy value when no ML result exists.

    Returns:
        One of "STRONG_BUY", "BUY", "WEAK_BUY", "HOLD", "WEAK_SELL", "SELL",
        "STRONG_SELL", or "INSUFFICIENT_DATA" when fewer than 5 rows are
        supplied.
    """
    if technical_data is None or len(technical_data) < 5:
        return "INSUFFICIENT_DATA"

    latest = technical_data.iloc[-1]

    # Clamp to the -10..+10 range, then scale to -1..1.
    technical_score = max(-10, min(10, _technical_score(latest)))
    normalized_technical = technical_score / 10.0

    print(f"    Technical score: {technical_score}/10 ({normalized_technical:.2f})")

    # Without an ML result the ML term is zero and confidence is 0.5.
    ml_weight = 0
    ml_confidence = 0.5

    if ml_result:
        # Scale ML probability to -1 to 1 range
        ml_weight = (ml_result['probability'] - 0.5) * 2
        ml_confidence = ml_result.get('accuracy', 0.5)
        print(f"    ML weight: {ml_weight:.2f} (prob: {ml_result['probability']:.2f}, accuracy: {ml_confidence:.2f})")

    # A missing sentiment must not crash the formatting below or turn the
    # combined score into NaN; treat it as neutral.
    if pd.isna(sentiment_score):
        sentiment_score = 0.0
    print(f"    Sentiment: {sentiment_score:.2f}")

    if ml_confidence >= 0.6:
        # High ML confidence: 35% technical, 45% ML, 20% sentiment
        final_score = (normalized_technical * 0.35) + (ml_weight * 0.45) + (sentiment_score * 0.20)
    else:
        # Low ML confidence: 50% technical, 25% ML, 25% sentiment
        final_score = (normalized_technical * 0.50) + (ml_weight * 0.25) + (sentiment_score * 0.25)

    print(f"    Final combined score: {final_score:.3f}")

    return _recommendation_label(final_score, ml_confidence)