# **Comprehensive Crypto Market Prediction**

**Objective**: This notebook implements a complete solution for predicting short-term crypto price movements. The strategy involves in-depth data exploration, advanced feature engineering, a robust time-series validation framework, and a final ensemble model to maximize performance.

# Setup and Configuration

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr
import warnings
import gc

# Suppress warnings for a cleaner output.
# NOTE: called without a category, this installs an "ignore" filter for every
# Warning subclass in the whole process, so deprecation and numerical warnings
# raised by any library are hidden as well.
warnings.filterwarnings('ignore')

class Config:
    """Central place for paths, feature names, CV settings and model hyperparameters."""

    # --- Input files -------------------------------------------------------
    TRAIN_PATH = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
    TEST_PATH = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
    SUBMISSION_PATH = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"

    # --- Columns -----------------------------------------------------------
    # The exact feature list from the 0.107 script
    FEATURES = [
        "X863",
        "X856",
        "X344",
        "X598",
        "X862",
        "X385",
        "X852",
        "X603",
        "X860",
        "X674",
        "X415",
        "X345",
        "X137",
        "X855",
        "X174",
        "X302",
        "X178",
        "X532",
        "X168",
        "X612",
        "bid_qty",
        "ask_qty",
        "buy_qty",
        "sell_qty",
        "volume",
        "X888",
        "X421",
        "X333",
    ]
    LABEL_COLUMN = "label"

    # --- Cross-validation --------------------------------------------------
    N_FOLDS = 3
    RANDOM_STATE = 42

    # --- Model hyperparameters (from the 0.107 script) ---------------------
    XGB_PARAMS = {
        "tree_method": "hist",
        "device": "gpu",
        "colsample_bylevel": 0.4778,
        "colsample_bynode": 0.3628,
        "colsample_bytree": 0.7107,
        "gamma": 1.7095,
        "learning_rate": 0.02213,
        "max_depth": 20,
        "max_leaves": 12,
        "min_child_weight": 16,
        "n_estimators": 1667,
        "subsample": 0.06567,
        "reg_alpha": 39.3524,
        "reg_lambda": 75.4484,
        "verbosity": 0,
        "random_state": RANDOM_STATE,
    }
    LGBM_PARAMS = {
        "boosting_type": "gbdt",
        "device": "cpu",
        "n_jobs": -1,
        "verbose": -1,
        "random_state": RANDOM_STATE,
        "colsample_bytree": 0.5039,
        "learning_rate": 0.01260,
        "min_child_samples": 20,
        "min_child_weight": 0.1146,
        "n_estimators": 915,
        "num_leaves": 145,
        "reg_alpha": 19.2447,
        "reg_lambda": 55.5046,
        "subsample": 0.9709,
        "max_depth": 9,
    }

    # Each learner entry pairs an estimator class with its keyword arguments.
    LEARNERS = [
        {"name": "xgb", "Estimator": XGBRegressor, "params": XGB_PARAMS},
        {"name": "lgbm", "Estimator": LGBMRegressor, "params": LGBM_PARAMS},
    ]

# Feature Engineering and Utility Functions


In [None]:
def create_time_decay_weights(n: int, decay: float = 0.95) -> np.ndarray:
    """Return ``n`` sample weights that grow exponentially toward the last row.

    The first position gets a raw weight of ``decay`` and the last position a
    raw weight of ``1.0``; the vector is then rescaled so the weights sum to
    ``n`` (i.e. the mean weight is 1).

    Args:
        n: Number of samples to weight.
        decay: Raw weight of the oldest sample relative to the newest one.

    Returns:
        A float array of length ``n``. It is empty when ``n <= 0`` and
        ``[1.0]`` when ``n == 1``.
    """
    if n == 1:
        # A single sample has no relative position; normalising by (n - 1)
        # would be 0 / 0 and yield NaN, so return the neutral weight directly.
        return np.ones(1)

    positions = np.arange(n)
    normalized = positions / (n - 1)
    weights = decay ** (1.0 - normalized)
    return weights * n / weights.sum()

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with eight order-flow / liquidity columns.

    The input frame must contain ``bid_qty``, ``ask_qty``, ``buy_qty``,
    ``sell_qty`` and ``volume``; it is not modified. Ratio denominators are
    offset by ``1e-8``.
    """
    eps = 1e-8
    out = df.copy()

    buy = out['buy_qty']
    sell = out['sell_qty']
    bid = out['bid_qty']
    ask = out['ask_qty']
    vol = out['volume']

    # Shared sub-expressions, kept in the same arithmetic order as the ratios
    # that use them.
    trade_gap = buy - sell
    book_depth = bid + ask
    vol_denominator = vol + eps

    engineered = {
        'volume_weighted_sell': sell * vol,
        'buy_sell_ratio': buy / (sell + eps),
        'selling_pressure': sell / vol_denominator,
        'effective_spread_proxy': np.abs(trade_gap) / vol_denominator,
        'log_volume': np.log1p(vol),
        'bid_ask_imbalance': (bid - ask) / (book_depth + eps),
        'order_flow_imbalance': trade_gap / (buy + sell + eps),
        'liquidity_ratio': book_depth / vol_denominator,
    }
    # Insert one column at a time so the new columns are appended in this order.
    for column_name, values in engineered.items():
        out[column_name] = values
    return out

def get_model_slices(n_samples: int):
    """List the training windows as ``{"name", "cutoff"}`` dicts.

    ``cutoff`` is the first row index included in the window: 0 for the full
    data, and the truncated 25% / 50% marks of ``n_samples`` for the two
    shorter windows.
    """
    quarter_mark = int(0.25 * n_samples)
    half_mark = int(0.50 * n_samples)
    windows = (
        ("full_data", 0),
        ("last_75pct", quarter_mark),
        ("last_50pct", half_mark),
    )
    return [dict(name=label, cutoff=start) for label, start in windows]

# Progress marker: printed once this cell's helper definitions have executed.
print("Utility functions and feature engineering logic defined.")

In [None]:
print("--- Loading and Preparing Data ---")

# Read only the configured columns; the label is needed for train only.
train_df = pd.read_parquet(
    Config.TRAIN_PATH,
    columns=[*Config.FEATURES, Config.LABEL_COLUMN],
)
test_df = pd.read_parquet(Config.TEST_PATH, columns=list(Config.FEATURES))

print(f"Loaded data - Train: {train_df.shape}, Test: {test_df.shape}")

# Train rows with any missing value are dropped; test rows must all be kept,
# so their missing values are replaced with 0 instead.
train_df = feature_engineering(train_df)
train_df = train_df.dropna().reset_index(drop=True)
test_df = feature_engineering(test_df)
test_df = test_df.fillna(0)

# From here on Config.FEATURES also contains the engineered columns
# (every train column except the label).
Config.FEATURES = [name for name in train_df.columns if name != Config.LABEL_COLUMN]
X, y = train_df[Config.FEATURES], train_df[Config.LABEL_COLUMN]
X_test = test_df[Config.FEATURES]

print(f"Final data - Train: {X.shape}, Test: {X_test.shape}")

# Data Loading and Preparation

In [None]:
def load_and_process_data():
    """Load train/test data, engineer features, and keep a selected subset.

    Steps, in order: read the ``Config.CORE_FEATURES`` columns (plus the label
    for train) from the parquet files and the sample submission CSV, apply
    ``feature_engineering``, drop the five raw quantity columns, call
    ``smart_feature_selection`` on the training frame, restrict both frames to
    the returned features, pass them through ``reduce_mem_usage``, and store
    the selection in ``Config.FEATURES``.

    Returns:
        Tuple ``(train_df, test_df, submission_df)``; train and test have
        their index reset.

    NOTE(review): ``Config.CORE_FEATURES``,
    ``Config.FEATURE_SELECTION_SAMPLE_SIZE`` and ``Config.TARGET_FEATURES``
    are not attributes of the ``Config`` class as defined in this file, and
    ``smart_feature_selection`` / ``reduce_mem_usage`` are not defined in the
    visible source. Confirm they exist before calling this function.
    """
    print("Loading data...")

    # Load only necessary columns initially for memory efficiency
    # NOTE(review): CORE_FEATURES is not declared on the visible Config class.
    initial_cols = Config.CORE_FEATURES + [Config.LABEL_COLUMN]
    train_df = pd.read_parquet(Config.TRAIN_PATH, columns=initial_cols)
    test_df = pd.read_parquet(Config.TEST_PATH, columns=Config.CORE_FEATURES)
    submission_df = pd.read_csv(Config.SUBMISSION_PATH)

    print(f"Initial data loaded - Train: {train_df.shape}, Test: {test_df.shape}")

    # Apply feature engineering (requires the five raw quantity columns below
    # to be part of the loaded columns)
    print("Engineering features...")
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # Remove original base features (keep engineered ones)
    to_remove = ["bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume"]
    train_df = train_df.drop(columns=to_remove)
    test_df = test_df.drop(columns=to_remove)

    print(f"After feature engineering - Train: {train_df.shape}, Test: {test_df.shape}")

    # Smart feature selection on recent data
    # NOTE(review): smart_feature_selection is defined outside the visible
    # source; presumably it returns a list of column names, since the result
    # is concatenated with a list below. Confirm.
    print("Performing smart feature selection...")
    selected_features = smart_feature_selection(
        train_df,
        Config.LABEL_COLUMN,
        sample_size=Config.FEATURE_SELECTION_SAMPLE_SIZE,
        top_k=Config.TARGET_FEATURES
    )

    # Keep only selected features
    train_df = train_df[selected_features + [Config.LABEL_COLUMN]]
    test_df = test_df[selected_features]

    # Memory optimization
    # NOTE(review): reduce_mem_usage is defined outside the visible source.
    print("Optimizing memory usage...")
    train_df = reduce_mem_usage(train_df, "train")
    test_df = reduce_mem_usage(test_df, "test")

    # Update config with selected features (overwrites the class attribute,
    # so later code reading Config.FEATURES sees the selection)
    Config.FEATURES = selected_features

    print(f"Final data - Train: {train_df.shape}, Test: {test_df.shape}")
    print(f"Selected features: {len(Config.FEATURES)}")

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True), submission_df

# Time-Slice Ensemble Training


In [None]:
def train_xgb_model(X_train, y_train, X_valid, y_valid, X_test, sample_weights=None):
    """Fit an XGBoost regressor with early stopping and predict valid/test.

    Args:
        X_train, y_train: Training features and target.
        X_valid, y_valid: Validation set used as the early-stopping eval set.
        X_test: Features to predict after fitting.
        sample_weights: Optional per-row training weights.

    Returns:
        Tuple ``(valid_pred, test_pred, model)``.
    """
    # The hyperparameters live on Config; a bare XGB_PARAMS is not a
    # module-level name. Early stopping is configured on the estimator
    # because the fit() keyword was deprecated and later removed in XGBoost.
    # A value already present in Config.XGB_PARAMS takes precedence over the
    # default of 100 rounds.
    params = {"early_stopping_rounds": 100, **Config.XGB_PARAMS}
    model = XGBRegressor(**params)

    model.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )

    valid_pred = model.predict(X_valid)
    test_pred = model.predict(X_test)

    return valid_pred, test_pred, model



def train_and_evaluate(train_df, test_df):
    """Train one XGBoost model per (fold, time slice) and collect predictions.

    For every K-fold split (unshuffled) and every window returned by
    ``get_model_slices``, a model is trained on the fold's training rows that
    fall inside the window, using time-decay sample weights.

    Args:
        train_df: Training frame containing ``Config.FEATURES`` and
            ``Config.LABEL_COLUMN``.
        test_df: Test frame containing ``Config.FEATURES``.

    Returns:
        Tuple ``(oof_preds, test_preds, feature_importance)``, each a dict
        keyed by slice name. Test predictions and importances are averaged
        over the folds that actually produced a model for that slice.
    """
    n_samples = len(train_df)
    # Process the widest window (smallest cutoff) first so that its
    # out-of-fold predictions already exist when narrower slices need them.
    model_slices = sorted(get_model_slices(n_samples), key=lambda s: s["cutoff"])
    fallback_name = next(
        (s["name"] for s in model_slices if s["cutoff"] == 0), None
    )

    # Initialize predictions
    oof_preds = {s["name"]: np.zeros(n_samples) for s in model_slices}
    test_preds = {s["name"]: np.zeros(len(test_df)) for s in model_slices}
    feature_importance = {s["name"]: np.zeros(len(Config.FEATURES)) for s in model_slices}
    fits_per_slice = {s["name"]: 0 for s in model_slices}

    # Fold-invariant inputs: the weights depend only on the window length.
    slice_weights = {
        s["name"]: create_time_decay_weights(n_samples - s["cutoff"])
        for s in model_slices
    }
    X_test = test_df[Config.FEATURES]

    kf = KFold(n_splits=Config.N_FOLDS, shuffle=False)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df), start=1):
        print(f"\n--- Fold {fold}/{Config.N_FOLDS} ---")
        valid_rows = train_df.iloc[valid_idx]
        X_valid = valid_rows[Config.FEATURES]
        y_valid = valid_rows[Config.LABEL_COLUMN]

        for s in model_slices:
            cutoff = s["cutoff"]
            slice_name = s["name"]

            # Training rows of this fold that lie inside the window.
            slice_train_idx = train_idx[train_idx >= cutoff]
            if len(slice_train_idx) == 0:
                continue

            train_rows = train_df.iloc[slice_train_idx]
            X_train = train_rows[Config.FEATURES]
            y_train = train_rows[Config.LABEL_COLUMN]

            # Weights are indexed relative to the start of the window.
            sample_weights = slice_weights[slice_name][slice_train_idx - cutoff]

            print(f"  Training {slice_name}: {len(X_train)} samples")

            # Only the model fit is best-effort; bookkeeping errors below
            # must surface instead of being swallowed.
            try:
                valid_pred, test_pred, model = train_xgb_model(
                    X_train, y_train, X_valid, y_valid, X_test, sample_weights
                )
            except Exception as e:
                print(f"    Error in {slice_name}: {str(e)}")
                continue

            # Store predictions for validation samples in this slice
            mask = valid_idx >= cutoff
            if mask.any():
                oof_preds[slice_name][valid_idx[mask]] = valid_pred[mask]

            # For samples outside the slice, reuse the full-data slice's
            # out-of-fold predictions (the slice whose cutoff is 0).
            if cutoff > 0 and fallback_name is not None and (~mask).any():
                oof_preds[slice_name][valid_idx[~mask]] = \
                    oof_preds[fallback_name][valid_idx[~mask]]

            test_preds[slice_name] += test_pred
            feature_importance[slice_name] += model.feature_importances_
            fits_per_slice[slice_name] += 1

            # Compute validation score (pearsonr needs at least two points)
            if len(valid_idx) >= 2:
                valid_corr = pearsonr(y_valid, valid_pred)[0]
                print(f"    {slice_name} validation correlation: {valid_corr:.4f}")

    # Average over the folds that contributed, not the nominal fold count,
    # so skipped or failed folds do not shrink the predictions.
    for slice_name, n_fits in fits_per_slice.items():
        if n_fits:
            test_preds[slice_name] /= n_fits
            feature_importance[slice_name] /= n_fits

    return oof_preds, test_preds, feature_importance





# Final Ensemble and Submission

In [None]:
# ===== Ensemble and Submission =====
def create_smart_ensemble(train_df, oof_preds, test_preds):
    """Blend per-slice test predictions, weighted by recent OOF correlation.

    Each slice is scored by the Pearson correlation between the label and its
    out-of-fold predictions on the last 20% of ``train_df``, ignoring
    positions whose prediction is exactly 0 (treated as unvalidated).
    Weights are the positive scores normalised to sum to 1; if no score is
    positive, all slices get equal weight.

    Args:
        train_df: Training frame containing ``Config.LABEL_COLUMN``.
        oof_preds: Dict of slice name -> out-of-fold prediction array.
        test_preds: Dict of slice name -> test prediction array.

    Returns:
        Tuple ``(ensemble_test, slice_scores, ensemble_weights)``.
    """
    print("\nEvaluating slice performance...")

    # The evaluation window is the same for every slice, so build it once.
    recent_idx = int(0.8 * len(train_df))  # Last 20% for evaluation
    recent_true = train_df[Config.LABEL_COLUMN].to_numpy()[recent_idx:]

    slice_scores = {}
    for slice_name, slice_oof in oof_preds.items():
        recent_pred = slice_oof[recent_idx:]

        # Remove zeros (unvalidated samples)
        valid_mask = recent_pred != 0

        score = 0
        # pearsonr needs at least two points; with fewer the slice scores 0.
        if valid_mask.sum() >= 2:
            score = pearsonr(recent_true[valid_mask], recent_pred[valid_mask])[0]
            if not np.isfinite(score):
                # Constant input yields NaN; treat it as "no signal" so it
                # cannot poison the weights or later score comparisons.
                score = 0
            print(f"  {slice_name}: {score:.4f} (recent data correlation)")
        slice_scores[slice_name] = score

    # Compute ensemble weights (higher weight for better recent performance)
    total_score = sum(max(0, score) for score in slice_scores.values())
    if total_score > 0:
        ensemble_weights = {k: max(0, v) / total_score for k, v in slice_scores.items()}
    else:
        # Equal weights if all scores are poor
        ensemble_weights = {k: 1.0 / len(slice_scores) for k in slice_scores}

    print("\nEnsemble weights:")
    for slice_name, weight in ensemble_weights.items():
        print(f"  {slice_name}: {weight:.3f}")

    # Create weighted ensemble
    ensemble_test = np.zeros(len(test_preds[list(test_preds.keys())[0]]))
    for slice_name, weight in ensemble_weights.items():
        ensemble_test += weight * test_preds[slice_name]

    return ensemble_test, slice_scores, ensemble_weights

def create_submission(train_df, oof_preds, test_preds, submission_df):
    """Blend the slice predictions and write them to ``submission.csv``.

    The blend comes from ``create_smart_ensemble``. It is stored in the
    ``prediction`` column of a copy of ``submission_df``, which is written
    without the index. Returns the blended prediction array.
    """
    blended, scores_by_slice, blend_weights = create_smart_ensemble(
        train_df, oof_preds, test_preds
    )

    # Report the single best-scoring slice for reference.
    top_name, top_score = max(scores_by_slice.items(), key=lambda item: item[1])
    print(f"\nBest individual slice: {top_name} ({top_score:.4f})")

    # Write the submission file.
    result = submission_df.copy()
    result["prediction"] = blended
    result.to_csv("submission.csv", index=False)

    # Count the slices that contribute more than 1% to the blend.
    active_slices = sum(1 for w in blend_weights.values() if w > 0.01)
    print("\nSubmission created with ensemble prediction")
    print(f"Ensemble uses {active_slices} slices")

    return blended

# Main function

In [None]:
# ===== Main Execution =====
if __name__ == "__main__":
    print("=== Enhanced Crypto Prediction with Smart Feature Selection ===\n")

    # Step 1: data
    print("Step 1: Loading and processing data...")
    train_df, test_df, submission_df = load_and_process_data()

    # Step 2: per-slice models
    print("\nStep 2: Training models on recent data slices...")
    oof_preds, test_preds, feature_importance = train_and_evaluate(train_df, test_df)

    # Step 3: blend and write the submission file
    print("\nStep 3: Creating optimized submission...")
    final_pred = create_submission(train_df, oof_preds, test_preds, submission_df)

    # Rank features by their importance averaged over all slices.
    print("\nTop 15 most important features:")
    avg_importance = np.mean(list(feature_importance.values()), axis=0)
    feature_importance_pairs = sorted(
        zip(Config.FEATURES, avg_importance),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for rank, (feat, imp) in enumerate(feature_importance_pairs[:15], start=1):
        print(f"  {rank:2d}. {feat:35s} - Importance: {imp:.4f}")

    # Closing summary.
    summary_lines = (
        "\n=== Processing Complete! ===",
        "Files created:",
        "- submission.csv (optimized ensemble)",
        f"- Used {len(Config.FEATURES)} carefully selected features",
        "- Focused on recent crypto market patterns",
    )
    for line in summary_lines:
        print(line)