Merging the datasets into one dataset

In [1]:
import pandas as pd
import warnings
from typing import Optional

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

def load_and_merge_data(sentiment_file: str, trader_file: str) -> Optional[pd.DataFrame]:
    """
    Loads, cleans, and merges trader and sentiment data into a single DataFrame.

    This function handles the core preprocessing pipeline. It loads 
    high-frequency trader data and daily market sentiment. It robustly 
    parses multiple timestamp formats (IST strings and Unix ms) from the 
    trader data.
    
    It then filters for relevant trades (only those with a non-zero PnL) 
    and merges the daily sentiment onto each individual trade using the 
    'date' as a common key. Missing sentiment values (e.g., on weekends) 
    are forward-filled.

    Args:
        sentiment_file (str): 
            Filepath for the sentiment data (e.g., 'fear_greed_index.csv').
        trader_file (str): 
            Filepath for the historical trader data (e.g., 'historical_data.csv').

    Returns:
        Optional[pd.DataFrame]: 
            A fully preprocessed and merged DataFrame containing one row 
            per relevant trade, with corresponding sentiment data.
            Returns `None` if files cannot be loaded or if no relevant 
            trades are found after filtering.
    """
    print("Loading data...")
    try:
        df_sentiment = pd.read_csv(sentiment_file)
        df_trader = pd.read_csv(trader_file)
    except FileNotFoundError as e:
        print(f"Error: {e}. Make sure files are in the correct path.")
        return None

    print("Preprocessing...")
    
    # --- 1. Preprocess Sentiment Data ---
    
    # Standardize sentiment 'date' column to datetime objects for merging
    df_sentiment['date'] = pd.to_datetime(df_sentiment['date'])
    
    # Select and rename relevant sentiment columns for clarity and easier access
    df_sentiment = df_sentiment[['date', 'value', 'classification']]
    df_sentiment = df_sentiment.rename(columns={'value': 'sentiment_value', 
                                              'classification': 'sentiment_class'})

    # --- 2. Preprocess Trader Data (Robust Timestamp Conversion) ---
    
    # First, attempt to parse the 'Timestamp IST' string (DD-MM-YYYY format)
    df_trader['Timestamp'] = pd.to_datetime(df_trader['Timestamp IST'], dayfirst=True, errors='coerce')
    
    # Identify rows where the 'Timestamp IST' parsing failed (resulted in NaT)
    failed_mask = df_trader['Timestamp'].isna()
    
    # For *only* the failed rows, attempt to parse the numeric 'Timestamp' column (Unix ms)
    if failed_mask.any():
        print(f"Reparsing {failed_mask.sum()} rows using Unix timestamp...")
        # .loc ensures we only update the failed rows, preserving the successfully parsed ones
        df_trader.loc[failed_mask, 'Timestamp'] = pd.to_datetime(
            df_trader.loc[failed_mask, 'Timestamp'], unit='ms', errors='coerce'
        )

    # Drop any rows that failed *both* parsing attempts
    df_trader = df_trader.dropna(subset=['Timestamp'])
    
    # Create the 'date' merge key by normalizing the timestamp (sets time to 00:00:00)
    df_trader['date'] = df_trader['Timestamp'].dt.normalize()
    
    # --- 3. Filter for Relevant Trades & Convert Types ---
    
    # Ensure key financial columns are numeric, coercing any non-numeric values to NaN
    numeric_cols = ['Closed PnL', 'Size USD', 'Execution Price']
    for col in numeric_cols:
        df_trader[col] = pd.to_numeric(df_trader[col], errors='coerce')
    
    # Create the modeling dataset by selecting *only* closing trades (where PnL is non-zero)
    # This filters out open positions or other non-trade events.
    df_model_data = df_trader[df_trader['Closed PnL'] != 0].copy()
    
    # Drop rows with missing critical values (PnL, Size, Account)
    df_model_data = df_model_data.dropna(subset=['Closed PnL', 'Size USD', 'Account'])
    
    if df_model_data.empty:
        print("Error: No data remaining after filtering for non-zero PnL.")
        return None

    # --- 4. Merge Datasets ---
    print("Merging datasets...")
    
    # --- Debugging: Confirm 'date' key exists in both DataFrames before merging ---
    print(f"Columns in df_model_data: {df_model_data.columns.to_list()}")
    print(f"Columns in df_sentiment: {df_sentiment.columns.to_list()}")
    # --------------------------------------------------------------------------
    
    # Perform a left merge: keep all trades, add sentiment data where it exists
    df_merged = pd.merge(df_model_data, df_sentiment, on='date', how='left')
    
    # Forward-fill missing sentiment values. This is crucial for trades on
    # weekends or holidays when the sentiment index might not update.
    df_merged['sentiment_value'] = df_merged['sentiment_value'].ffill()
    df_merged['sentiment_class'] = df_merged['sentiment_class'].ffill()
    
    # Drop any remaining rows where sentiment could not be filled (e.g., very old trades)
    df_merged = df_merged.dropna()
    
    print(f"Data ready. Total relevant trades for modeling: {len(df_merged)}")
    
    return df_merged

In [2]:
# --- Cell 1: Load and Merge ---

# Import the necessary functions (assuming 'load_and_merge_data' 
# is defined in this cell or in an imported file like 'Helper_Functions.py')
# from Helper_Functions import load_and_merge_data 

print("--- Starting Step 1: Load and Merge Data ---")

# --- Configuration ---
# Define file paths for the datasets.
# Using raw strings (r"...") is a best practice on Windows to prevent
# backslashes from being interpreted as escape characters.
SENTIMENT_FILE = r"C:\Users\lenovo\Downloads\fear_greed_index.csv"
TRADER_FILE = r"C:\Users\lenovo\Downloads\historical_data.csv"
# ---------------------

# Execute the data loading and preprocessing pipeline
# This function (defined previously) will load, clean, merge, 
# and filter the data.
df_merged = load_and_merge_data(SENTIMENT_FILE, TRADER_FILE)

# --- Post-Load Validation ---
# It's crucial to check if the function successfully returned a DataFrame.
# If `df_merged` is None, it means an error occurred during loading 
# (e.g., FileNotFoundError or no data left after filtering).
if df_merged is not None:
    # Confirmation message for a successful run
    print("\nSuccessfully loaded and merged data.")
    
    # Display the first 5 rows of the final DataFrame
    # This is a quick "sanity check" to ensure the merge and 
    # new columns (like 'sentiment_value') look correct.
    print(df_merged.head())
else:
    # Error message if the previous step failed
    print("\nData loading and merging failed. Please check file paths and function logs.")

--- Starting Step 1: Load and Merge Data ---
Loading data...
Preprocessing...
Merging datasets...
Columns in df_model_data: ['Account', 'Coin', 'Execution Price', 'Size Tokens', 'Size USD', 'Side', 'Timestamp IST', 'Start Position', 'Direction', 'Closed PnL', 'Transaction Hash', 'Order ID', 'Crossed', 'Fee', 'Trade ID', 'Timestamp', 'date']
Columns in df_sentiment: ['date', 'sentiment_value', 'sentiment_class']
Data ready. Total relevant trades for modeling: 104408

Successfully loaded and merged data.
                                      Account  Coin  Execution Price  \
0  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           9.0570   
1  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           9.0570   
2  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           9.0480   
3  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           9.0464   
4  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  @107           9.0424   

   Size Tokens  Size USD  Side     Timestamp IST  Start Pos

Feature Engineering

In [3]:
# --- Cell 2: Engineer Features ---

import pandas as pd
from typing import List, Tuple, Optional

def engineer_features(df: pd.DataFrame) -> Optional[Tuple[pd.DataFrame, List[str], str]]:
    """
    Engineers predictive features and creates the target variable for modeling.

    This function takes the merged trade and sentiment DataFrame and performs
    several critical feature engineering steps:
    1.  Sorts data by time to ensure chronological order for historical features.
    2.  Creates the binary target variable 'is_profitable' (1 if PnL > 0, else 0).
    3.  Calculates historical "lagged" features for each trader, such as
        their win rate and average PnL over their last 5 trades. This is
        done using a grouped shift-rolling operation to prevent data leakage.
    4.  Performs one-hot encoding on categorical features ('sentiment_class', 
        'Side', 'Coin') to convert them into a machine-readable format.
    5.  Defines the final list of features (X) and the target (y) and cleans
        up any rows with NaNs resulting from the rolling operations.

    Args:
        df (pd.DataFrame): 
            The merged DataFrame from `load_and_merge_data`. 
            Must contain 'Timestamp', 'Closed PnL', 'Account', 
            'sentiment_class', 'Side', and 'Coin'.

    Returns:
        Optional[Tuple[pd.DataFrame, List[str], str]]: 
            A tuple containing:
            1.  `df_final` (pd.DataFrame): The DataFrame with all features and target.
            2.  `features` (List[str]): The list of all feature column names.
            3.  `target` (str): The name of the target column ('is_profitable').
            Returns (None, None, None) if the DataFrame becomes empty
            after engineering.
    """
    print("Engineering features...")
    
    # Sort by 'Timestamp' to ensure chronological order.
    # This is essential for calculating rolling features correctly.
    df = df.sort_values(by='Timestamp')
    
    # --- 1. Create Target Variable (y) ---
    
    # Create the binary target variable: 1 for a profitable trade, 0 otherwise.
    df['is_profitable'] = (df['Closed PnL'] > 0).astype(int)
    
    # --- 2. Create Historical Trader Features (Lagged Features) ---
    
    # Group by 'Account' to ensure a trader's history is only calculated from
    # their *own* past trades, preventing data leakage between traders.
    
    # --- Trader Win Rate (Past 5 Trades) ---
    # .shift(1): Looks at the *previous* trade's profitability (prevents using
    #            the current trade's outcome to predict itself).
    # .rolling(window=5, min_periods=1): Creates a 5-trade rolling window.
    #                                     min_periods=1 allows calculation
    #                                     for a trader's first few trades.
    # .mean(): Calculates the average (win rate) within that window.
    df['trader_win_rate_past_5'] = df.groupby('Account')['is_profitable'] \
                                     .shift(1) \
                                     .rolling(window=5, min_periods=1) \
                                     .mean()
                                     
    # --- Trader Average PnL (Past 5 Trades) ---
    # Same logic, but calculates the average 'Closed PnL' instead of win rate.
    # This captures the *magnitude* of their recent wins/losses.
    df['trader_avg_pnl_past_5'] = df.groupby('Account')['Closed PnL'] \
                                    .shift(1) \
                                    .rolling(window=5, min_periods=1) \
                                    .mean()

    # --- 3. Handle Categorical Features (One-Hot Encoding) ---
    
    # Convert categorical text data into numeric format.
    # 'dummy_na=False' ensures we don't create a separate column for NaN values.
    df = pd.get_dummies(df, columns=['sentiment_class', 'Side', 'Coin'], 
                        dummy_na=False)

    # --- 4. Finalize Feature List ---
    
    # Define the core numeric features
    features = [
        'sentiment_value',      # The raw Fear & Greed score
        'Size USD',             # The size of the trade
        'trader_win_rate_past_5', # Our engineered historical feature
        'trader_avg_pnl_past_5'   # Our other engineered historical feature
    ]
    
    # Automatically find and add all the new one-hot encoded (OHE) columns
    ohe_cols = [col for col in df.columns if 'sentiment_class_' in col or 
                'Side_' in col or 'Coin_' in col]
    features.extend(ohe_cols)
    
    # --- 5. Clean up ---
    
    # The rolling operations created NaNs for the first few trades
    # (where no history was available). We must drop these rows.
    df_final = df.dropna(subset=features)
    
    if df_final.empty:
        print("Error: No data remaining after feature engineering (dropna).")
        return None, None, None
    
    # Define the target column name
    target = 'is_profitable'
    
    print("Feature engineering complete.")
    return df_final, features, target

# --- Call the function ---

print("\n--- Starting Step 2: Feature Engineering ---")

# Check if the 'df_merged' DataFrame from Cell 1 exists and is not None
if 'df_merged' in locals() and df_merged is not None:
    
    # Run the feature engineering pipeline
    df_final, features, target = engineer_features(df_merged)
    
    # Check if the engineering was successful
    if df_final is not None:
        print(f"\nSuccessfully engineered features.")
        # Optional: Print the feature list for review
        # print(f"Identified features: {features}")
        
        # Display the first 5 rows of the final dataset (target + features)
        # This is a sanity check to see the final data going into the model.
        print(df_final[[target] + features].head())
else:
    # Error if the required input DataFrame is missing
    print("Error: 'df_merged' not found. Please run Cell 1 successfully first.")


--- Starting Step 2: Feature Engineering ---
Engineering features...
Feature engineering complete.

Successfully engineered features.
       is_profitable  sentiment_value  Size USD  trader_win_rate_past_5  \
21020              0             72.0    641.22                1.000000   
21021              0             72.0  24884.62                0.500000   
21024              1             72.0  10219.21                0.333333   
21025              1             72.0   6752.75                0.500000   
21023              1             72.0   8986.03                0.600000   

       trader_avg_pnl_past_5  sentiment_class_Extreme Fear  \
21020             245.428491                         False   
21021             116.099246                         False   
21024             -93.934880                         False   
21025             -62.794722                         False   
21023             -46.472958                         False   

       sentiment_class_Extreme Greed  sen

Split & Scale data

In [4]:
# --- Cell 3: Split and Scale Data ---

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from typing import List, Dict, Any, Optional

def split_and_scale_data(df: pd.DataFrame, features: List[str], target: str) -> Optional[Dict[str, Any]]:
    """
    Splits time-series data, aligns features, and scales for modeling.

    This function performs the crucial data preparation step before modeling.
    It takes the final engineered DataFrame and:
    1.  Splits it into training (80%) and testing (20%) sets based on time.
        **Crucially, it does not shuffle the data**, preserving the
        chronological order, which is essential for time-series validation.
    2.  Aligns the feature columns between train and test sets. This handles
        any categorical features (from one-hot encoding) that might be
        present in one set but not the other, preventing errors during
        prediction.
    3.  Applies `StandardScaler` to the features. It fits the scaler *only*
        on the training data and then uses it to transform both the
        training and test sets. This prevents data leakage.
    4.  Packages all resulting data (scaled, unscaled, scaler object,
        and feature list) into a dictionary for easy access in later steps.

    Args:
        df (pd.DataFrame): 
            The final, cleaned DataFrame from `engineer_features`.
        features (List[str]): 
            The list of all feature column names to be used as model inputs (X).
        target (str): 
            The name of the target variable column (y).

    Returns:
        Optional[Dict[str, Any]]: 
            A dictionary (`data_packages`) containing all data needed for modeling:
            - "unscaled": (X_train, y_train, X_test, y_test)
            - "scaled": (X_train_scaled, y_train, X_test_scaled, y_test)
            - "scaler": The fitted StandardScaler object.
            - "feature_list": The ordered list of feature names.
            Returns `None` if the train or test split results in an empty DataFrame.
    """
    print("Splitting and scaling data...")
    
    # --- 1. Time-Series Split (80% train, 20% test) ---
    
    # Calculate the index for an 80/20 split
    train_size = int(len(df) * 0.8)
    
    # Split the data chronologically.
    # .iloc[:train_size] takes the first 80% (past data) for training.
    train_df = df.iloc[:train_size]
    # .iloc[train_size:] takes the last 20% (future data) for testing.
    test_df = df.iloc[train_size:]

    # Validation check to ensure both splits have data
    if train_df.empty or test_df.empty:
        print("Error: Train or test split resulted in an empty dataframe.")
        return None

    # Separate features (X) and target (y) for both sets
    X_train = train_df[features]
    y_train = train_df[target]
    X_test = test_df[features]
    y_test = test_df[target]

    print(f"Training set size: {len(X_train)} trades")
    print(f"Test set size: {len(X_test)} trades")

    # --- 2. Align Columns ---
    
    # This is critical if a rare category (e.g., a new 'Coin')
    # appears in the test set but was *not* in the training set.
    train_cols = X_train.columns
    test_cols = X_test.columns
    
    # Find columns in training set but not in test set
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test:
        X_test[c] = 0 # Add missing col to test set, fill with 0
        
    # Find columns in test set but not in training set
    missing_in_train = set(test_cols) - set(train_cols)
    for c in missing_in_train:
        X_train[c] = 0 # Add missing col to train set, fill with 0
        
    # Ensure the final column order is identical for both sets
    X_test = X_test[train_cols]

    # --- 3. Scaling ---
    
    # Initialize the scaler
    scaler = StandardScaler()
    
    # Fit the scaler *ONLY* on the training data.
    # This calculates the mean and std dev of the training data.
    X_train_scaled = scaler.fit_transform(X_train)
    
    # Transform the test data using the *same mean and std dev*
    # from the training data. This prevents "leaking" test set info.
    X_test_scaled = scaler.transform(X_test)

    print("Data successfully split and scaled.")
    
    # --- 4. Package for easy use ---
    
    # Store all data versions in a dictionary for convenient passing
    # to the modeling functions.
    data_packages = {
        # Unscaled data (for tree models like Random Forest, XGBoost)
        "unscaled": (X_train, y_train, X_test, y_test),
        # Scaled data (for models like LogReg, SVM, Deep Learning)
        "scaled": (X_train_scaled, y_train, X_test_scaled, y_test),
        # The fitted scaler (to transform new data in the future)
        "scaler": scaler,
        # The master list of features (for feature importance)
        "feature_list": train_cols 
    }
    
    return data_packages

# --- Call the function ---

print("\n--- Starting Step 3: Split and Scale Data ---")

# Check if the 'df_final' DataFrame from Cell 2 exists
if 'df_final' in locals() and df_final is not None:
    
    # Execute the splitting and scaling function
    data_packages = split_and_scale_data(df_final, features, target)
    
    # Check for successful execution
    if data_packages is not None:
        print("\n'data_packages' dictionary is ready.")
        print("It contains 'unscaled', 'scaled', 'scaler', and 'feature_list'.")
else:
    # Error if the required input DataFrame is missing
    print("Error: 'df_final' not found. Please run Cell 1 and Cell 2 successfully first.")


--- Starting Step 3: Split and Scale Data ---
Splitting and scaling data...
Training set size: 83525 trades
Test set size: 20882 trades
Data successfully split and scaled.

'data_packages' dictionary is ready.
It contains 'unscaled', 'scaled', 'scaler', and 'feature_list'.


Training Models

In [5]:
# --- Cell 4: Train and Evaluate Top 5 Models ---

import pandas as pd
from typing import Dict, Any, Tuple
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, f1_score, roc_auc_score)

# Import all the model types
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

# Import Deep Learning (TensorFlow/Keras) libraries
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

def build_deep_learning_model(input_shape: int) -> Model:
    """
    Creates a simple Keras Sequential (MLP) model for binary classification.

    This defines a basic feedforward neural network architecture:
    Input -> Dense(64, relu) -> Dropout(0.3) -> Dense(32, relu) -> 
    Dropout(0.3) -> Output(1, sigmoid)

    Args:
        input_shape (int): 
            The number of input features (e.g., X_train.shape[1]).

    Returns:
        Model: 
            A compiled, untrained Keras model.
    """
    model = Sequential()
    
    # Input layer + first hidden layer. 'input_shape' is only needed for the first layer.
    model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
    
    # Dropout layer: Randomly sets 30% of input units to 0 during training
    # to help prevent overfitting.
    model.add(Dropout(0.3))
    
    # Second hidden layer
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))
    
    # Output layer: 1 neuron with a 'sigmoid' activation function
    # which squashes the output to a probability (0 to 1).
    model.add(Dense(1, activation='sigmoid')) 
    
    # Compile the model
    model.compile(optimizer='adam',
                  # 'binary_crossentropy' is the standard loss function for
                  # two-class (binary) classification problems.
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model

def train_and_evaluate_models(data_packages: Dict[str, Any]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Trains, evaluates, and compares 5 different ML/DL models.

    This function performs a "bake-off" to see which model architecture
    performs best on the data "out of the box."
    
    It trains:
    1. Logistic Regression
    2. Random Forest Classifier
    3. XGBoost Classifier
    4. Support Vector Machine (SVC)
    5. A simple Deep Learning (MLP) model

    It correctly uses scaled data for models that require it (LogReg, SVM, DL)
    and unscaled data for tree-based models (RF, XGB).
    
    Args:
        data_packages (Dict[str, Any]): 
            The dictionary created by `split_and_scale_data`. Must contain
            keys "unscaled" and "scaled", each holding a tuple of
            (X_train, y_train, X_test, y_test).

    Returns:
        Tuple[pd.DataFrame, Dict[str, Any]]:
            1. `results_df` (pd.DataFrame): A DataFrame with models as the
               index and their performance metrics (Accuracy, Precision,
               Recall, F1, ROC-AUC) as columns.
            2. `trained_models` (Dict[str, Any]): A dictionary where keys
               are model names and values are the fitted model objects.
    """
    print("Training and evaluating all models...")
    
    # --- 1. Unpack all our data ---
    
    # Unscaled data (for tree-based models: Random Forest, XGBoost)
    X_train, y_train, X_test, y_test = data_packages['unscaled']
    
    # Scaled data (for LogReg, SVM, and Deep Learning)
    X_train_sc, _, X_test_sc, _ = data_packages['scaled']
    
    # --- 2. Define the models ---
    models = {
        "Logistic Regression": {
            "model": LogisticRegression(max_iter=1000, random_state=42),
            "use_scaled": True # Needs scaled data for convergence
        },
        "Random Forest": {
            # n_jobs=-1 uses all available CPU cores for training
            "model": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
            "use_scaled": False # Tree models are not sensitive to feature scale
        },
        "XGBoost": {
            # eval_metric='logloss' is standard for binary classification
            "model": xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, 
                                     random_state=42),
            "use_scaled": False # Tree models are not sensitive to feature scale
        },
        "SVM (SVC)": {
            # probability=True is required to get predict_proba()
            # which is needed for ROC-AUC score. Can be slow.
            "model": SVC(probability=True, random_state=42), 
            "use_scaled": True # Needs scaled data
        },
        "Deep Learning (MLP)": {
            # Build the model, passing in the number of features
            "model": build_deep_learning_model(X_train_sc.shape[1]),
            "use_scaled": True # Needs scaled data
        }
    }
    
    results = [] # A list to store metric dicts
    trained_models = {} # A dict to store the fitted models
    
    # --- 3. Loop, Train, and Evaluate ---
    for name, m in models.items():
        print(f"--- Training {name} ---")
        
        # Select the correct dataset (scaled or unscaled)
        if m['use_scaled']:
            X_t, X_te = X_train_sc, X_test_sc
        else:
            X_t, X_te = X_train, X_test
            
        # --- Train ---
        if name == "Deep Learning (MLP)":
            # Keras/DL models have a different training/prediction API
            
            # EarlyStopping stops training when validation loss stops
            # improving, preventing overfitting and saving time.
            es = EarlyStopping(monitor='val_loss', patience=10, 
                               restore_best_weights=True)
            
            m['model'].fit(X_t, y_train,
                           epochs=100,       # Max number of passes over the data
                           batch_size=64,
                           # Use 10% of training data as a validation set
                           # for EarlyStopping.
                           validation_split=0.1, 
                           callbacks=[es],
                           verbose=0) # verbose=0 suppresses training logs
            
            # Predict probabilities
            y_pred_proba = m['model'].predict(X_te).ravel() # .ravel() flattens (n, 1) to (n,)
            # Convert probabilities to binary classes (0 or 1)
            y_pred = (y_pred_proba > 0.5).astype(int)
        
        else:
            # Standard sklearn model API
            m['model'].fit(X_t, y_train)
            
            # Predict binary classes
            y_pred = m['model'].predict(X_te)
            # Predict probabilities. [:, 1] selects the probability 
            # of the *positive class* (class 1).
            y_pred_proba = m['model'].predict_proba(X_te)[:, 1] 
            
        # --- Evaluate ---
        # Calculate all key classification metrics
        # zero_division=0 prevents warnings if a class is never predicted
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        
        # Store results
        results.append({
            "Model": name,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
            "ROC-AUC": roc_auc
        })
        
        # Store the fitted model object for later use
        trained_models[name] = m['model'] 
        print(f"--- {name} Complete ---")

    # Convert the list of results into a clean DataFrame
    results_df = pd.DataFrame(results).set_index("Model")
    return results_df, trained_models

# --- Call the function ---

print("\n--- Starting Step 4: Model Training & Evaluation ---")

# Check if the 'data_packages' dictionary from Cell 3 exists
if 'data_packages' in locals():
    
    # This line runs all the training and evaluation
    results_df, trained_models = train_and_evaluate_models(data_packages)
    
    print("\n--- Model Comparison Results ---")
    
    # Sort by ROC-AUC (Area Under the Curve), as it's a robust
    # metric for classification performance, especially with
    # potentially imbalanced classes.
    print(results_df.sort_values(by='ROC-AUC', ascending=False))
    
    # --- Store results for the next notebook cell ---
    # '%store' is a Jupyter "magic command" that saves variables
    # to be loaded in other sessions or notebooks.
    print("\nStoring results for Step 5...")
    %store results_df
    %store trained_models
    
else:
    # Error if the required input dictionary is missing
    print("Error: 'data_packages' not found. Please run all previous cells successfully first.")


--- Starting Step 4: Model Training & Evaluation ---
Training and evaluating all models...
--- Training Logistic Regression ---
--- Logistic Regression Complete ---
--- Training Random Forest ---
--- Random Forest Complete ---
--- Training XGBoost ---
--- XGBoost Complete ---
--- Training SVM (SVC) ---
--- SVM (SVC) Complete ---
--- Training Deep Learning (MLP) ---
[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
--- Deep Learning (MLP) Complete ---

--- Model Comparison Results ---
                     Accuracy  Precision    Recall  F1-Score   ROC-AUC
Model                                                                 
XGBoost              0.952639   0.954180  0.983162  0.968454  0.983070
Logistic Regression  0.956087   0.961548  0.979794  0.970585  0.980927
SVM (SVC)            0.929269   0.927976  0.980442  0.953488  0.976186
Random Forest        0.934202   0.932166  0.982514  0.956678  0.975726
Deep Learning (MLP)  0.936261   0.954633  0.959394  0.95700

Hyperparameter tuning for the best model

In [6]:
# --- Cell 5: Hyperparameter Tuning & Final Analysis ---

from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
import xgboost as xgb
import pandas as pd
import warnings
from typing import Dict, Any
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, f1_score, roc_auc_score)

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

def tune_best_model(data_packages: Dict[str, Any]) -> xgb.XGBClassifier:
    """
    Performs hyperparameter tuning on the XGBoost model.

    This function uses `RandomizedSearchCV` to efficiently search a 
    predefined parameter grid. Crucially, it uses `TimeSeriesSplit` 
    for cross-validation to respect the chronological order of the data, 
    preventing data leakage and providing a realistic performance estimate.

    It optimizes for 'roc_auc', our primary metric for classification
    performance.

    Args:
        data_packages (Dict[str, Any]): 
            The dictionary from `split_and_scale_data`. Must contain
            the "unscaled" key with (X_train, y_train, X_test, y_test).

    Returns:
        xgb.XGBClassifier: 
            The best-performing, fitted XGBoost model object found 
            during the randomized search.
    """
    print("\n--- Hyperparameter Tuning Best Model (XGBoost) ---")
    
    # --- 1. Unpack Data ---
    # XGBoost (a tree-based model) does not require scaled data.
    X_train, y_train, X_test, y_test = data_packages['unscaled']
    
    # --- 2. Define the Model and Parameter Grid ---
    
    # Initialize the base XGBoost model
    xgb_model = xgb.XGBClassifier(eval_metric='logloss', 
                                use_label_encoder=False, 
                                random_state=42)
    
    # Define the "search space" for the tuner.
    # These are the hyperparameters we want to test.
    param_grid = {
        'n_estimators': [100, 200, 300, 500],      # Number of boosting rounds (trees)
        'learning_rate': [0.01, 0.05, 0.1, 0.2],   # Step size shrinkage to prevent overfitting
        'max_depth': [3, 5, 7, 10],                # Maximum depth of an individual tree
        'colsample_bytree': [0.6, 0.8, 1.0],       # Fraction of features used for each tree
        'subsample': [0.6, 0.8, 1.0],              # Fraction of data samples used for each tree
        'gamma': [0, 0.1, 0.5, 1]                  # Minimum loss reduction to make a split (regularization)
    }
    
    # --- 3. Set up Time-Series Cross-Validation ---
    
    # This is *critical* for time-series data.
    # It creates folds like:
    # Fold 1: train=[0], test=[1]
    # Fold 2: train=[0, 1], test=[2]
    # This ensures we always train on the past and validate on the future.
    tscv = TimeSeriesSplit(n_splits=5)
    
    # --- 4. Set up Randomized Search ---
    
    # RandomizedSearchCV is faster than GridSearchCV as it samples 
    # a fixed number of parameter combinations (n_iter).
    random_search = RandomizedSearchCV(
        estimator=xgb_model,                # The model to tune
        param_distributions=param_grid,     # The hyperparameter grid to search
        n_iter=25,                          # Number of random combinations to try
        scoring='roc_auc',                  # The metric to optimize
        cv=tscv,                            # Use our time-series cross-validator
        n_jobs=-1,                          # Use all available CPU cores
        verbose=1,                          # Print progress updates
        random_state=42
    )
    
    # --- 5. Run the Search ---
    print("Running RandomizedSearch... This may take a few minutes.")
    # This fits the 'random_search' object, which in turn
    # fits 'n_iter' * 'n_splits' models.
    random_search.fit(X_train, y_train)
    
    print("\nTuning complete.")
    # Display the best hyperparameter combination found
    print(f"Best parameters found: {random_search.best_params_}")
    # Display the mean cross-validated score of the best estimator
    print(f"Best ROC-AUC score during tuning: {random_search.best_score_:.4f}")
    
    # --- 6. Evaluate the Final Tuned Model ---
    
    # Get the best model identified by the search
    best_model = random_search.best_estimator_
    
    # Evaluate this single best model on the *held-back test set*
    # This is our final, unbiased measure of performance.
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    
    print("\n--- Tuned Model Performance on Test Set ---")
    print(f"Tuned Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Tuned Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"Tuned Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"Tuned F1-Score: {f1_score(y_test, y_pred, zero_division=0):.4f}")
    print(f"Tuned ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    
    return best_model

def show_feature_importance(model, data_packages: Dict[str, Any]):
    """
    Displays the top 10 most important features from the trained model.

    This function is most effective for tree-based models (like 
    Random Forest or XGBoost) that have a `feature_importances_` attribute.

    Args:
        model: 
            A fitted model object (e.g., the returned `best_model` 
            from `tune_best_model`).
        data_packages (Dict[str, Any]): 
            The dictionary from `split_and_scale_data`, used here to
            retrieve the "feature_list".
    """
    try:
        # Get the ordered list of feature names
        feature_list = data_packages['feature_list']
        # Get the importance scores from the fitted model
        importances = model.feature_importances_
        
        # Create a pandas Series for easy sorting and display
        feature_importance_df = pd.Series(
            importances, 
            index=feature_list
        ).sort_values(ascending=False) # Sort from most to least important

        print("\n--- Top 10 Most Important Features ---")
        # Display the top 10 features
        print(feature_importance_df.head(10))

    except Exception as e:
        print(f"Could not get feature importances: {e}")
        print("Note: This is easiest for tree-based models like XGBoost/Random Forest.")

# --- Call the functions ---

print("\n--- Starting Step 5: Hyperparameter Tuning & Final Analysis ---")

# Check if the 'data_packages' dictionary from Cell 3 exists
if 'data_packages' in locals():
    
    # 1. Run the tuning process
    # This will find the best model and print its test set performance.
    final_best_model = tune_best_model(data_packages)
    
    # 2. Show what the model learned
    # This prints the feature importances, providing key insights.
    show_feature_importance(final_best_model, data_packages)
    
else:
    # Error if the required input dictionary is missing
    print("Error: 'data_packages' not found. Please run all previous cells successfully first.")


--- Starting Step 5: Hyperparameter Tuning & Final Analysis ---

--- Hyperparameter Tuning Best Model (XGBoost) ---
Running RandomizedSearch... This may take a few minutes.
Fitting 5 folds for each of 25 candidates, totalling 125 fits

Tuning complete.
Best parameters found: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.5, 'colsample_bytree': 0.6}
Best ROC-AUC score during tuning: 0.9679

--- Tuned Model Performance on Test Set ---
Tuned Accuracy: 0.9528
Tuned Precision: 0.9541
Tuned Recall: 0.9835
Tuned F1-Score: 0.9686
Tuned ROC-AUC: 0.9841

--- Top 10 Most Important Features ---
trader_win_rate_past_5    0.600210
trader_avg_pnl_past_5     0.061947
Coin_@4                   0.027504
Coin_NIL                  0.015013
Coin_ETH                  0.013166
Coin_PENGU                0.012340
Coin_HYPE                 0.011256
Coin_ZRO                  0.008396
Coin_@107                 0.008261
Coin_SOL                  0.007726
dtype: float32
