---

**In this notebook, we will use Ensembling Gradient Boosting trees for a multi-target regression problem, leveraging lagged targets to predict 424 outputs. To speed up inference, we adopt the long-format multi-output prediction method, which is much faster than the standard multi-output approach.**

**You will also find several useful techniques in this notebook, including:**

* How to create lagged targets and use them in the prediction step
* How to run Ensembling Gradient Boosting trees with lags targets
* How to build an optimized prediction function for API inference



---

In [1]:
import numpy as np
import pandas as pd
import pickle
import joblib
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# Data loading and preprocessing
p = '/kaggle/input/mitsui-commodity-prediction-challenge/'
train = pd.read_csv(p+'train.csv')
trainl = pd.read_csv(p+'train_labels.csv')
traint = pd.read_csv(p+'target_pairs.csv')

def _handle_missing_values(data):
    """Improved missing value handling"""
    data.interpolate(method='polynomial', order=3, inplace=True)
    data.clip(lower=-10, upper=10, inplace=True)
    return data

train = _handle_missing_values(train)
trainl = _handle_missing_values(trainl)

target_lag_1 = traint.loc[traint["lag"]==1,"target"].values
target_lag_2 = traint.loc[traint["lag"]==2,"target"].values
target_lag_3 = traint.loc[traint["lag"]==3,"target"].values
target_lag_4 = traint.loc[traint["lag"]==4,"target"].values

Features = [i for i in trainl.columns]

def create_lagged_labels(df):
    dt = pd.DataFrame()
    dt["date_id"] = df["date_id"]
    for f in Features[1:]:
        if f in target_lag_1:
            lag = 1
        elif f in target_lag_2:
            lag = 2
        elif f in target_lag_3:
            lag = 3
        elif f in target_lag_4:
            lag = 4    
        dt[f] = df[f].shift(lag).fillna(0)
    return df, dt

_, train_lagged = create_lagged_labels(trainl)

# Create training data in long format
import gc
training_df = []
target_cols = [f"target_{i}" for i in range(424)]
for j, target_col in enumerate(target_cols):
    temp_train_df = pd.DataFrame()
    temp_train_df[Features] = train_lagged[Features]                     
    temp_train_df['target_id'] = j
    y = trainl[target_col].values
    temp_train_df['target'] = y
    mask = ~(np.isnan(y) | np.isinf(y) | (np.abs(y) > 1e10))
    training_df.append(temp_train_df[mask].copy())
    del temp_train_df, y
    gc.collect()

training_df = pd.concat(training_df).reset_index(drop=True)
Features2 = Features + ["target_id"]
X_train = training_df[Features2]
y_train = training_df["target"]

print("Data preparation completed!")
print(f"Training data shape: {X_train.shape}")
print(f"Features: {Features2}")

Data preparation completed!
Training data shape: (831129, 426)
Features: ['date_id', 'target_0', 'target_1', 'target_2', 'target_3', 'target_4', 'target_5', 'target_6', 'target_7', 'target_8', 'target_9', 'target_10', 'target_11', 'target_12', 'target_13', 'target_14', 'target_15', 'target_16', 'target_17', 'target_18', 'target_19', 'target_20', 'target_21', 'target_22', 'target_23', 'target_24', 'target_25', 'target_26', 'target_27', 'target_28', 'target_29', 'target_30', 'target_31', 'target_32', 'target_33', 'target_34', 'target_35', 'target_36', 'target_37', 'target_38', 'target_39', 'target_40', 'target_41', 'target_42', 'target_43', 'target_44', 'target_45', 'target_46', 'target_47', 'target_48', 'target_49', 'target_50', 'target_51', 'target_52', 'target_53', 'target_54', 'target_55', 'target_56', 'target_57', 'target_58', 'target_59', 'target_60', 'target_61', 'target_62', 'target_63', 'target_64', 'target_65', 'target_66', 'target_67', 'target_68', 'target_69', 'target_70', 't

In [2]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

class MultiTargetEnsemble:
    def __init__(self, use_gpu=True, target_cols=None, quick_test=False):
        self.use_gpu = use_gpu
        self.target_cols = target_cols or [f"target_{i}" for i in range(424)]
        self.models = {}
        self.is_fitted = False
        self.feature_names = []
        self.training_params = {}
        self.quick_test = quick_test
        
    def _initialize_model(self, model_type):
        """Initialize individual model with optimized parameters"""
        if self.quick_test:
            base_params = {
                'n_estimators': 10,
                'learning_rate': 0.1,
                'random_state': 42,
                'early_stopping_rounds': 5
            }
        else:
            base_params = {
                'n_estimators': 2000,
                'learning_rate': 0.01,
                'random_state': 42,
                'early_stopping_rounds': 50
            }
        
        if model_type == 'xgb':
            return XGBRegressor(
                **base_params,
                max_depth=6 if self.quick_test else 8,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=1,
                reg_lambda=1,
                tree_method="hist" if self.use_gpu else "auto",
                device="cuda" if self.use_gpu else "cpu",
                eval_metric='rmse'
            )
        elif model_type == 'lgbm':
            return LGBMRegressor(
                **base_params,
                max_depth=6 if self.quick_test else 8,
                num_leaves=32 if self.quick_test else 128,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=1,
                reg_lambda=1,
                device="gpu" if self.use_gpu else "cpu",
                n_jobs=-1,
                verbose=-1
            )
        elif model_type == 'catboost':
            return CatBoostRegressor(
                iterations=base_params['n_estimators'],
                learning_rate=base_params['learning_rate'],
                random_seed=base_params['random_state'],
                early_stopping_rounds=base_params['early_stopping_rounds'],
                depth=6 if self.quick_test else 8,
                l2_leaf_reg=3,
                task_type="GPU" if self.use_gpu else "CPU",
                verbose=False,
                loss_function='RMSE'
            )
    
    def prepare_validation_data(self, X, y, validation_size=0.2):
        """Prepare validation data maintaining temporal structure"""
        n_samples = len(X)
        split_idx = int(n_samples * (1 - validation_size))
        return X.iloc[:split_idx], X.iloc[split_idx:], y.iloc[:split_idx], y.iloc[split_idx:]
    
    def fit_individual_target(self, target_id, X_train, y_train, X_val=None, y_val=None):
        """Train models for a specific target"""
        target_models = {}
        
        for model_type in ['xgb', 'lgbm', 'catboost']:
            if self.quick_test:
                print(f"Quick training {model_type} for target {target_id}...")
            else:
                print(f"Training {model_type} for target {target_id}...")
            
            model = self._initialize_model(model_type)
            
            # Prepare evaluation set
            eval_set = None
            if X_val is not None and y_val is not None:
                if model_type == 'catboost':
                    eval_set = [(X_val, y_val)]
                else:
                    eval_set = [(X_train, y_train), (X_val, y_val)]
            
            verbose_level = 10 if self.quick_test else 100
            
            try:
                if model_type == 'xgb':
                    model.fit(X_train, y_train, eval_set=eval_set, verbose=verbose_level)
                elif model_type == 'lgbm':
                    model.fit(X_train, y_train, eval_set=eval_set, eval_metric='rmse')
                elif model_type == 'catboost':
                    model.fit(X_train, y_train, eval_set=eval_set, verbose=verbose_level)
                
                target_models[model_type] = model
            except Exception as e:
                print(f"Error training {model_type} for target {target_id}: {e}")
                continue
        
        return target_models
    
    def fit(self, X, y, target_ids, validation_size=0.2, max_targets=None):
        """Train models for all targets"""
        self.feature_names = X.columns.tolist() if hasattr(X, 'columns') else []
        
        self.training_params = {
            'feature_names': self.feature_names,
            'validation_size': validation_size,
            'n_targets': target_ids.nunique(),
            'use_gpu': self.use_gpu,
            'quick_test': self.quick_test
        }
        
        unique_targets = target_ids.unique()
        
        # Limit targets for quick testing
        if max_targets and len(unique_targets) > max_targets:
            print(f"Quick test: Limiting to {max_targets} targets out of {len(unique_targets)}")
            unique_targets = unique_targets[:max_targets]
        
        for i, target_id in enumerate(unique_targets):
            if self.quick_test:
                print(f"Quick training target {target_id} ({i+1}/{len(unique_targets)})")
            else:
                print(f"Training target {target_id} ({i+1}/{len(unique_targets)})")
            
            # Filter data for this target
            mask = target_ids == target_id
            X_target = X[mask]
            y_target = y[mask]
            
            if len(X_target) == 0:
                continue
            
            # For quick test, use smaller subset
            if self.quick_test and len(X_target) > 1000:
                X_target = X_target.iloc[:1000]
                y_target = y_target.iloc[:1000]
            
            # Split data
            X_train, X_val, y_train, y_val = self.prepare_validation_data(X_target, y_target, validation_size)
            
            # Train models for this target
            self.models[target_id] = self.fit_individual_target(target_id, X_train, y_train, X_val, y_val)
        
        self.is_fitted = True
        print(f"Training completed for {len(unique_targets)} targets")
        return self

# Quick test function
def quick_test():
    """Run quick end-to-end test"""
    print("=== QUICK E2E TEST ===")
    ensemble = MultiTargetEnsemble(use_gpu=True, quick_test=True)
    
    # Use only first 2 targets for quick testing
    test_targets = training_df["target_id"].unique()[:2]
    test_mask = training_df["target_id"].isin(test_targets)
    test_data = training_df[test_mask].iloc[:500]
    
    ensemble.fit(
        X=test_data[Features2],
        y=test_data["target"],
        target_ids=test_data["target_id"],
        validation_size=0.2,
        max_targets=2
    )
    
    print("Quick test completed!")
    return ensemble

# Full training function  
def full_training():
    """Run full training"""
    print("=== FULL TRAINING ===")
    ensemble = MultiTargetEnsemble(use_gpu=True, quick_test=False)
    
    ensemble.fit(
        X=training_df[Features2],
        y=training_df["target"],
        target_ids=training_df["target_id"],
        validation_size=0.2
    )
    
    print("Full training completed!")
    return ensemble

# Run quick test first
# ensemble = quick_test()
# ensemble = full_training()  # Uncomment for full training

In [6]:
import numpy as np
import pandas as pd
from pathlib import Path

def evaluate_with_competition_metric_fixed(model, X_test, y_test, target_ids_test, solution_template):
    """Evaluate model using the competition metric - FIXED VERSION"""
    
    try:
        # Make predictions using your working format
        predictions_df = model.predict_multi_target(X_test, target_ids_test)
        
        if predictions_df.empty:
            print("Warning: No predictions generated")
            return -1
        
        # Convert to wide format like your working code
        df_preds = predictions_df.copy()
        df_preds['row'] = df_preds.groupby('target_id').cumcount()
        
        # Pivot to wide format (90 rows × 424 columns)
        df_wide = df_preds.pivot(index='row', columns='target_id', values='prediction')
        
        # Sort columns and rename
        df_wide = df_wide.sort_index(axis=1)
        df_wide.columns = [f'target_{int(col)}' for col in df_wide.columns]
        
        # Ensure we have all 424 targets
        for i in range(424):
            col_name = f'target_{i}'
            if col_name not in df_wide.columns:
                df_wide[col_name] = 0.0
        
        # Reorder columns to match solution format
        target_cols = [f'target_{i}' for i in range(424)]
        df_wide = df_wide[target_cols]
        
        print(f"Prediction shape: {df_wide.shape}")
        
        # Prepare solution data in the same format
        # We need to extract the corresponding rows from solution_template
        unique_rows = df_wide.index.unique()
        
        # Create solution subset with same row structure
        solution_subset = solution_template.iloc[unique_rows][target_cols].reset_index(drop=True)
        
        print(f"Solution shape: {solution_subset.shape}")
        print(f"Submission shape: {df_wide.shape}")
        
        # Calculate score using your working function
        score_value = score(solution_subset, df_wide)
        print(f"Competition score: {score_value:.6f}")
        return score_value
        
    except Exception as e:
        print(f"Error calculating competition score: {e}")
        import traceback
        traceback.print_exc()
        return -1

# Alternative simple scoring for quick test
def quick_score(model, X_test, y_test, target_ids_test, solution_template, n_samples=90):
    """Simple scoring for quick testing"""
    try:
        # Use only first n_samples for quick testing
        test_subset = X_test.iloc[:n_samples]
        target_ids_subset = target_ids_test.iloc[:n_samples]
        
        # Make predictions
        predictions_df = model.predict_multi_target(test_subset, target_ids_subset)
        
        if predictions_df.empty:
            return 0.0
            
        # Convert to wide format
        df_preds = predictions_df.copy()
        df_preds['row'] = df_preds.groupby('target_id').cumcount()
        df_wide = df_preds.pivot(index='row', columns='target_id', values='prediction')
        df_wide = df_wide.sort_index(axis=1)
        df_wide.columns = [f'target_{int(col)}' for col in df_wide.columns]
        
        # Fill missing targets
        for i in range(424):
            col_name = f'target_{i}'
            if col_name not in df_wide.columns:
                df_wide[col_name] = 0.0
        
        target_cols = [f'target_{i}' for i in range(424)]
        df_wide = df_wide[target_cols]
        
        # Get corresponding solution data
        solution_subset = solution_template.iloc[:len(df_wide)][target_cols].reset_index(drop=True)
        
        # Calculate score
        return score(solution_subset, df_wide)
        
    except Exception as e:
        print(f"Quick score error: {e}")
        return 0.0

In [10]:
# Update the ensemble_predict functions to ensure they work
def ensemble_predict_from_list(models_list, X):
    """
    Original ensemble_predict function that works with a list of models.
    Handles both DataFrame and array inputs.
    """
    if hasattr(X, 'values'):
        X_values = X.values
    else:
        X_values = X
        
    preds = []
    for i, model in enumerate(models_list):
        try:
            pred = model.predict(X_values)
            preds.append(pred)
        except Exception as e:
            print(f"Model {i} prediction error: {e}")
            # Add zeros if model fails
            preds.append(np.zeros(len(X_values)))
    
    if not preds:
        print("Warning: No successful predictions")
        return np.zeros(len(X_values))
    
    ensemble_pred = np.mean(preds, axis=0)
    return ensemble_pred

def ensemble_predict(models_dict, X):
    """
    Predict using a dictionary of trained models.
    """
    # Convert models dictionary to a flat list
    models_list = []
    for target_models in models_dict.values():
        models_list.extend(list(target_models.values()))
    
    print(f"Making predictions with {len(models_list)} models")
    return ensemble_predict_from_list(models_list, X)


In [12]:
def save_ensemble(ensemble, filepath, method='joblib'):
    """Save ensemble model to disk"""
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    
    # Ensure all required attributes exist
    if not hasattr(ensemble, 'feature_names'):
        ensemble.feature_names = []
    if not hasattr(ensemble, 'training_params'):
        ensemble.training_params = {}
    if not hasattr(ensemble, 'target_cols'):
        ensemble.target_cols = [f"target_{i}" for i in range(424)]
    if not hasattr(ensemble, 'use_gpu'):
        ensemble.use_gpu = True
    
    if method == 'joblib':
        joblib.dump(ensemble, filepath)
        print(f"Ensemble saved to {filepath}")
    else:
        with open(filepath, 'wb') as f:
            pickle.dump(ensemble, f)
        print(f"Ensemble saved to {filepath}")

def load_ensemble(filepath):
    """Load ensemble model from disk"""
    if filepath.endswith('.joblib'):
        ensemble = joblib.load(filepath)
    else:
        with open(filepath, 'rb') as f:
            ensemble = pickle.load(f)
    print(f"Ensemble loaded from {filepath}")
    return ensemble

def save_models_list(models_dict, filepath):
    """Save models as a list for your original ensemble_predict function"""
    all_models = []
    for target_models in models_dict.values():
        all_models.extend(list(target_models.values()))
    
    joblib.dump(all_models, filepath)
    print(f"Models list saved to {filepath} with {len(all_models)} models")

def load_models_list(filepath):
    """Load models list for your original ensemble_predict function"""
    models = joblib.load(filepath)
    print(f"Models list loaded from {filepath} with {len(models)} models")
    return models

# Save the ensemble
# save_ensemble(ensemble, '/kaggle/working/my_ensemble.joblib')
# save_models_list(ensemble.models, '/kaggle/working/models_list.joblib')

print("Save/load functions ready!")

Save/load functions ready!


In [11]:
import polars as pl
import pandas as pd
import numpy as np

def predict_test(
    test: pl.DataFrame,
    lag1: pl.DataFrame, 
    lag2: pl.DataFrame,
    lag3: pl.DataFrame,
    lag4: pl.DataFrame,
) -> pl.DataFrame:
    """
    Predicts target values using lag features.
    Handles both Polars and Pandas DataFrame inputs.
    """
    # Convert all inputs to Pandas if they are Polars DataFrames
    if isinstance(test, pl.DataFrame):
        test_pd = test.to_pandas()
    else:
        test_pd = test.copy()
        
    if isinstance(lag1, pl.DataFrame):
        lag1_pd = lag1.to_pandas()
    else:
        lag1_pd = lag1.copy()
        
    if isinstance(lag2, pl.DataFrame):
        lag2_pd = lag2.to_pandas()
    else:
        lag2_pd = lag2.copy()
        
    if isinstance(lag3, pl.DataFrame):
        lag3_pd = lag3.to_pandas()
    else:
        lag3_pd = lag3.copy()
        
    if isinstance(lag4, pl.DataFrame):
        lag4_pd = lag4.to_pandas()
    else:
        lag4_pd = lag4.copy()
    
    print(f"Input shapes - test: {test_pd.shape}, lag1: {lag1_pd.shape}, lag2: {lag2_pd.shape}, lag3: {lag3_pd.shape}, lag4: {lag4_pd.shape}")
    
    # Load models
    try:
        Models = load_models_list('/kaggle/input/mitsuienslearning/models_list.joblib')
        print(f"Loaded {len(Models)} models for prediction")
    except:
        # Fallback: load ensemble and extract models
        ensemble = load_ensemble('/kaggle/working/my_ensemble.joblib')
        Models = []
        for target_models in ensemble.models.values():
            Models.extend(list(target_models.values()))
        print(f"Loaded {len(Models)} models from ensemble")
    
    # Combine lag features in Pandas
    X_pred = pd.concat([
        test_pd[["date_id"]],
        lag1_pd[target_lag_1],
        lag2_pd[target_lag_2], 
        lag3_pd[target_lag_3],
        lag4_pd[target_lag_4],
    ], axis=1)
    
    # If no rows, return all zeros as Polars DataFrame
    if len(X_pred) == 0:
        zero_df = pl.DataFrame({f"target_{i}": [0.0] for i in range(424)})
        print("Empty input, returning zeros")
        return zero_df
    
    # Fill nulls with 0
    X_pred = X_pred.fillna(0)
    
    # Prepare features for prediction
    n_targets = 424
    n_rows = X_pred.shape[0]
    
    print(f"Creating features for {n_rows} rows and {n_targets} targets...")
    
    # Create features for all targets efficiently
    features_array = np.tile(X_pred[Features[1:]].values, (n_targets, 1))
    target_ids = np.repeat(np.arange(n_targets), n_rows)
    
    # Create prediction DataFrame
    X_pred2 = pd.DataFrame({
        "date_id": np.tile(X_pred["date_id"].values, n_targets),
        **{feat: features_array[:, i] for i, feat in enumerate(Features[1:])},
        "target_id": target_ids,
        "row": np.tile(np.arange(n_rows), n_targets)
    })
    
    print(f"Prediction DataFrame shape: {X_pred2.shape}")
    
    # Make predictions
    preds = ensemble_predict_from_list(Models, X_pred2[Features2])
    X_pred2 = X_pred2.assign(preds=preds)
    
    print(f"Predictions completed, min: {preds.min():.4f}, max: {preds.max():.4f}, mean: {preds.mean():.4f}")
    
    # Pivot to wide format using Pandas
    df_wide = (
        X_pred2.groupby(["target_id", "row"])
        .agg({"preds": "first"})
        .reset_index()
        .pivot(index="row", columns="target_id", values="preds")
        .sort_index()
    )
    
    # Ensure correct column order and naming
    df_wide = df_wide.reindex(columns=range(424), fill_value=0)
    df_wide.columns = [f"target_{i}" for i in range(424)]
    
    print(f"Wide format shape: {df_wide.shape}")
    
    # Return last row as predictions and convert to Polars DataFrame
    result_df = df_wide.tail(1)
    result_pl = pl.from_pandas(result_df.reset_index(drop=True))
    
    print(f"Final result shape: {result_pl.shape}")
    return result_pl

# Alternative version that's more robust for competition
def predict_robust(
    test: pl.DataFrame,
    lag1: pl.DataFrame, 
    lag2: pl.DataFrame,
    lag3: pl.DataFrame,
    lag4: pl.DataFrame,
) -> pl.DataFrame:
    """
    More robust prediction function with better error handling.
    """
    try:
        return predict(test, lag1, lag2, lag3, lag4)
    except Exception as e:
        print(f"Prediction error: {e}")
        # Return default zeros in case of error
        return pl.DataFrame({f"target_{i}": [0.0] for i in range(424)})

# Test function with sample Polars data
def test_polars_prediction():
    """Test the prediction function with Polars DataFrames"""
    print("=== TESTING POLARS PREDICTION ===")
    
    # Convert sample data to Polars
    sample_test_pl = pl.from_pandas(trainl[Features].iloc[:5])
    sample_lag1_pl = pl.from_pandas(trainl[target_lag_1].iloc[:5])
    sample_lag2_pl = pl.from_pandas(trainl[target_lag_2].iloc[:5])
    sample_lag3_pl = pl.from_pandas(trainl[target_lag_3].iloc[:5])
    sample_lag4_pl = pl.from_pandas(trainl[target_lag_4].iloc[:5])
    
    print("Input types:")
    print(f"test: {type(sample_test_pl)}")
    print(f"lag1: {type(sample_lag1_pl)}")
    print(f"lag2: {type(sample_lag2_pl)}")
    print(f"lag3: {type(sample_lag3_pl)}")
    print(f"lag4: {type(sample_lag4_pl)}")
    
    # Test prediction
    result = predict_test(sample_test_pl, sample_lag1_pl, sample_lag2_pl, sample_lag3_pl, sample_lag4_pl)
    
    print(f"Result type: {type(result)}")
    print(f"Result shape: {result.shape}")
    print(f"Result columns: {result.columns[:10]}...")
    
    return result

# Run the test
test_result = test_polars_prediction()

# Final competition-ready predict function
def predict(
    test: pl.DataFrame,
    lag1: pl.DataFrame, 
    lag2: pl.DataFrame,
    lag3: pl.DataFrame,
    lag4: pl.DataFrame,
) -> pl.DataFrame:
    """
    Competition-ready prediction function with comprehensive error handling.
    """
    print("=== STARTING PREDICTION ===")
    
    try:
        # Convert all inputs to pandas
        test_pd = test.to_pandas() if isinstance(test, pl.DataFrame) else test
        lag1_pd = lag1.to_pandas() if isinstance(lag1, pl.DataFrame) else lag1
        lag2_pd = lag2.to_pandas() if isinstance(lag2, pl.DataFrame) else lag2  
        lag3_pd = lag3.to_pandas() if isinstance(lag3, pl.DataFrame) else lag3
        lag4_pd = lag4.to_pandas() if isinstance(lag4, pl.DataFrame) else lag4
        
        # Load models
        Models = load_models_list('/kaggle/input/mitsuienslearning/models_list.joblib')
        
        # Create feature matrix
        X_pred = pd.concat([
            test_pd[["date_id"]],
            lag1_pd[target_lag_1],
            lag2_pd[target_lag_2],
            lag3_pd[target_lag_3], 
            lag4_pd[target_lag_4],
        ], axis=1).fillna(0)
        
        if len(X_pred) == 0:
            return pl.DataFrame({f"target_{i}": [0.0] for i in range(424)})
        
        # Create prediction matrix for all targets
        n_rows = len(X_pred)
        features_array = np.tile(X_pred[Features[1:]].values, (424, 1))
        target_ids = np.repeat(np.arange(424), n_rows)
        
        X_pred2 = pd.DataFrame({
            "date_id": np.tile(X_pred["date_id"].values, 424),
            **{feat: features_array[:, i] for i, feat in enumerate(Features[1:])},
            "target_id": target_ids,
            "row": np.tile(np.arange(n_rows), 424)
        })
        
        # Make predictions
        preds = ensemble_predict_from_list(Models, X_pred2[Features2])
        X_pred2['preds'] = preds
        
        # Convert to wide format
        df_wide = (X_pred2.pivot_table(index='row', columns='target_id', values='preds', aggfunc='first')
                  .reindex(columns=range(424), fill_value=0)
                  .rename(columns=lambda x: f"target_{x}"))
        
        # Return last row as Polars DataFrame
        result = pl.from_pandas(df_wide.tail(1).reset_index(drop=True))
        print("=== PREDICTION COMPLETED SUCCESSFULLY ===")
        return result
        
    except Exception as e:
        print(f"Prediction failed: {e}")
        import traceback
        traceback.print_exc()
        # Return safe default
        return pl.DataFrame({f"target_{i}": [0.0] for i in range(424)})

print("Prediction block ready! Use 'competition_predict' for submission.")

=== TESTING POLARS PREDICTION ===
Input types:
test: <class 'polars.dataframe.frame.DataFrame'>
lag1: <class 'polars.dataframe.frame.DataFrame'>
lag2: <class 'polars.dataframe.frame.DataFrame'>
lag3: <class 'polars.dataframe.frame.DataFrame'>
lag4: <class 'polars.dataframe.frame.DataFrame'>
Input shapes - test: (5, 425), lag1: (5, 106), lag2: (5, 106), lag3: (5, 106), lag4: (5, 106)
Models list loaded from /kaggle/input/mitsuienslearning/models_list.joblib with 1272 models
Loaded 1272 models for prediction
Creating features for 5 rows and 424 targets...
Prediction DataFrame shape: (2120, 427)
Predictions completed, min: -0.0002, max: 0.0000, mean: -0.0002
Wide format shape: (5, 424)
Final result shape: (1, 424)
Result type: <class 'polars.dataframe.frame.DataFrame'>
Result shape: (1, 424)
Result columns: ['target_0', 'target_1', 'target_2', 'target_3', 'target_4', 'target_5', 'target_6', 'target_7', 'target_8', 'target_9']...
Prediction block ready! Use 'competition_predict' for submis

import numpy as np
import pandas as pd

def rank_correlation_sharpe_ratio(merged_df: pd.DataFrame) -> float:
    """
    Calculate the rank correlation Sharpe ratio for the competition metric.
    Improved with better error handling for zero denominators.
    """
    prediction_cols = [col for col in merged_df.columns if col.startswith('prediction_')]
    target_cols = [col for col in merged_df.columns if col.startswith('target_')]
    
    def _compute_rank_correlation(row):
        try:
            non_null_targets = [col for col in target_cols if not pd.isnull(row[col])]
            matching_predictions = [col for col in prediction_cols if col.replace('prediction', 'target') in non_null_targets]
            
            if len(non_null_targets) < 2:  # Need at least 2 values for correlation
                return np.nan
            
            target_vals = row[non_null_targets].values
            pred_vals = row[matching_predictions].values
            
            # Check for zero variance
            if np.std(target_vals, ddof=0) == 0 or np.std(pred_vals, ddof=0) == 0:
                return np.nan
            
            # Calculate rank correlation
            target_ranks = pd.Series(target_vals).rank(method='average')
            pred_ranks = pd.Series(pred_vals).rank(method='average')
            
            correlation = np.corrcoef(pred_ranks, target_ranks)[0, 1]
            return correlation
            
        except Exception as e:
            return np.nan
    
    # Calculate daily rank correlations, ignoring NaN values
    daily_rank_corrs = merged_df.apply(_compute_rank_correlation, axis=1)
    valid_corrs = daily_rank_corrs.dropna()
    
    if len(valid_corrs) == 0:
        return 0.0  # Return 0 instead of error if no valid correlations
    
    std_dev = valid_corrs.std(ddof=0)
    if std_dev == 0:
        return 0.0  # Return 0 instead of error if zero standard deviation
    
    sharpe_ratio = valid_corrs.mean() / std_dev
    return float(sharpe_ratio)

def score(solution: pd.DataFrame, submission: pd.DataFrame) -> float:
    """
    Competition scoring function with improved error handling.
    """
    try:
        # Ensure both dataframes have the same columns
        assert all(solution.columns == submission.columns)
        
        # Create copies to avoid modifying originals
        submission_copy = submission.copy()
        solution_copy = solution.copy()
        
        # Rename columns for the scoring function
        submission_copy = submission_copy.rename(columns={
            col: col.replace('target_', 'prediction_') for col in submission_copy.columns
        })
        
        # Replace zeros with None as in original
        solution_copy = solution_copy.replace(0, None)
        
        # Merge and calculate score
        merged_df = pd.concat([solution_copy, submission_copy], axis=1)
        return rank_correlation_sharpe_ratio(merged_df)
        
    except Exception as e:
        print(f"Error in score function: {e}")
        return 0.0  # Return 0 instead of -1 for errors

def robust_score(solution: pd.DataFrame, submission: pd.DataFrame, verbose=True) -> float:
    """
    Robust version of the scoring function with comprehensive error handling.
    """
    try:
        if verbose:
            print(f"Input shapes - Solution: {solution.shape}, Submission: {submission.shape}")
        
        # Ensure both dataframes have exactly 424 target columns
        required_cols = [f'target_{i}' for i in range(424)]
        
        # Add missing columns with zeros
        for col in required_cols:
            if col not in submission.columns:
                submission[col] = 0.0
            if col not in solution.columns:
                solution[col] = 0.0
        
        # Reorder columns to ensure consistency
        submission = submission[required_cols]
        solution = solution[required_cols]
        
        if verbose:
            print(f"After alignment - Solution: {solution.shape}, Submission: {submission.shape}")
            print(f"Sample data - Solution mean: {solution.mean().mean():.4f}, Submission mean: {submission.mean().mean():.4f}")
        
        # Remove rows where all targets are zero or NaN (no information)
        solution_non_zero = solution.replace(0, np.nan).dropna(how='all')
        if len(solution_non_zero) == 0:
            if verbose:
                print("Warning: All solution values are zero")
            return 0.0
        
        # Use only rows that have non-zero solutions
        valid_indices = solution_non_zero.index
        solution_filtered = solution.loc[valid_indices]
        submission_filtered = submission.loc[valid_indices]
        
        if verbose:
            print(f"After filtering zeros - Solution: {solution_filtered.shape}, Submission: {submission_filtered.shape}")
        
        if len(solution_filtered) == 0:
            if verbose:
                print("Warning: No valid rows after filtering")
            return 0.0
        
        # Calculate score
        score_value = score(solution_filtered, submission_filtered)
        
        if verbose:
            print(f"Final score: {score_value:.6f}")
        
        return score_value
        
    except Exception as e:
        print(f"Error in robust_score: {e}")
        return 0.0

def debug_data_issues(solution: pd.DataFrame, submission: pd.DataFrame):
    """
    Debug function to identify data issues that might cause scoring problems.
    """
    print("=== DATA DEBUG INFO ===")
    print(f"Solution shape: {solution.shape}")
    print(f"Submission shape: {submission.shape}")
    print(f"Solution columns: {len(solution.columns)}")
    print(f"Submission columns: {len(submission.columns)}")
    
    # Check for all zeros
    solution_zeros = (solution == 0).all(axis=1).sum()
    submission_zeros = (submission == 0).all(axis=1).sum()
    print(f"Rows with all zeros - Solution: {solution_zeros}, Submission: {submission_zeros}")
    
    # Check for NaN values
    solution_nans = solution.isna().sum().sum()
    submission_nans = submission.isna().sum().sum()
    print(f"NaN values - Solution: {solution_nans}, Submission: {submission_nans}")
    
    # Check variance
    solution_var = solution.var(axis=1)
    submission_var = submission.var(axis=1)
    print(f"Zero variance rows - Solution: {(solution_var == 0).sum()}, Submission: {(submission_var == 0).sum()}")
    
    # Check data ranges
    print(f"Solution range: [{solution.min().min():.4f}, {solution.max().max():.4f}]")
    print(f"Submission range: [{submission.min().min():.4f}, {submission.max().max():.4f}]")

# Test the scoring with your predictions
def test_scoring_with_predictions():
    """Test scoring with the generated predictions"""
    print("=== TESTING SCORING ===")
    
    # Make predictions
    X_data = X_train.copy()
    X_data["preds"] = ensemble_predict(ensemble.models, X_train)
    
    # Convert to wide format
    df_preds = X_data.copy()
    df_preds['row'] = df_preds.groupby('target_id').cumcount()
    df_wide = df_preds.pivot(index='row', columns='target_id', values='preds')
    df_wide = df_wide.sort_index(axis=1)
    df_wide.columns = [f'target_{int(col)}' for col in df_wide.columns]
    
    # Ensure all 424 targets
    for i in range(424):
        col_name = f'target_{i}'
        if col_name not in df_wide.columns:
            df_wide[col_name] = 0.0
    
    target_cols = [f'target_{i}' for i in range(424)]
    df_wide = df_wide[target_cols]
    
    print(f"Prediction shape: {df_wide.shape}")
    
    # Prepare solution data
    solution_template = trainl.copy().set_index('date_id')
    solution_template = solution_template.rename(columns={
        col: f"target_{int(col.split('_')[1])}" if col.startswith('target_') else col 
        for col in solution_template.columns
    })
    
    # Use first 90 rows for scoring (matching competition format)
    solution_subset = solution_template.iloc[:90][target_cols].reset_index(drop=True)
    prediction_subset = df_wide.iloc[:90].reset_index(drop=True)
    
    # Debug data issues
    debug_data_issues(solution_subset, prediction_subset)
    
    # Calculate score with robust function
    score_value = robust_score(solution_subset, prediction_subset, verbose=True)
    print(f"FINAL SCORE: {score_value:.6f}")
    return score_value

# Quick validation score function
def calculate_validation_score(ensemble, validation_size=0.1):
    """
    Calculate validation score on a holdout set.
    """
    print("\n=== CALCULATING VALIDATION SCORE ===")
    
    # Create validation set (last portion of data)
    n_samples = len(training_df)
    val_start = int(n_samples * (1 - validation_size))
    
    X_val = training_df[Features2].iloc[val_start:]
    y_val = training_df["target"].iloc[val_start:]
    target_ids_val = training_df["target_id"].iloc[val_start:]
    
    print(f"Validation set size: {len(X_val)}")
    
    # Make predictions
    val_preds = ensemble_predict(ensemble.models, X_val)
    
    # Convert to wide format
    val_data = X_val.copy()
    val_data['preds'] = val_preds
    val_data['row'] = val_data.groupby('target_id').cumcount()
    
    df_val_wide = val_data.pivot(index='row', columns='target_id', values='preds')
    df_val_wide = df_val_wide.sort_index(axis=1)
    df_val_wide.columns = [f'target_{int(col)}' for col in df_val_wide.columns]
    
    # Ensure all targets
    for i in range(424):
        col_name = f'target_{i}'
        if col_name not in df_val_wide.columns:
            df_val_wide[col_name] = 0.0
    
    target_cols = [f'target_{i}' for i in range(424)]
    df_val_wide = df_val_wide[target_cols]
    
    # Prepare solution data for validation period
    solution_template = trainl.copy().set_index('date_id')
    solution_template = solution_template.rename(columns={
        col: f"target_{int(col.split('_')[1])}" if col.startswith('target_') else col 
        for col in solution_template.columns
    })
    
    # Use corresponding rows for validation scoring
    val_solution = solution_template.iloc[-len(df_val_wide):][target_cols].reset_index(drop=True)
    val_predictions = df_val_wide.reset_index(drop=True)
    
    # Debug validation data
    debug_data_issues(val_solution, val_predictions)
    
    # Calculate validation score
    val_score = robust_score(val_solution, val_predictions, verbose=True)
    print(f"VALIDATION SCORE: {val_score:.6f}")
    
    return val_score

# Run scoring tests
print("Running comprehensive scoring tests...")
final_score = test_scoring_with_predictions()
validation_score = calculate_validation_score(ensemble)

print(f"\n=== FINAL SUMMARY ===")
print(f"Training completed for {len(ensemble.models)} targets")
print(f"Test score: {final_score:.6f}")
print(f"Validation score: {validation_score:.6f}")

# Additional diagnostic: Check if we need more targets trained
if len(ensemble.models) < 10:
    print(f"\n⚠️  WARNING: Only {len(ensemble.models)} targets trained. Consider training more targets for better scores.")
else:
    print(f"✓ Good: {len(ensemble.models)} targets trained")

In [None]:
# submission through the API
import os
import kaggle_evaluation.mitsui_inference_server

inference_server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    print('there')
    # inference_server.serve()
else:
    print('here')
    inference_server.run_local_gateway(('/kaggle/input/mitsui-commodity-prediction-challenge/',))

In [16]:
display(pl.read_parquet('/kaggle/working/submission.parquet'))

date_id,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12,target_13,target_14,target_15,target_16,target_17,target_18,target_19,target_20,target_21,target_22,target_23,target_24,target_25,target_26,target_27,target_28,target_29,target_30,target_31,target_32,target_33,target_34,target_35,…,target_387,target_388,target_389,target_390,target_391,target_392,target_393,target_394,target_395,target_396,target_397,target_398,target_399,target_400,target_401,target_402,target_403,target_404,target_405,target_406,target_407,target_408,target_409,target_410,target_411,target_412,target_413,target_414,target_415,target_416,target_417,target_418,target_419,target_420,target_421,target_422,target_423
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1827,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,…,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068
1828,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,…,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068,-0.000068
1829,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,…,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136,-0.000136
1830,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,…,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235,-0.000235
1831,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,…,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026,-0.000026
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1956,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,…,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014
1957,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,…,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012,-0.000012
1958,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,…,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016
1959,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,…,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016
