# In [None]:
# importing the useful libraries for the work
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score, log_loss, classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from scipy import stats
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


# ============================================================================
# 1. USER CONFIGURATION
# ============================================================================

# Locations of the training and test CSV files loaded with pd.read_csv.
TRAIN_PATH = "/kaggle/input/mock-test-2-mse-2/train.csv"
TEST_PATH = "/kaggle/input/mock-test-2-mse-2/test.csv"

# TARGET_COL can be a string (single target) or list of strings (multiple targets)
# A string selects single-target mode, a list selects multi-target mode; any
# other type makes the target-detection step raise ValueError.
TARGET_COL = "Status"  # Change to ["Class1", "Class2"] for multiple targets
# Example: TARGET_COL = ["target1", "target2", "target3"]

# Identifier column: removed from the feature tables, and the test values are
# kept aside to label the rows of the submission file.
ID_COL = "id"
# Path the final predictions are written to as CSV.
OUTPUT_FILE = "submission.csv"

# Toggle heavy steps
DO_PLOTTING = True  # draw the target / EDA / confusion-matrix figures
DO_OUTLIER_CAP = False  # clip numeric features to their IQR fences
DO_HYPERPARAM_TUNING = True  # run RandomizedSearchCV after the baseline fit
RANDOM_STATE = 42  # seed passed to the split, the forests and the search


# ============================================================================
# 2. LOAD DATA
# ============================================================================

print("=" * 80, "LOADING DATA", "=" * 80, sep="\n")

# Read both splits from the configured CSV paths.
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)

print(f"\nTrain shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

# Set the test identifiers aside for the submission (None when the column is
# absent), then remove the identifier column from the test features.
if ID_COL in test_data.columns:
    test_ids = test_data[ID_COL].copy()
    test_data = test_data.drop(columns=[ID_COL])
else:
    test_ids = None

# The identifier carries no signal for the model, so drop it from train too.
if ID_COL in train_data.columns:
    train_data = train_data.drop(columns=[ID_COL])


# ============================================================================
# 3. DETECT TARGET CONFIGURATION (Single vs Multiple Targets)
# ============================================================================

print("\n" + "=" * 80, "TARGET CONFIGURATION DETECTION", "=" * 80, sep="\n")

# Reject unsupported configuration types up front.
if not isinstance(TARGET_COL, (str, list)):
    raise ValueError("TARGET_COL must be a string or list of strings")

# A list means multi-target mode; a bare string is wrapped into a one-element
# list so the rest of the script can always iterate over `target_cols`.
is_multi_target = isinstance(TARGET_COL, list)
if is_multi_target:
    target_cols = TARGET_COL
    print(f"\n✓ Multi-target mode: {len(target_cols)} targets")
    print(f"  Targets: {target_cols}")
else:
    target_cols = [TARGET_COL]
    print(f"\n✓ Single target mode: '{TARGET_COL}'")

# Every configured target must be a column of the training table.
missing_targets = [name for name in target_cols if name not in train_data.columns]
if missing_targets:
    raise ValueError(f"Target columns not found in train data: {missing_targets}")

print(f"\nNumber of targets: {len(target_cols)}")


# ============================================================================
# 4. DATA CLEANING
# ============================================================================

print("\n" + "=" * 80)
print("DATA CLEANING")
print("=" * 80)

# Basic info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nTrain data info:")
train_data.info()
print("\nTest data info:")
test_data.info()

# Duplicates: exact duplicate rows are removed from the training table only.
train_dups = train_data.duplicated().sum()
print(f"\nDuplicate rows in train: {train_dups}")
if train_dups > 0:
    print("Dropping duplicate rows from train.")
    train_data = train_data.drop_duplicates().reset_index(drop=True)

# Missing value summary
print("\n" + "-" * 80)
print("MISSING VALUES ANALYSIS")
print("-" * 80)

# Per-column missing counts (train counts are taken after de-duplication).
train_missing = train_data.isnull().sum()
test_missing = test_data.isnull().sum()

# Same report for both splits: affected columns, total, and share of all cells.
for split_name, frame, missing in (
    ("TRAIN", train_data, train_missing),
    ("TEST", test_data, test_missing),
):
    print(f"\nMissing values in {split_name}:")
    total_missing = missing.sum()
    if total_missing == 0:
        print("  No missing values!")
        continue
    print(missing[missing > 0])
    print(f"\nTotal missing: {total_missing}")
    # frame.size is rows * columns, i.e. the total number of cells.
    print(f"Percentage: {(total_missing / frame.size * 100):.2f}%")

# Detect columns with excessive missing values
high_missing_cols = train_missing[train_missing > len(train_data) * 0.5].index.tolist()
if high_missing_cols:
    print(f"\n⚠ WARNING: Columns with >50% missing values: {high_missing_cols}")


# ============================================================================
# 5. TARGET VARIABLE ANALYSIS
# ============================================================================

print("\n" + "=" * 80, "TARGET VARIABLE ANALYSIS", "=" * 80, sep="\n")

# Snapshot of the training table used only for the plots below.
train_viz = train_data.copy()

# Per-target summary: class count, sorted class list and number of NaN labels.
target_info = {}
for target_col in target_cols:
    print("\n" + "-" * 80)
    print(f"Target: {target_col}")
    print("-" * 80)

    labels_column = train_data[target_col]

    # Missing labels are only reported here; rows are removed in a later step.
    nan_count = labels_column.isnull().sum()
    if nan_count > 0:
        print(f"⚠ WARNING: {nan_count} NaN values in target '{target_col}'")

    # Distinct non-null classes, in sorted order.
    ordered_classes = sorted(labels_column.dropna().unique())
    n_classes = len(ordered_classes)

    print(f"Number of classes: {n_classes}")
    print(f"Classes: {ordered_classes}")

    # Absolute and relative frequency of every class.
    class_counts = labels_column.value_counts()
    class_shares = labels_column.value_counts(normalize=True)

    print("\nClass distribution:")
    for label in ordered_classes:
        n_rows = class_counts.get(label, 0)
        share = class_shares.get(label, 0)
        print(f"  {label}: {n_rows} ({share*100:.2f}%)")

    # Flag targets whose rarest class covers less than 5% of the rows.
    if n_classes >= 2:
        rarest_share = class_shares.min()
        if rarest_share < 0.05:
            print(f"\n⚠ Severe class imbalance detected! Minimum class: {rarest_share*100:.2f}%")

    target_info[target_col] = {
        'n_classes': n_classes,
        'classes': ordered_classes,
        'nan_count': nan_count
    }

# Plot target distributions: one bar chart per target, side by side.
if DO_PLOTTING:
    n_targets = len(target_cols)
    fig, axes = plt.subplots(1, n_targets, figsize=(6*n_targets, 5))
    # With a single panel subplots() returns a bare Axes; wrap it for iteration.
    axes = [axes] if n_targets == 1 else axes

    for idx, target_col in enumerate(target_cols):
        panel = axes[idx]
        train_viz[target_col].value_counts().plot(kind='bar', ax=panel)
        panel.set_title(f"Distribution of '{target_col}'")
        panel.set_xlabel("Class")
        panel.set_ylabel("Count")
        panel.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()


# ============================================================================
# 6. SEPARATE FEATURES AND TARGETS
# ============================================================================

print("\n" + "=" * 80, "FEATURE EXTRACTION", "=" * 80, sep="\n")

# Split the training table into the feature matrix and the target column(s).
X = train_data.drop(columns=target_cols)
y = train_data[target_cols]

# Rows with a missing label in any target cannot be used for training.
rows_with_nan = y.isnull().any(axis=1)
total_nan = rows_with_nan.sum()
if total_nan > 0:
    print(f"\n⚠ Removing {total_nan} rows with NaN in target variable(s)")
    valid_idx = ~rows_with_nan
    X = X[valid_idx].reset_index(drop=True)
    y = y[valid_idx].reset_index(drop=True)
    print(f"New training shape: {X.shape}")

# Single-target mode works with a Series rather than a one-column frame.
if not is_multi_target:
    y = y.iloc[:, 0]

# Classify feature columns by dtype.
num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"\nFeature summary:")
print(f"  Total features: {X.shape[1]}")
print(f"  Categorical: {len(cat_cols)}")
print(f"  Numerical: {len(num_cols)}")

# Column lists are only printed when non-empty.
if cat_cols:
    print(f"\nCategorical columns: {cat_cols}")
if num_cols:
    print(f"\nNumerical columns: {num_cols}")


# ============================================================================
# 7. EXPLORATORY DATA ANALYSIS
# ============================================================================

if DO_PLOTTING:
    print("\n" + "=" * 80)
    print("EXPLORATORY DATA ANALYSIS")
    print("=" * 80)

    if len(num_cols) > 0:
        # Grid layout shared by the histogram and boxplot figures:
        # three panels per row, as many rows as needed.
        n_num = len(num_cols)
        ncols = 3
        nrows = (n_num + ncols - 1) // ncols

        # Numerical features - histograms
        print("\nGenerating histograms for numerical features...")
        # squeeze=False always yields a 2-D array of Axes, so ravel() gives a
        # flat sequence of nrows * ncols panels for any number of columns
        # (wrapping the 1 x 3 array in a list broke the one-column case).
        fig, axes = plt.subplots(nrows, ncols, figsize=(5*ncols, 4*nrows), squeeze=False)
        axes = axes.ravel()

        for ax, col in zip(axes, num_cols):
            sns.histplot(X[col].dropna(), kde=True, ax=ax)
            ax.set_title(col)

        # Hide the panels left over when n_num is not a multiple of ncols.
        for ax in axes[n_num:]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

        # Boxplots
        print("\nGenerating boxplots for numerical features...")
        fig, axes = plt.subplots(nrows, ncols, figsize=(5*ncols, 4*nrows), squeeze=False)
        axes = axes.ravel()

        for ax, col in zip(axes, num_cols):
            sns.boxplot(x=X[col].dropna(), ax=ax)
            ax.set_title(f"Boxplot: {col}")

        for ax in axes[n_num:]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

        # Correlation heatmap (needs at least two numeric columns)
        if n_num > 1:
            print("\nGenerating correlation heatmap...")
            plt.figure(figsize=(12, 10))
            corr = X[num_cols].corr()
            sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0)
            plt.title("Correlation Matrix (Numerical Features)")
            plt.tight_layout()
            plt.show()

    # Categorical features - count plots
    if len(cat_cols) > 0:
        print("\nGenerating count plots for categorical features...")
        for col in cat_cols[:5]:  # Limit to first 5 to avoid too many plots
            plt.figure(figsize=(10, 6))
            # Only the 20 most frequent categories are drawn.
            top_categories = X[col].value_counts().head(20)
            sns.barplot(x=top_categories.values, y=top_categories.index)
            plt.title(f"Top 20 Categories in '{col}'")
            plt.xlabel("Count")
            plt.tight_layout()
            plt.show()


# ============================================================================
# 8. OUTLIER DETECTION
# ============================================================================

print("\n" + "=" * 80, "OUTLIER ANALYSIS", "=" * 80, sep="\n")

# For each numeric feature: the 1.5 * IQR fences and how many training values
# fall outside each of them.
outlier_summary = {}
for col in num_cols:
    feature = X[col]
    q1, q3 = feature.quantile(0.25), feature.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    outlier_summary[col] = {
        "lower": lower,
        "upper": upper,
        "n_lower": int((feature < lower).sum()),
        "n_upper": int((feature > upper).sum())
    }

print("\nOutlier summary (IQR method):")
for col, fences in outlier_summary.items():
    n_below, n_above = fences['n_lower'], fences['n_upper']
    pct = ((n_below + n_above) / len(X)) * 100
    print(f"  {col}: {n_below} below, {n_above} above ({pct:.2f}%)")

# Outlier capping: clip train and test to the fences computed on train.
if DO_OUTLIER_CAP:
    print("\n✓ Capping outliers using IQR thresholds...")
    for col, fences in outlier_summary.items():
        low, high = fences['lower'], fences['upper']
        X[col] = X[col].clip(lower=low, upper=high)
        if col in test_data.columns:
            test_data[col] = test_data[col].clip(lower=low, upper=high)
    print("Outlier capping completed.")


# ============================================================================
# 9. MISSING VALUE IMPUTATION
# ============================================================================

print("\n" + "=" * 80, "MISSING VALUE IMPUTATION", "=" * 80, sep="\n")

# Numerical imputation: medians are learned on the training features and then
# applied to both the training and the test table.
if num_cols:
    print("\n✓ Imputing numerical columns with MEDIAN...")
    num_imputer = SimpleImputer(strategy='median')
    num_imputer.fit(X[num_cols])
    X[num_cols] = num_imputer.transform(X[num_cols])
    test_data[num_cols] = num_imputer.transform(test_data[num_cols])

# Categorical imputation: same scheme with the most frequent category.
if cat_cols:
    print("✓ Imputing categorical columns with MOST_FREQUENT...")
    cat_imputer = SimpleImputer(strategy='most_frequent')
    cat_imputer.fit(X[cat_cols])
    X[cat_cols] = cat_imputer.transform(X[cat_cols])
    test_data[cat_cols] = cat_imputer.transform(test_data[cat_cols])

# Verification: count whatever is still missing in each table.
train_missing_after = X.isna().sum().sum()
test_missing_after = test_data.isna().sum().sum()

print(f"\nPost-imputation verification:")
print(f"  Train missing values: {train_missing_after}")
print(f"  Test missing values: {test_missing_after}")

if not (train_missing_after <= 0 and test_missing_after <= 0):
    print("\n⚠ WARNING: Missing values still present!")


# ============================================================================
# 10. PREPROCESSING PIPELINE
# ============================================================================

print("\n" + "=" * 80, "PREPROCESSING PIPELINE", "=" * 80, sep="\n")

# Categorical columns are one-hot encoded (categories unseen at fit time are
# ignored), numeric columns are standardized, and any column belonging to
# neither list is passed through unchanged.
feature_transformers = [
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ('num', StandardScaler(), num_cols),
]
preprocessor = ColumnTransformer(
    transformers=feature_transformers,
    remainder='passthrough',
)

print("✓ Preprocessing pipeline created")


# ============================================================================
# 11. TRAIN/VALIDATION SPLIT
# ============================================================================

print("\n" + "=" * 80, "TRAIN/VALIDATION SPLIT", "=" * 80, sep="\n")

# Stratification is applied in single-target mode only.
stratify_param = None if is_multi_target else y

# 70 / 30 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=stratify_param,
    random_state=RANDOM_STATE,
)

print(f"\nTrain set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")

if not is_multi_target:
    for split_label, split_target in (("Train", y_train), ("Validation", y_val)):
        print(f"\n{split_label} set class distribution:")
        print(split_target.value_counts())

# Transform features: the preprocessor is fitted on the training split and
# then applied as-is to the validation and test features.
print("\n✓ Transforming features...")
X_train_pre = preprocessor.fit_transform(X_train)
X_val_pre = preprocessor.transform(X_val)
test_data_pre = preprocessor.transform(test_data)

for split_label, matrix in (
    ("Train", X_train_pre),
    ("Validation", X_val_pre),
    ("Test", test_data_pre),
):
    print(f"  {split_label} transformed: {matrix.shape}")


# ============================================================================
# 12. LABEL ENCODING
# ============================================================================

print("\n" + "=" * 80)
print("LABEL ENCODING")
print("=" * 80)

# Encoders are fitted on every labelled row (`y`), not just on the training
# split. The multi-target split is not stratified, so a rare class can land in
# the validation split only; an encoder fitted on y_train alone would then
# raise "previously unseen labels" when transforming y_val.
if not is_multi_target:
    le = LabelEncoder().fit(y)
    y_train_enc = le.transform(y_train)
    y_val_enc = le.transform(y_val)

    print(f"Number of classes: {len(le.classes_)}")
    print(f"Classes: {le.classes_}")
else:
    # For multi-target, encode each target separately and keep the fitted
    # encoder so predictions can be decoded per target later.
    label_encoders = {}
    y_train_enc = pd.DataFrame(index=y_train.index)
    y_val_enc = pd.DataFrame(index=y_val.index)

    for target_col in target_cols:
        le_temp = LabelEncoder().fit(y[target_col])
        y_train_enc[target_col] = le_temp.transform(y_train[target_col])
        y_val_enc[target_col] = le_temp.transform(y_val[target_col])
        label_encoders[target_col] = le_temp

        print(f"\nTarget '{target_col}':")
        print(f"  Classes: {le_temp.classes_}")

    # Convert to numpy arrays (one column per target, in target_cols order)
    y_train_enc = y_train_enc.values
    y_val_enc = y_val_enc.values


# ============================================================================
# 13. MODEL TRAINING
# ============================================================================

print("\n" + "=" * 80, "MODEL TRAINING", "=" * 80, sep="\n")

# Both modes use a random forest with the same settings; multi-target mode
# wraps it so that one forest is fitted per target column.
forest_settings = {
    "n_estimators": 1000,
    "random_state": RANDOM_STATE,
    "class_weight": 'balanced',
    "n_jobs": -1,
    "max_depth": 20,
    "min_samples_split": 5,
}

if not is_multi_target:
    print("\n✓ Training Single-Output RandomForest...")
    baseline_model = RandomForestClassifier(**forest_settings)
else:
    print("\n✓ Training Multi-Output RandomForest...")
    base_rf = RandomForestClassifier(**forest_settings)
    baseline_model = MultiOutputClassifier(base_rf, n_jobs=-1)

baseline_model.fit(X_train_pre, y_train_enc)
print("✓ Baseline model training completed")


# ============================================================================
# 14. MODEL EVALUATION
# ============================================================================

print("\n" + "=" * 80)
print("MODEL EVALUATION")
print("=" * 80)

# Predictions of the baseline model on both splits.
y_train_pred = baseline_model.predict(X_train_pre)
y_val_pred = baseline_model.predict(X_val_pre)

if is_multi_target:
    # Multi-target evaluation: accuracy and macro-F1 per target column.
    print("\n✓ Multi-Target Classification Metrics:")
    print("\nTraining Set:")
    for idx, target_col in enumerate(target_cols):
        acc = accuracy_score(y_train_enc[:, idx], y_train_pred[:, idx])
        f1 = f1_score(y_train_enc[:, idx], y_train_pred[:, idx], average='macro')
        print(f"  {target_col}: Accuracy={acc:.4f}, F1={f1:.4f}")

    print("\nValidation Set:")
    for idx, target_col in enumerate(target_cols):
        acc = accuracy_score(y_val_enc[:, idx], y_val_pred[:, idx])
        f1 = f1_score(y_val_enc[:, idx], y_val_pred[:, idx], average='macro')
        print(f"  {target_col}: Accuracy={acc:.4f}, F1={f1:.4f}")

        # Detailed report for each target. `labels` pins the report to every
        # encoded class so it always lines up with `target_names`, even when a
        # class is absent from the validation labels and predictions; names are
        # converted to str because the report measures their length.
        print(f"\n  Classification Report for '{target_col}':")
        le_temp = label_encoders[target_col]
        print(classification_report(
            y_val_enc[:, idx], y_val_pred[:, idx],
            labels=np.arange(len(le_temp.classes_)),
            target_names=[str(cls_name) for cls_name in le_temp.classes_],
            zero_division=0))
else:
    # Single-target evaluation
    n_classes = len(le.classes_)

    # Probability-based metrics are best effort: a failure is reported with
    # its reason instead of aborting the run. Catching Exception (rather than
    # a bare except) lets KeyboardInterrupt / SystemExit propagate.
    try:
        train_proba = baseline_model.predict_proba(X_train_pre)
        val_proba = baseline_model.predict_proba(X_val_pre)

        if n_classes == 2:
            # Binary case: score with the probability of the second class.
            roc_train = roc_auc_score(y_train_enc, train_proba[:, 1])
            roc_val = roc_auc_score(y_val_enc, val_proba[:, 1])
        else:
            # Multiclass case: macro-averaged one-vs-rest AUC.
            roc_train = roc_auc_score(y_train_enc, train_proba,
                                      multi_class='ovr', average='macro')
            roc_val = roc_auc_score(y_val_enc, val_proba,
                                    multi_class='ovr', average='macro')

        loss_train = log_loss(y_train_enc, train_proba)
        loss_val = log_loss(y_val_enc, val_proba)

        print("\nTraining Metrics:")
        print(f"  ROC AUC: {roc_train:.4f}")
        print(f"  Log Loss: {loss_train:.4f}")

        print("\nValidation Metrics:")
        print(f"  ROC AUC: {roc_val:.4f}")
        print(f"  Log Loss: {loss_val:.4f}")
    except Exception as exc:
        print("\nCould not compute probability-based metrics")
        print(f"  Reason: {type(exc).__name__}: {exc}")

    # Classification report (see the note on `labels` / `target_names` above).
    print("\n" + "-" * 80)
    print("Validation Classification Report:")
    print("-" * 80)
    print(classification_report(
        y_val_enc, y_val_pred,
        labels=np.arange(n_classes),
        target_names=[str(cls_name) for cls_name in le.classes_],
        zero_division=0))


# ============================================================================
# 15. HYPERPARAMETER TUNING (Optional)
# ============================================================================

# Stays None when tuning is disabled, so later steps use the baseline model.
tuned_model = None
if DO_HYPERPARAM_TUNING:
    print("\n" + "=" * 80)
    print("HYPERPARAMETER TUNING")
    print("=" * 80)

    # MultiOutputClassifier exposes the wrapped forest's parameters under the
    # "estimator__" prefix; the bare forest uses the plain names.
    param_prefix = "estimator__" if is_multi_target else ""
    param_dist = {
        f"{param_prefix}n_estimators": [200, 500, 800, 1000],
        f"{param_prefix}max_depth": [None, 10, 15, 20, 25],
        f"{param_prefix}min_samples_split": [2, 5, 10],
        f"{param_prefix}min_samples_leaf": [1, 2, 4],
        f"{param_prefix}max_features": ["sqrt", "log2", 0.3],
    }

    # Choose scoring
    if is_multi_target:
        # The 'accuracy' scorer rejects multiclass-multioutput targets, which
        # would turn every candidate's score into NaN. scoring=None makes the
        # search use MultiOutputClassifier.score instead.
        scoring = None
    else:
        # Derived here from the label encoder so this step does not depend on
        # a variable assigned by the evaluation section.
        n_classes = len(le.classes_)
        scoring = 'roc_auc_ovr' if n_classes > 2 else 'roc_auc'

    print("\n✓ Starting RandomizedSearchCV (n_iter=20)...")

    # Untuned forest with the fixed settings; the search fills in the rest.
    search_forest = RandomForestClassifier(
        random_state=RANDOM_STATE,
        class_weight='balanced',
        n_jobs=-1
    )
    if is_multi_target:
        base_estimator = MultiOutputClassifier(search_forest, n_jobs=-1)
    else:
        base_estimator = search_forest

    rnd_search = RandomizedSearchCV(
        estimator=base_estimator,
        param_distributions=param_dist,
        n_iter=20,
        scoring=scoring,
        cv=3,
        verbose=1,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    rnd_search.fit(X_train_pre, y_train_enc)

    print("\n✓ Best parameters:")
    for param, value in rnd_search.best_params_.items():
        print(f"  {param}: {value}")
    print(f"\nBest CV score: {rnd_search.best_score_:.4f}")

    tuned_model = rnd_search.best_estimator_

    # Evaluate tuned model on the hold-out validation split.
    y_val_pred_tuned = tuned_model.predict(X_val_pre)

    if is_multi_target:
        print("\nTuned Model - Validation Metrics:")
        for idx, target_col in enumerate(target_cols):
            acc = accuracy_score(y_val_enc[:, idx], y_val_pred_tuned[:, idx])
            f1 = f1_score(y_val_enc[:, idx], y_val_pred_tuned[:, idx], average='macro')
            print(f"  {target_col}: Accuracy={acc:.4f}, F1={f1:.4f}")
    else:
        # Best-effort probability metrics: report the reason on failure rather
        # than swallowing every exception (a bare except would also trap
        # KeyboardInterrupt).
        try:
            val_proba_tuned = tuned_model.predict_proba(X_val_pre)
            if n_classes == 2:
                roc_val_tuned = roc_auc_score(y_val_enc, val_proba_tuned[:, 1])
            else:
                roc_val_tuned = roc_auc_score(y_val_enc, val_proba_tuned,
                                              multi_class='ovr', average='macro')
            loss_val_tuned = log_loss(y_val_enc, val_proba_tuned)

            print("\nTuned Model - Validation Metrics:")
            print(f"  ROC AUC: {roc_val_tuned:.4f}")
            print(f"  Log Loss: {loss_val_tuned:.4f}")
        except Exception as exc:
            print("\nCould not compute probability metrics for tuned model")
            print(f"  Reason: {type(exc).__name__}: {exc}")


# ============================================================================
# 16. CONFUSION MATRIX VISUALIZATION
# ============================================================================

# Drawn for single-target runs only, and only when plotting is enabled.
if DO_PLOTTING and not is_multi_target:
    print("\n" + "=" * 80, "CONFUSION MATRIX", "=" * 80, sep="\n")

    # Prefer the tuned model when the search produced one.
    final_model = baseline_model if tuned_model is None else tuned_model
    y_val_pred_final = final_model.predict(X_val_pre)

    cm = confusion_matrix(y_val_enc, y_val_pred_final)

    plt.figure(figsize=(10, 8))
    # Ticks show the original class names instead of the encoded integers.
    class_names = le.classes_
    heatmap_ax = sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    heatmap_ax.set_title('Confusion Matrix - Validation Set')
    heatmap_ax.set_ylabel('True Label')
    heatmap_ax.set_xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()


# ============================================================================
# 17. GENERATE PREDICTIONS
# ============================================================================

print("\n" + "=" * 80, "GENERATING PREDICTIONS", "=" * 80, sep="\n")

# The tuned model wins whenever the search was run; otherwise the baseline.
final_model = baseline_model if tuned_model is None else tuned_model
print(f"Using {'TUNED' if tuned_model is not None else 'BASELINE'} model")

# Predict encoded labels for the preprocessed test features.
test_pred_enc = final_model.predict(test_data_pre)

# Map encoded predictions back to the original class labels.
if not is_multi_target:
    test_pred_decoded = le.inverse_transform(test_pred_enc)
else:
    test_pred_decoded = pd.DataFrame(index=range(len(test_pred_enc)))
    for idx, target_col in enumerate(target_cols):
        decoder = label_encoders[target_col]
        test_pred_decoded[target_col] = decoder.inverse_transform(test_pred_enc[:, idx])

# Assemble the submission table; the identifier column comes first when the
# test file provided one.
if is_multi_target:
    if test_ids is None:
        submission_df = test_pred_decoded.copy()
    else:
        submission_df = pd.DataFrame({ID_COL: test_ids})
        for target_col in target_cols:
            submission_df[target_col] = test_pred_decoded[target_col].values
elif test_ids is None:
    submission_df = pd.DataFrame({target_cols[0]: test_pred_decoded})
else:
    submission_df = pd.DataFrame({
        ID_COL: test_ids,
        target_cols[0]: test_pred_decoded
    })

# Write the submission without the frame index.
submission_df.to_csv(OUTPUT_FILE, index=False)
print(f"\n✓ Submission saved to '{OUTPUT_FILE}'")

# Summary of what was written.
print("\n" + "-" * 80, "SUBMISSION STATISTICS", "-" * 80, sep="\n")
print(f"Total predictions: {len(submission_df)}")

if not is_multi_target:
    print("\nPrediction distribution:")
    print(submission_df[target_cols[0]].value_counts())
else:
    for target_col in target_cols:
        print(f"\nPrediction distribution for '{target_col}':")
        print(submission_df[target_col].value_counts())

print("\nSubmission preview:")
print(submission_df.head(10))

print("\n" + "=" * 80, "✓ PROCESS COMPLETED SUCCESSFULLY!", "=" * 80, sep="\n")