# In [None]:
# importing the useful libraries for the work
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score, log_loss, classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from scipy import stats
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


# ============================================================================
# 1. USER CONFIGURATION
# ============================================================================
# Every knob of the script lives here; nothing below this section needs to be
# edited to point the pipeline at another dataset.

# --- Input / output locations ------------------------------------------------
_DATA_DIR = "/kaggle/input/mock-test-2-mse-2"
TRAIN_PATH = f"{_DATA_DIR}/train.csv"
TEST_PATH = f"{_DATA_DIR}/test.csv"
OUTPUT_FILE = "submission.csv"

# --- Column roles --------------------------------------------------------------
# TARGET_COL may be a single column name or a list of names (multi-target).
TARGET_COL = "Status"
ID_COL = "id"

# --- Reproducibility and parallelism -------------------------------------------
RANDOM_STATE = 42
N_JOBS = -1  # -1 = use all CPU cores

# --- Step toggles ----------------------------------------------------------------
DO_PLOTTING = False  # kept off for speed
DO_OUTLIER_CAP = False
DO_HYPERPARAM_TUNING = True
DO_MODEL_COMPARISON = True
DO_ENSEMBLE = True

# --- Speed profile -----------------------------------------------------------------
# When True, fewer models are compared and smaller search grids are used.
FAST_MODE = True


# ============================================================================
# 2. LOAD DATA
# ============================================================================
# Reads both CSV files, reports their shapes, then strips the identifier
# column from each frame (remembering the test ids for the submission).

print("=" * 80, "LOADING DATA", "=" * 80, sep="\n")

train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)

# Shapes are reported before the id column is removed.
print(f"\nTrain shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

test_ids = None
if ID_COL in test_data.columns:
    # Copy first: the ids are needed again when the submission is written.
    test_ids = test_data[ID_COL].copy()
    test_data = test_data.drop(columns=[ID_COL])

if ID_COL in train_data.columns:
    train_data = train_data.drop(columns=[ID_COL])


# ============================================================================
# 3. DETECT TARGET CONFIGURATION
# ============================================================================
# Normalises TARGET_COL into `target_cols` (always a list) and records in
# `is_multi_target` whether the user supplied a list.

print("", "=" * 80, "TARGET CONFIGURATION DETECTION", "=" * 80, sep="\n")

# Reject unsupported types up front so the rest of the section stays flat.
if not isinstance(TARGET_COL, (str, list)):
    raise ValueError("TARGET_COL must be a string or list of strings")

is_multi_target = isinstance(TARGET_COL, list)
if is_multi_target:
    target_cols = TARGET_COL
    print(f"\n‚úì Multi-target mode: {len(target_cols)} targets")
else:
    target_cols = [TARGET_COL]
    print(f"\n‚úì Single target mode: '{TARGET_COL}'")

# Every requested target must be present in the training frame.
missing_targets = [name for name in target_cols if name not in train_data.columns]
if missing_targets:
    raise ValueError(f"Target columns not found: {missing_targets}")


# ============================================================================
# 4. DATA CLEANING
# ============================================================================
# Drops exact duplicate rows from the training frame and reports the
# per-column missing-value counts of both the training and the test frame.

print("\n" + "="*80)
print("DATA CLEANING")
print("="*80)

train_dups = train_data.duplicated().sum()
print(f"\nDuplicate rows in train: {train_dups}")
if train_dups > 0:
    # Re-index so later positional/boolean indexing sees a contiguous index.
    train_data = train_data.drop_duplicates().reset_index(drop=True)

train_missing = train_data.isnull().sum()
test_missing = test_data.isnull().sum()

# Report both frames: the test counts were previously computed but never
# shown, which hid missing values that only occur in the test set.
for frame_label, missing_counts in (("TRAIN", train_missing), ("TEST", test_missing)):
    print(f"\nMissing values in {frame_label}:")
    if missing_counts.sum() == 0:
        print("  No missing values!")
    else:
        print(missing_counts[missing_counts > 0])


# ============================================================================
# 5. TARGET VARIABLE ANALYSIS
# ============================================================================
# Prints, for every target column, its distinct non-null classes and the
# frequency of each class in the training frame.

print("", "=" * 80, "TARGET VARIABLE ANALYSIS", "=" * 80, sep="\n")

for target_col in target_cols:
    print(f"\nTarget: {target_col}")
    target_values = train_data[target_col]
    # NaN is excluded from the class list (value_counts below skips it too).
    unique_classes = target_values.dropna().unique()
    print(f"Number of classes: {len(unique_classes)}")
    print(f"Classes: {sorted(unique_classes)}")
    print("\nClass distribution:")
    print(target_values.value_counts())


# ============================================================================
# 6. SEPARATE FEATURES AND TARGETS
# ============================================================================
# Splits the training frame into features `X` and target(s) `y`, drops rows
# whose target is missing, and groups feature columns by dtype.

print("", "=" * 80, "FEATURE EXTRACTION", "=" * 80, sep="\n")

y = train_data[target_cols]
X = train_data.drop(columns=target_cols)

# A row is unusable for training as soon as any of its targets is missing.
rows_missing_target = y.isnull().any(axis=1)
total_nan = rows_missing_target.sum()
if total_nan > 0:
    valid_idx = ~rows_missing_target
    X = X[valid_idx].reset_index(drop=True)
    y = y[valid_idx].reset_index(drop=True)

# Single-target mode works with a Series rather than a one-column frame.
if not is_multi_target:
    y = y.iloc[:, 0]

cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)
num_cols = list(X.select_dtypes(include=['number']).columns)

print(
    "\nFeature summary:",
    f"  Total features: {X.shape[1]}",
    f"  Categorical: {len(cat_cols)}",
    f"  Numerical: {len(num_cols)}",
    sep="\n",
)


# ============================================================================
# 7. OUTLIER DETECTION & HANDLING
# ============================================================================
# Computes 1.5*IQR fences for every numeric feature and counts the values
# outside them.  When DO_OUTLIER_CAP is set, train and test values are
# clipped to those fences.

print("", "=" * 80, "OUTLIER ANALYSIS", "=" * 80, sep="\n")

outlier_summary = {}
for col in num_cols:
    column_values = X[col]
    q1, q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    outlier_summary[col] = {
        "lower": lower,
        "upper": upper,
        "n_lower": int((column_values < lower).sum()),
        "n_upper": int((column_values > upper).sum()),
    }

if DO_OUTLIER_CAP:
    print("\n‚úì Capping outliers...")
    for col, s in outlier_summary.items():
        # The fences come from the training data and are reused for the test set.
        fences = {"lower": s["lower"], "upper": s["upper"]}
        X[col] = X[col].clip(**fences)
        if col in test_data.columns:
            test_data[col] = test_data[col].clip(**fences)


# ============================================================================
# 8. MISSING VALUE IMPUTATION
# ============================================================================
# Numeric gaps are filled with the column median and categorical gaps with the
# most frequent value.  Each imputer is fitted on the training features and
# then applied unchanged to the test features.
#
# NOTE(review): the imputers are fitted on all of X, i.e. before the
# train/validation split in section 10, so validation rows contribute to the
# fill values.

print("", "=" * 80, "MISSING VALUE IMPUTATION", "=" * 80, sep="\n")


def _impute_columns(columns, strategy):
    """Fit an imputer on X[columns], fill X and test_data in place, return it."""
    imputer = SimpleImputer(strategy=strategy)
    X[columns] = imputer.fit_transform(X[columns])
    test_data[columns] = imputer.transform(test_data[columns])
    return imputer


if num_cols:
    num_imputer = _impute_columns(num_cols, 'median')

if cat_cols:
    cat_imputer = _impute_columns(cat_cols, 'most_frequent')


# ============================================================================
# 9. PREPROCESSING PIPELINE
# ============================================================================
# Builds (but does not yet fit) the column-wise transformer: one-hot encoding
# for categorical columns, standard scaling for numeric ones, and any column
# in neither group passed through untouched.

print("", "=" * 80, "PREPROCESSING PIPELINE", "=" * 80, sep="\n")

# handle_unknown='ignore': categories not seen during fit encode as all zeros
# instead of raising at transform time.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, cat_cols),
        ('num', numeric_scaler, num_cols),
    ],
    remainder='passthrough',
)


# ============================================================================
# 10. TRAIN/VALIDATION SPLIT
# ============================================================================
# Holds out 15% of the rows for validation, then fits the preprocessor on the
# training part only and applies it to the validation and test features.

print("", "=" * 80, "TRAIN/VALIDATION SPLIT", "=" * 80, sep="\n")

# Stratify on the label in single-target mode; multi-target mode splits
# without stratification.
stratify_param = None if is_multi_target else y

split_options = {
    "test_size": 0.15,
    "random_state": RANDOM_STATE,
    "stratify": stratify_param,
}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")

X_train_pre = preprocessor.fit_transform(X_train)
X_val_pre, test_data_pre = (
    preprocessor.transform(frame) for frame in (X_val, test_data)
)


# ============================================================================
# 11. LABEL ENCODING
# ============================================================================
# Converts target labels to integer codes.  Encoders are fitted on the
# training labels and reused for the validation labels; they are kept so the
# predictions can be decoded again later.

print("", "=" * 80, "LABEL ENCODING", "=" * 80, sep="\n")

if is_multi_target:
    # One encoder per target; the encoded columns are stacked side by side
    # into a 2-D array in the order of `target_cols`.
    label_encoders = {}
    encoded_train_columns, encoded_val_columns = [], []
    for target_col in target_cols:
        le_temp = LabelEncoder()
        encoded_train_columns.append(le_temp.fit_transform(y_train[target_col]))
        encoded_val_columns.append(le_temp.transform(y_val[target_col]))
        label_encoders[target_col] = le_temp
    y_train_enc = np.column_stack(encoded_train_columns)
    y_val_enc = np.column_stack(encoded_val_columns)
else:
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_val_enc = le.transform(y_val)
    n_classes = len(le.classes_)
    print(f"Number of classes: {n_classes}")


# ============================================================================
# 12. MODEL COMPARISON
# ============================================================================
# Fits every candidate classifier on the preprocessed training split, scores
# it on the validation split (accuracy, macro F1, ROC-AUC) and stores the name
# of the candidate with the highest validation ROC-AUC in `best_model_name`.

# Defined up front so these names always exist, even when the comparison is
# disabled or skipped in multi-target mode.
models = {}
model_results = {}
best_model_name = None

if DO_MODEL_COMPARISON and not is_multi_target:
    print("\n" + "="*80)
    print("MODEL COMPARISON - TESTING ALL CLASSIFIERS")
    print("="*80)

    if FAST_MODE:
        # Fast mode - a reduced candidate set of three tree ensembles
        models = {
            'XGBoost': xgb.XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1,
                                         random_state=RANDOM_STATE, eval_metric='logloss',
                                         use_label_encoder=False, n_jobs=N_JOBS),
            'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=15,
                                                   random_state=RANDOM_STATE, n_jobs=N_JOBS),
            'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5,
                                                           random_state=RANDOM_STATE)
        }
    else:
        # Full mode - the wider candidate set
        models = {
            'Logistic Regression': LogisticRegression(max_iter=500, random_state=RANDOM_STATE, n_jobs=N_JOBS),
            'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=15),
            'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=15, random_state=RANDOM_STATE, n_jobs=N_JOBS),
            'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=RANDOM_STATE),
            'XGBoost': xgb.XGBClassifier(n_estimators=100, max_depth=5, random_state=RANDOM_STATE,
                                         eval_metric='logloss', use_label_encoder=False, n_jobs=N_JOBS),
            'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=N_JOBS),
            'Naive Bayes': GaussianNB()
        }
        # SVC is left out of both modes because it trains slowly.  The old
        # message told the user to set FAST_MODE=False, which is already the
        # case whenever this branch runs.
        print("‚ö† Skipping SVM due to slow training time. Add an SVC entry to `models` to include it.")

    print("\n" + "-"*80)
    print(f"{'Model':<25} {'Train Acc':<12} {'Val Acc':<12} {'Val F1':<12} {'Val ROC-AUC':<12}")
    print("-"*80)

    for name, model in models.items():
        try:
            # Train
            model.fit(X_train_pre, y_train_enc)

            # Predictions
            y_train_pred = model.predict(X_train_pre)
            y_val_pred = model.predict(X_val_pre)

            # Metrics
            train_acc = accuracy_score(y_train_enc, y_train_pred)
            val_acc = accuracy_score(y_val_enc, y_val_pred)
            val_f1 = f1_score(y_val_enc, y_val_pred, average='macro')

            # ROC AUC is best-effort: a model that cannot be scored keeps its
            # other metrics and is ranked with a ROC-AUC of 0.0.
            roc_error = None
            try:
                val_proba = model.predict_proba(X_val_pre)
                if n_classes == 2:
                    val_roc = roc_auc_score(y_val_enc, val_proba[:, 1])
                else:
                    val_roc = roc_auc_score(y_val_enc, val_proba, multi_class='ovr', average='macro')
            except Exception as exc:
                # `Exception` (not a bare except) lets Ctrl-C still interrupt.
                val_roc = 0.0
                roc_error = exc

            model_results[name] = {
                'model': model,
                'train_acc': train_acc,
                'val_acc': val_acc,
                'val_f1': val_f1,
                'val_roc': val_roc
            }

            print(f"{name:<25} {train_acc:<12.4f} {val_acc:<12.4f} {val_f1:<12.4f} {val_roc:<12.4f}")
            if roc_error is not None:
                # Explain the 0.0 instead of passing it off as a real score.
                print(f"{'':<25} (ROC-AUC unavailable, scored as 0.0: {str(roc_error)[:40]})")

        except Exception as e:
            # A failing candidate must not abort the whole comparison.
            print(f"{name:<25} ERROR: {str(e)[:40]}")

    print("-"*80)

    if not model_results:
        # Without this guard max() fails with an unhelpful
        # "arg is an empty sequence" error.
        raise RuntimeError(
            "Model comparison failed: no candidate model could be fitted and evaluated."
        )

    # Find best model
    best_model_name = max(model_results, key=lambda candidate: model_results[candidate]['val_roc'])
    print(f"\nüèÜ BEST MODEL: {best_model_name}")
    print(f"   Validation ROC-AUC: {model_results[best_model_name]['val_roc']:.4f}")
    print(f"   Validation Accuracy: {model_results[best_model_name]['val_acc']:.4f}")
    print(f"   Validation F1: {model_results[best_model_name]['val_f1']:.4f}")


# ============================================================================
# 13. HYPERPARAMETER TUNING FOR BEST MODELS
# ============================================================================
# Runs a randomized search over a small grid for the best-ranked candidates of
# the comparison step and stores each best estimator in `tuned_models`
# (model name -> fitted estimator).

tuned_models = {}

# Tuning ranks the candidates through `model_results` and `models`, which are
# produced by the comparison step; without it there is nothing to rank, so the
# step is skipped with a notice instead of failing on an undefined name.
run_tuning = DO_HYPERPARAM_TUNING and not is_multi_target
if run_tuning and not DO_MODEL_COMPARISON:
    print("\nWARNING: Skipping hyperparameter tuning because DO_MODEL_COMPARISON is False "
          "(there are no comparison results to rank).")
    run_tuning = False

if run_tuning:
    print("\n" + "="*80)
    print("HYPERPARAMETER TUNING - TOP MODELS")
    print("="*80)

    # Sort models by performance
    sorted_models = sorted(model_results.items(), key=lambda item: item[1]['val_roc'], reverse=True)

    # Tune only top 2 models in fast mode, top 3 otherwise
    num_to_tune = 2 if FAST_MODE else 3
    top_models = sorted_models[:num_to_tune]

    print(f"\nTuning top {num_to_tune} models: {[m[0] for m in top_models]}")

    # Reduced hyperparameter grids for speed
    param_grids = {
        'Random Forest': {
            'n_estimators': [200, 400] if FAST_MODE else [300, 500, 800],
            'max_depth': [15, 20, None] if FAST_MODE else [15, 20, 25, None],
            'min_samples_split': [2, 5] if FAST_MODE else [2, 5, 10],
            'min_samples_leaf': [1, 2] if FAST_MODE else [1, 2, 4],
            'max_features': ['sqrt'] if FAST_MODE else ['sqrt', 'log2']
        },
        'XGBoost': {
            'n_estimators': [200, 400] if FAST_MODE else [300, 500, 800],
            'max_depth': [5, 7, 10] if FAST_MODE else [3, 5, 7, 10],
            'learning_rate': [0.05, 0.1] if FAST_MODE else [0.01, 0.05, 0.1],
            'subsample': [0.8, 1.0] if FAST_MODE else [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 1.0] if FAST_MODE else [0.8, 0.9, 1.0]
        },
        'Gradient Boosting': {
            'n_estimators': [150, 300] if FAST_MODE else [200, 300, 500],
            'max_depth': [3, 5] if FAST_MODE else [3, 5, 7],
            'learning_rate': [0.05, 0.1] if FAST_MODE else [0.01, 0.05, 0.1],
            'subsample': [0.8, 1.0] if FAST_MODE else [0.8, 0.9, 1.0]
        },
        'Logistic Regression': {
            'C': [0.1, 1, 10] if FAST_MODE else [0.01, 0.1, 1, 10, 100],
            'penalty': ['l2'],
            'solver': ['lbfgs']
        }
    }

    # Same metric family as the comparison step: one-vs-rest AUC for
    # multi-class problems, plain AUC for binary ones.
    scoring = 'roc_auc_ovr' if n_classes > 2 else 'roc_auc'
    n_iter = 8 if FAST_MODE else 15
    cv_folds = 3

    for model_name, _ in top_models:
        if model_name not in param_grids:
            # Previously skipped silently; say so, so the shorter list of
            # tuned models is not a surprise.
            print(f"\nNo search space defined for {model_name}; skipping tuning.")
            continue

        print(f"\n{'='*60}")
        print(f"Tuning {model_name}...")
        print('='*60)

        base_model = models[model_name]
        param_grid = param_grids[model_name]

        rnd_search = RandomizedSearchCV(
            estimator=base_model,
            param_distributions=param_grid,
            n_iter=n_iter,
            scoring=scoring,
            cv=cv_folds,
            verbose=0,  # Reduced verbosity
            random_state=RANDOM_STATE,
            n_jobs=N_JOBS
        )

        rnd_search.fit(X_train_pre, y_train_enc)

        print(f"\n‚úì Best parameters for {model_name}:")
        for param, value in rnd_search.best_params_.items():
            print(f"  {param}: {value}")
        print(f"Best CV score: {rnd_search.best_score_:.4f}")

        best_estimator = rnd_search.best_estimator_
        tuned_models[model_name] = best_estimator

        # Evaluate tuned model on the held-out validation split
        y_val_pred_tuned = best_estimator.predict(X_val_pre)
        val_acc_tuned = accuracy_score(y_val_enc, y_val_pred_tuned)
        val_f1_tuned = f1_score(y_val_enc, y_val_pred_tuned, average='macro')

        # ROC-AUC stays best-effort, but a failure is now reported and no
        # longer suppresses the accuracy / F1 lines as well.
        try:
            val_proba_tuned = best_estimator.predict_proba(X_val_pre)
            if n_classes == 2:
                val_roc_tuned = roc_auc_score(y_val_enc, val_proba_tuned[:, 1])
            else:
                val_roc_tuned = roc_auc_score(y_val_enc, val_proba_tuned, multi_class='ovr', average='macro')
            roc_text = f"{val_roc_tuned:.4f}"
        except Exception as exc:
            roc_text = f"unavailable ({exc})"

        print(f"\nTuned {model_name} - Validation Metrics:")
        print(f"  ROC-AUC: {roc_text}")
        print(f"  Accuracy: {val_acc_tuned:.4f}")
        print(f"  F1 Score: {val_f1_tuned:.4f}")


# ============================================================================
# 14. ENSEMBLE METHODS
# ============================================================================
# Combines the tuned models into a soft-voting classifier and, outside fast
# mode, also into a stacking classifier.  `ensemble_model` ends up as the
# chosen ensemble, or stays None when no ensemble was built or none could be
# scored on the validation split.

ensemble_model = None


def _validation_roc_auc(estimator, label):
    """Return the validation ROC-AUC of a fitted *estimator*, or None on failure.

    Binary problems are scored on the probability column at index 1, other
    problems with macro-averaged one-vs-rest AUC.  Any error is reported under
    *label* and turned into None so a scoring problem does not abort the
    script.
    """
    try:
        proba = estimator.predict_proba(X_val_pre)
        if n_classes == 2:
            return roc_auc_score(y_val_enc, proba[:, 1])
        return roc_auc_score(y_val_enc, proba, multi_class='ovr', average='macro')
    except Exception as exc:
        # `Exception` rather than a bare except: Ctrl-C still interrupts.
        print(f"\nWARNING: could not compute validation ROC-AUC for {label}: {exc}")
        return None


if DO_ENSEMBLE and not is_multi_target and len(tuned_models) > 0:
    print("\n" + "="*80)
    print("ENSEMBLE METHODS")
    print("="*80)

    # Use only top 2 models for ensemble in fast mode
    if FAST_MODE and len(tuned_models) > 2:
        # Select top 2 tuned models (ranked by validation accuracy)
        sorted_tuned = sorted(tuned_models.items(),
                             key=lambda item: accuracy_score(y_val_enc, item[1].predict(X_val_pre)),
                             reverse=True)[:2]
        ensemble_estimators = sorted_tuned
        print(f"Fast mode: Using top 2 models for ensemble: {[m[0] for m in ensemble_estimators]}")
    else:
        ensemble_estimators = list(tuned_models.items())

    # --- Soft voting -----------------------------------------------------------
    print("\n‚úì Creating Voting Classifier (Soft Voting)...")

    voting_clf = VotingClassifier(
        estimators=ensemble_estimators,
        voting='soft',
        n_jobs=N_JOBS
    )

    voting_clf.fit(X_train_pre, y_train_enc)

    y_val_pred_voting = voting_clf.predict(X_val_pre)
    val_acc_voting = accuracy_score(y_val_enc, y_val_pred_voting)
    val_f1_voting = f1_score(y_val_enc, y_val_pred_voting, average='macro')

    voting_roc = _validation_roc_auc(voting_clf, "Voting Classifier")
    # Kept as a number (0 on failure) for the comparison further down.
    val_roc_voting = 0 if voting_roc is None else voting_roc

    if voting_roc is not None:
        print(f"\nVoting Classifier - Validation Metrics:")
        print(f"  ROC-AUC: {val_roc_voting:.4f}")
        print(f"  Accuracy: {val_acc_voting:.4f}")
        print(f"  F1 Score: {val_f1_voting:.4f}")
        ensemble_model = voting_clf

    # --- Stacking (skipped in fast mode for speed) -------------------------------
    if not FAST_MODE:
        print("\n‚úì Creating Stacking Classifier...")

        stacking_clf = StackingClassifier(
            estimators=ensemble_estimators,
            final_estimator=LogisticRegression(max_iter=500),
            cv=3,
            n_jobs=N_JOBS
        )

        stacking_clf.fit(X_train_pre, y_train_enc)

        y_val_pred_stacking = stacking_clf.predict(X_val_pre)
        val_acc_stacking = accuracy_score(y_val_enc, y_val_pred_stacking)
        val_f1_stacking = f1_score(y_val_enc, y_val_pred_stacking, average='macro')

        stacking_roc = _validation_roc_auc(stacking_clf, "Stacking Classifier")
        val_roc_stacking = 0 if stacking_roc is None else stacking_roc

        if stacking_roc is not None:
            print(f"\nStacking Classifier - Validation Metrics:")
            print(f"  ROC-AUC: {val_roc_stacking:.4f}")
            print(f"  Accuracy: {val_acc_stacking:.4f}")
            print(f"  F1 Score: {val_f1_stacking:.4f}")

        # Choose best ensemble
        if voting_roc is None and stacking_roc is None:
            # Previously 0 >= 0 picked stacking although neither ensemble had
            # been scored; leave the choice to the final-selection step.
            ensemble_model = None
            print("\nWARNING: neither ensemble could be scored; no ensemble will be used.")
        elif val_roc_stacking >= val_roc_voting:
            ensemble_model = stacking_clf
            print(f"\nüèÜ BEST ENSEMBLE: Stacking Classifier (ROC-AUC: {val_roc_stacking:.4f})")
        else:
            ensemble_model = voting_clf
            print(f"\nüèÜ BEST ENSEMBLE: Voting Classifier (ROC-AUC: {val_roc_voting:.4f})")
    else:
        if ensemble_model is not None:
            print(f"\nüèÜ Using Voting Classifier (ROC-AUC: {val_roc_voting:.4f})")
        else:
            # Do not announce a classifier that was just discarded.
            print("\nWARNING: Voting Classifier could not be scored; no ensemble will be used.")
        print("‚ö† Skipping Stacking Classifier in FAST_MODE")

# ============================================================================
# 15. FINAL MODEL SELECTION
# ============================================================================
# Picks the estimator used for the test predictions.  Single-target order of
# preference: ensemble, best tuned model (by validation accuracy), best model
# of the comparison step, and finally a freshly fitted baseline.  Multi-target
# mode always fits a multi-output Random Forest.

print("\n" + "="*80)
print("FINAL MODEL SELECTION")
print("="*80)

if not is_multi_target:
    if ensemble_model is not None:
        final_model = ensemble_model
        print("Using ENSEMBLE model for predictions")
    elif tuned_models:
        best_tuned = max(tuned_models.items(),
                        key=lambda item: accuracy_score(y_val_enc, item[1].predict(X_val_pre)))
        final_model = best_tuned[1]
        print(f"Using TUNED {best_tuned[0]} for predictions")
    elif DO_MODEL_COMPARISON:
        final_model = model_results[best_model_name]['model']
        print(f"Using {best_model_name} for predictions")
    else:
        # With the comparison step disabled there are no fitted candidates to
        # fall back on (the lookup above would fail), so fit a baseline here.
        print("Model comparison disabled: fitting a default Random Forest for predictions")
        final_model = RandomForestClassifier(n_estimators=100, max_depth=15,
                                             random_state=RANDOM_STATE, n_jobs=N_JOBS)
        final_model.fit(X_train_pre, y_train_enc)
else:
    # For multi-target, use Random Forest with MultiOutput
    print("Using MultiOutput Random Forest for multi-target prediction")
    n_est = 300 if FAST_MODE else 500
    final_model = MultiOutputClassifier(
        RandomForestClassifier(n_estimators=n_est, max_depth=15, random_state=RANDOM_STATE, n_jobs=N_JOBS),
        n_jobs=N_JOBS
    )
    final_model.fit(X_train_pre, y_train_enc)


# ============================================================================
# 16. GENERATE PREDICTIONS
# ============================================================================
# Predicts the test set, decodes the integer codes back to the original
# labels and writes the submission CSV (with the id column when the test file
# had one).

print("", "=" * 80, "GENERATING PREDICTIONS", "=" * 80, sep="\n")

test_pred_enc = final_model.predict(test_data_pre)

# --- Decode predictions ---------------------------------------------------------
if is_multi_target:
    # Prediction column i belongs to target_cols[i]; each target is decoded
    # with its own encoder.
    test_pred_decoded = pd.DataFrame({
        target_col: label_encoders[target_col].inverse_transform(test_pred_enc[:, position])
        for position, target_col in enumerate(target_cols)
    })
else:
    test_pred_decoded = le.inverse_transform(test_pred_enc)

# --- Assemble the submission frame -----------------------------------------------
has_ids = test_ids is not None
if is_multi_target and has_ids:
    submission_df = pd.DataFrame({ID_COL: test_ids})
    for target_col in target_cols:
        # .values sidesteps index alignment between the two frames.
        submission_df[target_col] = test_pred_decoded[target_col].values
elif is_multi_target:
    submission_df = test_pred_decoded.copy()
elif has_ids:
    submission_df = pd.DataFrame({
        ID_COL: test_ids,
        target_cols[0]: test_pred_decoded,
    })
else:
    submission_df = pd.DataFrame({target_cols[0]: test_pred_decoded})

submission_df.to_csv(OUTPUT_FILE, index=False)
print(f"\n‚úì Submission saved to '{OUTPUT_FILE}'")

print("\nSubmission preview:")
print(submission_df.head(10))

print("", "=" * 80, "‚úì PROCESS COMPLETED SUCCESSFULLY!", "=" * 80, sep="\n")