# In [None]:
# importing the useful libraries for the work
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, log_loss, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from scipy import stats
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


# 1. USER CONFIGURATION
# Everything a user is expected to edit lives here: input locations, column
# roles, the output file name and the switches for the expensive steps.

# Both CSVs are read from the same dataset directory.
DATA_DIR = "/kaggle/input/mse-2-ai-201-b-ai-d"
TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH = f"{DATA_DIR}/test.csv"
OUTPUT_FILE = "submission.csv"

# Column roles: the label to predict and the row identifier.
TARGET_COL, ID_COL = "Class", "id"

# Toggle heavy steps
DO_PLOTTING, DO_OUTLIER_CAP, DO_HYPERPARAM_TUNING = True, False, True

# Seed shared by the split, the forests and the randomized search.
RANDOM_STATE = 42


# 2. Load Data
# Read both CSVs, report their shapes, and split the identifier column off so
# it is never handed to the model as a feature.
print("Loading data...")
train_data, test_data = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

# The test ids are kept aside (they are needed when the submission is written)
# and then removed from the test feature table.
if ID_COL in test_data.columns:
    test_ids = test_data[ID_COL].copy()
    test_data = test_data.drop(columns=[ID_COL])
else:
    test_ids = None

# The training table only needs the column removed.
if ID_COL in train_data.columns:
    train_data = train_data.drop(columns=[ID_COL])


# DATA CLEANING
# Print a structural overview of both tables, drop exact duplicate training
# rows, and summarise where values are missing.

print("\n=== Data Cleaning ===")

# 1) Basic info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nTrain info:")
train_data.info()
print("\nTest info:")
test_data.info()

# 2) Duplicates
train_dups = train_data.duplicated().sum()
print(f"\nDuplicate rows in train: {train_dups}")
if train_dups > 0:
    print("Dropping duplicate rows from train.")
    # Re-index so positional and label-based row access agree afterwards.
    train_data = train_data.drop_duplicates().reset_index(drop=True)

# 3) Missing value summary - ENHANCED
# Per-column null counts; the percentage is taken over all cells of the table.
print("\n=== Missing Values Analysis ===")
train_missing = train_data.isnull().sum()
test_missing = test_data.isnull().sum()

print("\nMissing values in TRAIN:")
print(train_missing[train_missing > 0])
print(f"\nTotal missing values in train: {train_missing.sum()}")
print(f"Percentage: {(train_missing.sum() / (train_data.shape[0] * train_data.shape[1]) * 100):.2f}%")

print("\nMissing values in TEST:")
print(test_missing[test_missing > 0])
print(f"\nTotal missing values in test: {test_missing.sum()}")
print(f"Percentage: {(test_missing.sum() / (test_data.shape[0] * test_data.shape[1]) * 100):.2f}%")

# 4) Detect columns with excessive missing values (>50%)
# Only reported here; nothing is dropped automatically.
high_missing_cols = train_missing[train_missing > len(train_data) * 0.5].index.tolist()
if high_missing_cols:
    print(f"\nColumns with >50% missing values: {high_missing_cols}")
    print("Consider dropping these columns or using special imputation strategies.")

# 5) Inconsistent categorical values
# List the distinct values of every non-target text/category column so that
# inconsistent spellings can be spotted by eye. Columns with more than 20
# distinct values only get their count printed.
cat_columns_guess = [
    name
    for name in train_data.select_dtypes(include=['object', 'category']).columns
    if name != TARGET_COL
]

print(f"\nDetected categorical columns: {cat_columns_guess}")
for c in cat_columns_guess:
    unique_vals = train_data[c].dropna().unique()
    n_unique = len(unique_vals)
    if n_unique > 20:
        print(f" Column `{c}` has {n_unique} unique values")
        continue
    print(f" Column `{c}` unique values: {unique_vals}")

# 6) Target distribution check - ENHANCED for multiclass
# Fail fast when the label column is absent, then report how the classes are
# distributed and warn when the rarest class is below 5% of the rows.
if TARGET_COL not in train_data.columns:
    raise ValueError(f"Target column {TARGET_COL} not found in train data.")

print("\n=== Target Distribution (Multiclass) ===")
target_series = train_data[TARGET_COL]
target_counts = target_series.value_counts()
target_props = target_series.value_counts(normalize=True)

print("\nClass counts:")
print(target_counts)
print("\nClass proportions:")
print(target_props)

n_classes = len(target_counts)
print(f"\nNumber of classes: {n_classes}")

# A classifier needs at least two distinct labels to learn anything.
if n_classes < 2:
    raise ValueError("Target column has less than 2 classes. Cannot perform classification.")

min_class_prop = target_props.min()
if min_class_prop < 0.05:
    print(f"WARNING: Severe class imbalance detected. Minimum class proportion: {min_class_prop:.4f}")
    print("Using class_weight='balanced' to handle imbalance.")

# Snapshot of the cleaned training table, used only by the plotting code.
train_viz = train_data.copy()


# PREPROCESSING
# Split the training table into features and label, discard rows whose label
# is missing, and classify the feature columns by dtype.

print("\n=== Feature Engineering ===")

y = train_data[TARGET_COL]
X = train_data.drop(columns=[TARGET_COL])

# Rows without a label cannot be used for supervised training.
n_missing_target = y.isnull().sum()
if n_missing_target > 0:
    print(f"\nWARNING: {n_missing_target} NaN values found in target variable.")
    print("Dropping rows with NaN target values...")
    valid_idx = y.notna()
    X = X.loc[valid_idx].reset_index(drop=True)
    y = y.loc[valid_idx].reset_index(drop=True)
    print(f"New training shape: {X.shape}")

# Dynamic feature detection: text/category columns versus numeric columns.
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)
num_cols = list(X.select_dtypes(include=['number']).columns)

print(f"\nDetected {len(cat_cols)} categorical columns: {cat_cols}")
print(f"Detected {len(num_cols)} numerical columns: {num_cols}")

# Report every shared column whose dtype differs between train and test.
# Columns that are absent from the test table are skipped silently.
print("\nVerifying test data columns match train...")
shared_columns = [name for name in X.columns if name in test_data.columns]
for col in shared_columns:
    train_dtype, test_dtype = X[col].dtype, test_data[col].dtype
    if train_dtype != test_dtype:
        print(f"WARNING: Column '{col}' has different dtypes: train={train_dtype}, test={test_dtype}")


# OUTLIER ANALYSIS & HANDLING
# Exploratory plots. The numeric plots need at least one numeric column; the
# categorical countplots and the target plot only need plotting to be enabled
# (previously they were skipped whenever the table had no numeric column).

print("\n=== Outlier Analysis ===")


def _plot_numeric_grid(draw_one, title_for, per_row=3):
    """Draw one subplot per numeric column in a grid and show the figure.

    draw_one receives the column's non-null values from train_viz and draws
    into the current axes; title_for maps the column name to its title.
    """
    n_rows = (len(num_cols) + per_row - 1) // per_row  # ceiling division
    plt.figure(figsize=(5 * per_row, 4 * n_rows))
    for slot, name in enumerate(num_cols, start=1):
        plt.subplot(n_rows, per_row, slot)
        draw_one(train_viz[name].dropna())
        plt.title(title_for(name))
    plt.tight_layout()
    plt.show()


if DO_PLOTTING and len(num_cols) > 0:
    # Histograms for numeric columns
    _plot_numeric_grid(lambda values: sns.histplot(values, kde=True),
                       lambda name: name)

    # Boxplots to visualize outliers
    _plot_numeric_grid(lambda values: sns.boxplot(x=values),
                       lambda name: f"Boxplot: {name}")

    # Correlation heatmap for numerical features (needs at least two columns)
    if len(num_cols) > 1:
        plt.figure(figsize=(10, 8))
        corr = train_viz[num_cols].corr()
        sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
        plt.title("Correlation matrix (numeric features)")
        plt.show()

if DO_PLOTTING:
    # Countplots for categorical features, limited to the 20 most frequent
    # values so high-cardinality columns stay readable.
    for c in cat_cols:
        plt.figure(figsize=(8, 5))
        top_cats = train_viz[c].value_counts().head(20).index
        sns.countplot(y=c, data=train_viz[train_viz[c].isin(top_cats)],
                      order=top_cats)
        plt.title(f"Top 20 counts for {c}")
        plt.tight_layout()
        plt.show()

    # Target distribution plot - ENHANCED for multiclass
    plt.figure(figsize=(10, 6))
    train_viz[TARGET_COL].value_counts().plot(kind='bar')
    plt.title(f"Target Variable Distribution ({TARGET_COL})")
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# IQR based outlier detection
def _iqr_profile(values):
    """Return the 1.5*IQR fences of a column and how many values fall outside."""
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    return {
        "lower": low_fence,
        "upper": high_fence,
        "n_lower": int((values < low_fence).sum()),
        "n_upper": int((values > high_fence).sum()),
    }


# One profile per numeric training column, keyed by column name.
outlier_summary = {col: _iqr_profile(X[col]) for col in num_cols}

print("\nOutlier summary (IQR method):")
for col, s in outlier_summary.items():
    print(f" {col}: {s['n_lower']} below threshold, {s['n_upper']} above threshold")

# Cap outliers if enabled: the fences computed on the training data are
# applied to the training column and, when present, to the test column.
if DO_OUTLIER_CAP:
    print("\nCapping numeric outliers using IQR thresholds...")
    for col, s in outlier_summary.items():
        fences = {"lower": s["lower"], "upper": s["upper"]}
        X[col] = X[col].clip(**fences)
        if col in test_data.columns:
            test_data[col] = test_data[col].clip(**fences)
    print("Outlier capping completed.")


# MISSING VALUES IMPUTATION - ENHANCED
# Fill gaps in the feature tables: median for numeric columns, most frequent
# value for categorical columns. Both imputers are fit on the training
# features and then applied unchanged to the test features.
#
# NOTE(review): the imputers are fit on all training rows, i.e. before the
# train/validation split that happens later in this script, so the fill values
# are computed partly from rows that end up in the validation set.

print("\n=== Advanced Missing Value Imputation ===")

# keep_empty_features=True keeps a column that is entirely NaN at fit time
# (filled with 0) instead of dropping it. A dropped column would make the
# returned array narrower than the column list and break the assignment back
# into X[...] / test_data[...].

# Strategy 1: For numeric columns - use median (more robust than mean)
if num_cols:
    print("\nImputing numeric columns with MEDIAN (robust to outliers)...")
    num_imputer = SimpleImputer(strategy='median', keep_empty_features=True)
    X[num_cols] = num_imputer.fit_transform(X[num_cols])
    test_data[num_cols] = num_imputer.transform(test_data[num_cols])
    print("Numeric imputation completed.")

# Strategy 2: For categorical columns - use most frequent (mode)
if cat_cols:
    print("\nImputing categorical columns with MOST_FREQUENT...")
    cat_imputer = SimpleImputer(strategy='most_frequent', keep_empty_features=True)
    X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
    test_data[cat_cols] = cat_imputer.transform(test_data[cat_cols])
    print("Categorical imputation completed.")

# Verify no missing values remain
print("\n=== Post-Imputation Verification ===")
train_nulls_by_col = X.isnull().sum()
test_nulls_by_col = test_data.isnull().sum()
train_missing_after = train_nulls_by_col.sum()
test_missing_after = test_nulls_by_col.sum()

print(f"Missing values in train after imputation: {train_missing_after}")
print(f"Missing values in test after imputation: {test_missing_after}")

# Columns that are neither numeric nor object/category are not imputed above,
# so any remaining gaps are listed per column.
if train_missing_after > 0 or test_missing_after > 0:
    print("\nWARNING: Some missing values still remain!")
    print("\nTrain columns with missing values:")
    print(train_nulls_by_col[train_nulls_by_col > 0])
    print("\nTest columns with missing values:")
    print(test_nulls_by_col[test_nulls_by_col > 0])


# DEFINE PREPROCESSOR - ENHANCED
# Categorical columns are one-hot encoded (categories not seen during fit are
# ignored at transform time), numeric columns are standardised, and every
# remaining column is passed through untouched.

print("\n=== Building Preprocessing Pipeline ===")

categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
numeric_step = ('num', StandardScaler(), num_cols)

preprocessor = ColumnTransformer(
    transformers=[categorical_step, numeric_step],
    remainder='passthrough',  # Keep any other columns as-is
)

print("Preprocessing pipeline created.")


# TRAIN/VALIDATION SPLIT - STRATIFIED for multiclass
# Hold out 30% of the training rows for validation, then fit the preprocessor
# on the remaining 70% and apply it to all three tables.

print("\n=== Creating Train/Validation Split ===")

# Stratifying on y keeps the class proportions the same in both parts.
split_options = dict(test_size=0.3, random_state=RANDOM_STATE, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f"Train set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print("\nTrain set class distribution:")
print(y_train.value_counts())
print("\nValidation set class distribution:")
print(y_val.value_counts())

# The preprocessor learns its encoding/scaling from the training part only.
print("\nTransforming data (fitting preprocessor on train)...")
X_train_pre = preprocessor.fit_transform(X_train)
X_val_pre = preprocessor.transform(X_val)
test_data_pre = preprocessor.transform(test_data)

print(f"Transformed train shape: {X_train_pre.shape}")
print(f"Transformed validation shape: {X_val_pre.shape}")
print(f"Transformed test shape: {test_data_pre.shape}")


# LABEL ENCODING - ENHANCED for multiclass
# Map the class labels to integer codes. The encoder is fit on the training
# labels and reused for the validation labels; it is needed again later to
# turn predicted codes back into the original labels.

print("\n=== Label Encoding Target Variable ===")

le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)

print(f"Number of unique classes: {len(le.classes_)}")
print(f"Class labels: {le.classes_}")
print(f"Encoded values: {np.unique(y_train_enc)}")


# BASELINE MODEL - ENHANCED for multiclass
# Fit a fixed-configuration random forest on the preprocessed training part
# and report ROC AUC, log loss and a per-class report; the validation figures
# are the reference the tuned model is compared against.

print("\n=== Baseline RandomForest Training (Multiclass) ===")

baseline_rfc = RandomForestClassifier(
    n_estimators=1000,
    random_state=RANDOM_STATE,
    class_weight='balanced',  # Handle class imbalance
    n_jobs=-1,
    max_depth=20,  # Prevent overfitting
    min_samples_split=5
)

print("Training baseline model...")
baseline_rfc.fit(X_train_pre, y_train_enc)
print("Baseline model training completed.")

# Baseline predictions (one probability column per encoded class)
train_proba_base = baseline_rfc.predict_proba(X_train_pre)
val_proba_base = baseline_rfc.predict_proba(X_val_pre)

# Metrics for baseline - ENHANCED for multiclass
n_classes_base = len(le.classes_)
print(f"\nNumber of classes: {n_classes_base}")

if n_classes_base == 2:
    # Binary classification: score with the probability of the positive class
    roc_train_base = roc_auc_score(y_train_enc, train_proba_base[:, 1])
    roc_val_base = roc_auc_score(y_val_enc, val_proba_base[:, 1])
else:
    # Multiclass classification - use 'ovr' (one-vs-rest)
    roc_train_base = roc_auc_score(y_train_enc, train_proba_base,
                                   multi_class='ovr', average='macro')
    roc_val_base = roc_auc_score(y_val_enc, val_proba_base,
                                 multi_class='ovr', average='macro')

loss_train_base = log_loss(y_train_enc, train_proba_base)
loss_val_base = log_loss(y_val_enc, val_proba_base)

print("\n=== Baseline Model Performance ===")
print(f"Training ROC AUC (macro): {roc_train_base:.4f}")
print(f"Validation ROC AUC (macro): {roc_val_base:.4f}")
print(f"Training Log Loss: {loss_train_base:.4f}")
print(f"Validation Log Loss: {loss_val_base:.4f}")

# Classification report for validation set
# classification_report measures each display name with len(), so numeric
# class labels must be converted to strings first. Passing the encoded ids as
# `labels` keeps one report row per class in the same order as the names.
report_labels = np.arange(n_classes_base)
report_names = [str(label) for label in le.classes_]
y_val_pred = baseline_rfc.predict(X_val_pre)
print("\nValidation Classification Report:")
print(classification_report(y_val_enc, y_val_pred,
                            labels=report_labels, target_names=report_names))


# HYPERPARAMETER TUNING - ENHANCED for multiclass
# Optional randomized search over random-forest settings. When enabled, the
# best estimator is evaluated exactly like the baseline and the two are
# compared on the validation part. tuned_rfc stays None when tuning is off.

tuned_rfc = None
if DO_HYPERPARAM_TUNING:
    print("\n=== Hyperparameter Tuning (RandomizedSearchCV) ===")

    # Candidate values sampled by the randomized search
    param_dist = {
        "n_estimators": [200, 500, 800, 1000, 1500],
        "max_depth": [None, 10, 15, 20, 25],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2", 0.3, 0.5],
        "criterion": ["gini", "entropy"]
    }

    # Choose appropriate scoring metric
    scoring = 'roc_auc_ovr' if n_classes_base > 2 else 'roc_auc'

    rnd_search = RandomizedSearchCV(
        estimator=RandomForestClassifier(
            random_state=RANDOM_STATE,
            class_weight='balanced',
            n_jobs=-1
        ),
        param_distributions=param_dist,
        n_iter=30,  # Increased for better search
        scoring=scoring,
        cv=5,  # 5-fold CV for better estimate
        verbose=2,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    print("Starting hyperparameter search (this may take a while)...")
    rnd_search.fit(X_train_pre, y_train_enc)

    print("\n=== Tuning Results ===")
    print("Best parameters found:")
    for param, value in rnd_search.best_params_.items():
        print(f"  {param}: {value}")
    print(f"\nBest CV score: {rnd_search.best_score_:.4f}")

    tuned_rfc = rnd_search.best_estimator_

    # Evaluate tuned model
    train_proba_tuned = tuned_rfc.predict_proba(X_train_pre)
    val_proba_tuned = tuned_rfc.predict_proba(X_val_pre)

    if n_classes_base == 2:
        # Binary: score with the probability of the positive class
        roc_train_tuned = roc_auc_score(y_train_enc, train_proba_tuned[:, 1])
        roc_val_tuned = roc_auc_score(y_val_enc, val_proba_tuned[:, 1])
    else:
        roc_train_tuned = roc_auc_score(y_train_enc, train_proba_tuned,
                                        multi_class='ovr', average='macro')
        roc_val_tuned = roc_auc_score(y_val_enc, val_proba_tuned,
                                      multi_class='ovr', average='macro')

    loss_train_tuned = log_loss(y_train_enc, train_proba_tuned)
    loss_val_tuned = log_loss(y_val_enc, val_proba_tuned)

    print("\n=== Tuned Model Performance ===")
    print(f"Training ROC AUC (macro): {roc_train_tuned:.4f}")
    print(f"Validation ROC AUC (macro): {roc_val_tuned:.4f}")
    print(f"Training Log Loss: {loss_train_tuned:.4f}")
    print(f"Validation Log Loss: {loss_val_tuned:.4f}")

    # Classification report for tuned model
    # classification_report measures each display name with len(), so numeric
    # class labels must be converted to strings first. Passing the encoded ids
    # as `labels` keeps one report row per class in the same order as the names.
    tuned_report_labels = np.arange(len(le.classes_))
    tuned_report_names = [str(label) for label in le.classes_]
    y_val_pred_tuned = tuned_rfc.predict(X_val_pre)
    print("\nTuned Model - Validation Classification Report:")
    print(classification_report(y_val_enc, y_val_pred_tuned,
                                labels=tuned_report_labels,
                                target_names=tuned_report_names))

    # Improvement comparison (positive numbers mean the tuned model is better)
    print("\n=== Improvement Summary ===")
    print(f"ROC AUC improvement: {(roc_val_tuned - roc_val_base):.4f}")
    print(f"Log Loss improvement: {(loss_val_base - loss_val_tuned):.4f}")
else:
    print("\nHyperparameter tuning skipped by configuration.")


# SELECT MODEL FOR SUBMISSION
# The tuned forest is used whenever tuning ran and produced an estimator;
# otherwise the baseline forest is used.

use_tuned = bool(DO_HYPERPARAM_TUNING and tuned_rfc is not None)

if use_tuned:
    model_for_submission = tuned_rfc
    print("\n✓ Using TUNED model for final predictions.")
else:
    model_for_submission = baseline_rfc
    print("\n✓ Using BASELINE model for final predictions.")


# FEATURE IMPORTANCE ANALYSIS
# Rank the transformed features by the importance the selected forest assigns
# to them, print the top 20 and optionally draw them as a bar chart.

print("\n=== Feature Importance Analysis ===")

importances = model_for_submission.feature_importances_

# Ask the fitted ColumnTransformer for the output names so that every output
# column is covered, including columns forwarded by remainder='passthrough'.
# Names come back as "<transformer>__<feature>"; the prefix is stripped for
# display.
try:
    feature_names = [
        str(name).split("__", 1)[-1]
        for name in preprocessor.get_feature_names_out()
    ]
except (AttributeError, ValueError):
    # Names unavailable (e.g. a transformer without get_feature_names_out).
    feature_names = []

# The names must line up one-to-one with the importances, otherwise indexing
# by importance rank would hit the wrong name or run past the end of the list.
if len(feature_names) != len(importances):
    print("Using simplified feature names...")
    feature_names = [f"feature_{i}" for i in range(len(importances))]

indices = np.argsort(importances)[::-1]  # most important first
top_n = min(20, len(feature_names))
top_indices = indices[:top_n]

print("\nTop 20 most important features:")
for rank, idx in enumerate(top_indices, start=1):
    print(f"{rank}. {feature_names[idx]}: {importances[idx]:.4f}")

if DO_PLOTTING and top_n > 0:
    plt.figure(figsize=(10, 8))
    # Reversed so the most important feature ends up at the top of the chart.
    plt.barh(range(top_n), importances[top_indices][::-1])
    plt.yticks(range(top_n), [feature_names[i] for i in top_indices][::-1])
    plt.xlabel('Feature Importance')
    plt.title('Top 20 Feature Importances')
    plt.tight_layout()
    plt.show()


# CONFUSION MATRIX VISUALIZATION
# Heatmap of validation predictions from the selected model against the true
# labels, with the original class labels on both axes.

if DO_PLOTTING:
    print("\n=== Confusion Matrix ===")
    y_val_pred_final = model_for_submission.predict(X_val_pre)

    # Fix the row/column order to the encoder's classes. Without `labels` the
    # matrix only covers classes that occur in the two arrays, so it could be
    # smaller than le.classes_ and the tick labels would be misaligned.
    cm_labels = np.arange(len(le.classes_))
    cm = confusion_matrix(y_val_enc, y_val_pred_final, labels=cm_labels)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title('Confusion Matrix - Validation Set')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()


# GENERATE SUBMISSION - ENHANCED
# Predict the test set with the selected model, turn the encoded predictions
# back into the original labels and write them to OUTPUT_FILE.

print("\n=== Generating Final Submission ===")

# Hard predictions, decoded through the label encoder
test_pred_enc = model_for_submission.predict(test_data_pre)
test_pred = le.inverse_transform(test_pred_enc)

# Class probabilities, kept for the confidence analysis
test_pred_proba = model_for_submission.predict_proba(test_data_pre)

# The id column is only part of the submission when the test file had one.
submission_columns = {}
if test_ids is not None:
    submission_columns[ID_COL] = test_ids
submission_columns[TARGET_COL] = test_pred
submission_df = pd.DataFrame(submission_columns)

# Save submission
submission_df.to_csv(OUTPUT_FILE, index=False)
print(f"\n✓ Submission saved to {OUTPUT_FILE}")

# Display submission statistics
# Summarise what was predicted and how confident the model was.
predicted_classes = submission_df[TARGET_COL]

print("\n=== Submission Statistics ===")
print(f"Total predictions: {len(submission_df)}")
print("\nPredicted class distribution:")
print(predicted_classes.value_counts())
print("\nPredicted class proportions:")
print(predicted_classes.value_counts(normalize=True))

print("\nSubmission preview:")
print(submission_df.head(10))

# Prediction confidence analysis: the confidence of a row is the largest
# class probability the model assigned to it.
max_proba = test_pred_proba.max(axis=1)
mean_confidence = max_proba.mean()

print("\nPrediction confidence statistics:")
print(f"Mean confidence: {mean_confidence:.4f}")
print(f"Median confidence: {np.median(max_proba):.4f}")
print(f"Min confidence: {max_proba.min():.4f}")
print(f"Max confidence: {max_proba.max():.4f}")

if DO_PLOTTING:
    # Histogram of the confidences with the mean marked by a dashed line.
    plt.figure(figsize=(10, 6))
    plt.hist(max_proba, bins=50, edgecolor='black')
    plt.xlabel('Prediction Confidence (Max Probability)')
    plt.ylabel('Frequency')
    plt.title('Distribution of Prediction Confidence')
    plt.axvline(mean_confidence, color='r', linestyle='--',
                label=f'Mean: {mean_confidence:.3f}')
    plt.legend()
    plt.tight_layout()
    plt.show()

banner = "=" * 60
print("\n" + banner)
print("✓ SUBMISSION COMPLETED SUCCESSFULLY!")
print(banner)