In [None]:
# ============================================
# UNIVERSAL KAGGLE ML PIPELINE (Classification)
# Auto Model Selection + Imputer + Label Encode
# 100% Stable for ANY Dataset
# ============================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# -----------------------------------------------
# LOAD DATASETS
# -----------------------------------------------
# Read the three competition CSVs in one pass; the unpacking order follows
# the order of the paths listed below.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ai-201-b-makeup-exam-aiml/train.csv",
        "/kaggle/input/ai-201-b-makeup-exam-aiml/test.csv",
        "/kaggle/input/ai-201-b-makeup-exam-aiml/sample_submission.csv",
    )
)

# Report the (rows, columns) shape of each frame as a quick sanity check.
for shape_label, frame in (
    ("Train shape:", train_df),
    ("Test shape:", test_df),
    ("Sample submission shape:", sample_submission),
):
    print(shape_label, frame.shape)

# -----------------------------------------------
# TARGET (last column)
# -----------------------------------------------
# The label is taken to be whichever column comes last in train.csv.
target = train_df.columns[-1]

# Integer-encode the labels; `le` is kept so that model predictions can be
# mapped back to the original label values later on.
le = LabelEncoder()
y = le.fit_transform(train_df[target])

# Feature matrix: every training column except the label.
X = train_df.drop(target, axis=1)

# The model is fed an independent copy of the test frame.
X_test_final = test_df.copy()

# -----------------------------------------------
# TRAIN-VALIDATION SPLIT
# -----------------------------------------------
# Hold out 20% of the rows for validation, stratified on the encoded labels
# and seeded so the split is repeatable.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_kwargs)

# -----------------------------------------------
# DETECT COLUMN TYPES
# -----------------------------------------------
# Every feature column must land in exactly one of the two lists below.
# The ColumnTransformer built from these lists does not set `remainder`, and
# its default is to drop unlisted columns, so a dtype matched by neither
# selector would silently disappear from the model input.
#
# Numeric branch: all numeric dtypes (not only int64/float64) plus booleans,
# which pandas infers for True/False CSV columns.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()

# Categorical branch: everything that is not numeric/bool (object, category,
# string dtypes, ...), so the two lists together cover all of X.
# NOTE(review): an identifier column, if X contains one, is treated as an
# ordinary feature here - confirm that this is intended.
cat_cols = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()

# -----------------------------------------------
# PREPROCESSOR (Impute + OneHot + Scale)
# -----------------------------------------------
# Categorical branch: fill missing entries with the most frequent value,
# then one-hot encode with handle_unknown="ignore".
categorical_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: fill missing entries with the column mean, then standardize.
numeric_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Route each column list through its branch; the branch order (cat, num)
# determines the column order of the transformed output.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_branch, cat_cols),
        ("num", numeric_branch, num_cols),
    ]
)

# -----------------------------------------------
# MODELS
# -----------------------------------------------
# Candidate classifiers, keyed by the display name used in the training log.
#
# Every estimator gets the same fixed seed: the train/validation split is
# already seeded, so without this the automatic model selection could still
# pick a different winner from one run to the next (random forests and
# row/column subsampling in the boosters are stochastic).
MODEL_SEED = 42

models = {
    # `use_label_encoder` is intentionally not passed: it is deprecated and
    # ignored by current XGBoost releases, and the labels are already
    # integer-encoded. `eval_metric` is left at its default so XGBoost picks
    # the metric matching the objective (binary or multiclass) itself.
    "XGBoost": XGBClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        random_state=MODEL_SEED,
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=-1,
        n_jobs=-1,
        random_state=MODEL_SEED,
    ),
    "CatBoost": CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        thread_count=-1,
        random_seed=MODEL_SEED,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        n_jobs=-1,
        random_state=MODEL_SEED,
    ),
    "LogisticRegression": LogisticRegression(
        max_iter=2000,
        n_jobs=-1,
        random_state=MODEL_SEED,
    ),
}

# -----------------------------------------------
# MODEL TRAINING LOOP
# -----------------------------------------------
# Each candidate is wrapped in a pipeline with the preprocessor, fitted on the
# training split and scored on the validation split. The pipeline with the
# lowest validation log loss is kept as `best_model`; its accuracy and name
# are recorded alongside for the final report.
best_model = None
best_logloss = np.inf
best_acc = 0.0
best_name = ""

for name, model in models.items():
    print(f"\nTraining {name} ...")

    # NOTE: every pipeline holds the same `preprocess` object, so each fit()
    # below refits that one shared transformer on X_train.
    pipe = Pipeline([
        ("pre", preprocess),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    preds = pipe.predict(X_valid)
    prob = pipe.predict_proba(X_valid)

    acc = accuracy_score(y_valid, preds)
    # Pass the classes the model was fitted on explicitly. Without `labels`,
    # log_loss infers the class set from y_valid alone and raises when a rare
    # class is absent from the validation split, because `prob` then has more
    # columns than there are distinct labels in y_valid.
    ll = log_loss(y_valid, prob, labels=pipe.classes_)

    print(f"===== {name} =====")
    print("Accuracy:", acc)
    print("LogLoss :", ll)

    # Strict "<" keeps the earliest candidate when two models tie.
    if ll < best_logloss:
        best_logloss = ll
        best_acc = acc
        best_model = pipe
        best_name = name

# -----------------------------------------------
# BEST MODEL
# -----------------------------------------------
# Banner first (one line per argument), then one "caption value" line for
# each recorded statistic of the winning model.
print(
    "\n===============================",
    " BEST MODEL SELECTED AUTOMATICALLY ",
    "===============================",
    sep="\n",
)
for caption, value in (
    ("Model      :", best_name),
    ("Accuracy   :", best_acc),
    ("LogLoss    :", best_logloss),
):
    print(caption, value)

# -----------------------------------------------
# FINAL PREDICTIONS
# -----------------------------------------------
# Flatten the predictions: some estimators (CatBoost on multiclass problems,
# for instance) may return an (n, 1) column rather than a 1-D array, and the
# label decoder below expects one dimension.
final_preds = np.asarray(best_model.predict(X_test_final)).ravel()

# -----------------------------------------------
# SAFE SUBMISSION (matches test.csv rows)
# -----------------------------------------------
# The first column of the sample submission is treated as the ID column.
id_col = sample_submission.columns[0]

if id_col in test_df.columns:
    submission_ids = test_df[id_col].to_numpy()
elif len(sample_submission) == len(test_df):
    # test.csv has no ID column: reuse the IDs listed in the sample
    # submission instead of inventing a 0..n-1 range that may not match.
    submission_ids = sample_submission[id_col].to_numpy()
else:
    # Last resort when neither source offers usable IDs.
    submission_ids = np.arange(len(test_df))

# Follow the sample submission's header when it has the usual two-column
# (ID, label) layout; otherwise keep the training target's column name.
if sample_submission.shape[1] == 2:
    label_col = sample_submission.columns[1]
else:
    label_col = target

# Decode the integer predictions back to the original label values.
submission = pd.DataFrame({
    id_col: submission_ids,
    label_col: le.inverse_transform(final_preds),
})

submission.to_csv("submission_final.csv", index=False)
print("\nsubmission_final.csv saved!")
print(submission.head())