In [None]:
# 1. SETUP AND DATA LOADING
# ==================================
import numpy as np
import pandas as pd
import warnings
import os
import optuna
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Read the competition train/test files plus the two original-dataset CSVs
print("Loading data...")
train_df_orig = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test_df_orig = pd.read_csv("/kaggle/input/playground-series-s5e7/test.csv")
# NOTE(review): the next two files differ only by one letter in their names
# ("datasert" vs "dataset") and both are appended to the training data below —
# confirm they are distinct datasets and not two copies of the same rows.
org_df = pd.read_csv('/kaggle/input/extrovert-vs-introvert-behavior-data/personality_datasert.csv')
org_df_ = pd.read_csv('/kaggle/input/extrovert-vs-introvert-behavior-data/personality_dataset.csv')
print("Data loaded successfully.")

# Stack the three labelled frames row-wise under a fresh 0..n-1 index
train = pd.concat((train_df_orig, org_df, org_df_), ignore_index=True)

# Feature frame for the test set (no 'id' column); the ids are kept separately
test = test_df_orig.drop(columns=['id'])
test_id = test_df_orig['id']


In [None]:
# 2. DATA PREPROCESSING
# ==================================
print("Starting data preprocessing...")


def _encode_labels(column, mapping):
    """Return ``column`` with its string labels translated through ``mapping``.

    ``Series.map`` turns every value that is not a key of ``mapping`` into NaN,
    so mapping a column that this cell has already encoded would wipe it.
    A column that is already numeric is therefore returned unchanged, which
    keeps the cell safe to re-run in the same kernel.

    Args:
        column: pandas Series holding either the raw labels or their codes.
        mapping: dict from raw label to numeric code.

    Returns:
        The encoded Series; labels missing from ``mapping`` become NaN.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return column.map(mapping)


# Yes/No answers become 1/0 in both the training and the test frame
yes_no_codes = {'Yes': 1, 'No': 0}
for frame in (train, test):
    for binary_col in ('Stage_fear', 'Drained_after_socializing'):
        frame[binary_col] = _encode_labels(frame[binary_col], yes_no_codes)

# Target encoding: Extrovert -> 0, Introvert -> 1
train['Personality'] = _encode_labels(train['Personality'], {'Extrovert': 0, 'Introvert': 1})

# The model features are exactly the columns of the test frame
feature_cols = test.columns
X = train[feature_cols]
y = train['Personality']

# Standardise first, then fill the gaps with a 5-nearest-neighbour imputer
# that works on the standardised values.
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5)

# NOTE(review): both transformers are fitted on every training row, including
# the rows the tuning cell later holds out as X_val, so validation scores are
# computed on data the preprocessing has already seen.
X_scaled = scaler.fit_transform(X)
X_imputed = imputer.fit_transform(X_scaled)

# The test set only goes through transform(), never fit()
test_scaled = scaler.transform(test)
test_imputed = imputer.transform(test_scaled)

# Wrap the arrays back into DataFrames carrying the feature names
X_processed = pd.DataFrame(X_imputed, columns=feature_cols)
test_processed = pd.DataFrame(test_imputed, columns=feature_cols)
print("Preprocessing complete.")


In [None]:
# 3. HYPERPARAMETER TUNING WITH OPTUNA
# ==================================

# Hold out 20% of the processed training rows for scoring Optuna trials;
# stratifying on y keeps the class ratio equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Score one Optuna trial of the XGBoost + CatBoost + LightGBM ensemble.

    Hyperparameters for the three models are drawn from ``trial``, a
    soft-voting ensemble is fitted on ``X_train`` / ``y_train`` and scored
    on ``X_val`` / ``y_val`` (all four are notebook globals).

    Args:
        trial: object exposing ``suggest_int`` and ``suggest_float``
            (an ``optuna`` trial when called from ``study.optimize``).

    Returns:
        Accuracy of the fitted ensemble on the validation split.
    """
    # -- Hyperparameters for XGBoost --
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 200, 800),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 7),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('xgb_subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.6, 1.0),
    }

    # -- Hyperparameters for CatBoost --
    cat_params = {
        'iterations': trial.suggest_int('cat_iterations', 200, 800),
        'depth': trial.suggest_int('cat_depth', 4, 8),
        'learning_rate': trial.suggest_float('cat_learning_rate', 0.01, 0.2, log=True),
        'l2_leaf_reg': trial.suggest_float('cat_l2_leaf_reg', 1, 10),
    }

    # -- Hyperparameters for LightGBM --
    lgbm_params = {
        'n_estimators': trial.suggest_int('lgbm_n_estimators', 200, 800),
        'num_leaves': trial.suggest_int('lgbm_num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('lgbm_learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('lgbm_subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('lgbm_colsample_bytree', 0.6, 1.0),
        # LightGBM only applies `subsample` when `subsample_freq` is non-zero
        # (its default of 0 disables bagging), so without this entry the
        # tuned `lgbm_subsample` value has no effect. It is registered as a
        # fixed trial parameter so that it is stored in `study.best_params`
        # and reaches the final model together with the other `lgbm_` keys.
        'subsample_freq': trial.suggest_int('lgbm_subsample_freq', 1, 1),
    }

    # Build the three base learners with fixed seeds
    xgb = XGBClassifier(**xgb_params, random_state=42, use_label_encoder=False, eval_metric='logloss')
    cat = CatBoostClassifier(**cat_params, random_seed=42, verbose=0)
    lgbm = LGBMClassifier(**lgbm_params, random_state=42)

    # Soft voting: the ensemble is built with voting='soft'
    ensemble = VotingClassifier(estimators=[('xgb', xgb), ('cat', cat), ('lgbm', lgbm)], voting='soft')

    # Fit on the training split, score on the validation split
    ensemble.fit(X_train, y_train)
    val_preds = ensemble.predict(X_val)
    accuracy = accuracy_score(y_val, val_preds)

    return accuracy

print("Starting Optuna hyperparameter search...")
# Seed the sampler so that the sequence of suggested hyperparameters is the
# same on every Restart & Run All. TPESampler is what create_study() uses by
# default for a single objective, so only the seeding is new.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=15)

# Report the best validation accuracy and the parameters that produced it
print(f"\nBest trial accuracy: {study.best_value:.4f}")
print("Best hyperparameters found:")
print(study.best_params)


In [None]:
# 4. FINAL MODEL TRAINING (SUBMISSION STEPS REMOVED)
# ===================================================
print("\nTraining final model on all data with best hyperparameters...")


def _strip_prefix(params, prefix):
    """Select the entries of ``params`` whose key starts with ``prefix``.

    Only the leading ``prefix`` is removed from each key; ``str.replace``
    would also delete later occurrences of the same text inside the key.

    Args:
        params: dict of prefixed hyperparameter names to values.
        prefix: model tag including the trailing underscore, e.g. ``'xgb_'``.

    Returns:
        New dict with the matching entries, keyed without the prefix.
    """
    return {
        name[len(prefix):]: value
        for name, value in params.items()
        if name.startswith(prefix)
    }


# Split the flat Optuna result into one kwargs dict per model
best_params = study.best_params
final_xgb_params = _strip_prefix(best_params, 'xgb_')
final_cat_params = _strip_prefix(best_params, 'cat_')
final_lgbm_params = _strip_prefix(best_params, 'lgbm_')

# Rebuild the three base learners with the tuned values and the same seeds
final_xgb = XGBClassifier(**final_xgb_params, random_state=42, use_label_encoder=False, eval_metric='logloss')
final_cat = CatBoostClassifier(**final_cat_params, random_seed=42, verbose=0)
final_lgbm = LGBMClassifier(**final_lgbm_params, random_state=42)

# Same soft-voting ensemble layout as in the tuning objective
final_ensemble = VotingClassifier(
    estimators=[('xgb', final_xgb), ('cat', final_cat), ('lgbm', final_lgbm)],
    voting='soft'
)

# Fit on every processed training row
final_ensemble.fit(X_processed, y)

# Sanity check only: X_val was split off X_processed, so these rows were part
# of the fit above and the score is in-sample, not a held-out estimate. The
# held-out figure for these hyperparameters is study.best_value.
val_preds_final = final_ensemble.predict(X_val)
final_val_accuracy = accuracy_score(y_val, val_preds_final)
print(f"Final ensemble accuracy on validation rows (in-sample, seen during final fit): {final_val_accuracy:.4f}")
