In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


In [2]:
from catboost import CatBoostClassifier
from sklearn.base import clone
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,roc_auc_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
# Load the competition's training and test tables into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e7/train.csv",
        "/kaggle/input/playground-series-s5e7/test.csv",
    )
)

In [3]:
# --- Numeric imputation ------------------------------------------------------
# Medians are learned from the training frame and reused for the test frame so
# that both are imputed with the same statistics.
numeric_cols_train = train.select_dtypes(include=np.number).columns
medians_train = train[numeric_cols_train].median()
train[numeric_cols_train] = train[numeric_cols_train].fillna(medians_train)

numeric_cols_test = test.select_dtypes(include=np.number).columns
# Fall back to the test median only for a numeric test column that has no
# training median available.
medians_test = (
    medians_train.reindex(numeric_cols_test)
    .fillna(test[numeric_cols_test].median())
)
test[numeric_cols_test] = test[numeric_cols_test].fillna(medians_test)

# --- Categorical encoding ----------------------------------------------------
# Each feature encoder is fitted once on the values of BOTH frames, so a given
# category always maps to the same integer code in train and in test (fitting
# a separate encoder per frame only agrees when both frames happen to contain
# exactly the same set of values).
for col in ['Stage_fear', 'Drained_after_socializing']:
    feature_encoder = LabelEncoder()
    feature_encoder.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = feature_encoder.transform(train[col])
    test[col] = feature_encoder.transform(test[col])

# `le` is used for the target only, so its fitted classes remain available
# afterwards (e.g. for le.inverse_transform on predicted labels).
le = LabelEncoder()
train['Personality'] = le.fit_transform(train['Personality'])

In [4]:
# Separate the target from the features, then hold out 24% of the rows for
# a quick evaluation of the tuned base models.
y = train['Personality']
X = train.drop('Personality', axis=1)
# NOTE(review): X keeps every non-target column of `train`; presumably that
# includes the `id` column used for the submission later — confirm it is
# meant to be a model feature.
# A fixed seed makes the split (and every score computed on it) reproducible;
# stratifying keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.24, stratify=y, random_state=42
)

In [5]:
# Hyper-parameter grid for CatBoost; GridSearchCV tries every combination.
params_catboost = dict(
    iterations=[100, 150],
    learning_rate=[0.001, 0.05, 0.1],
    depth=[6, 8, 10],
    l2_leaf_reg=[1, 3, 5],
    border_count=[16, 32],
)
catboost_model = CatBoostClassifier(verbose=0)

# 5-fold cross-validated search scored by accuracy, using all available cores.
search_settings = dict(scoring='accuracy', cv=5, n_jobs=-1)
grid_catboost = GridSearchCV(catboost_model, params_catboost, **search_settings)
grid_catboost.fit(X_train, y_train)

In [6]:
# Evaluate the tuned CatBoost model on the hold-out split.
# The search above was created without `refit=...`, so GridSearchCV's default
# (refit=True) has already refitted `best_estimator_` on X_train / y_train;
# fitting it a second time on the same data is unnecessary.
best_catboost_model = grid_catboost.best_estimator_
y_test_pred_cat = best_catboost_model.predict(X_test)
test_accuracy_cat = accuracy_score(y_test, y_test_pred_cat)
print(f"CatBoost Test Accuracy: {test_accuracy_cat:.2f}")

CatBoost Test Accuracy: 0.97


In [7]:
# Search space for the decision tree; 30 random combinations are sampled from it.
params_decision_tree = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 10,  None],
    "min_samples_split": [2, 5, 7],
    "min_samples_leaf": [1, 2, 3],
    "max_features": [None, "sqrt", "log2"]
}
# Seed the tree itself: with max_features="sqrt"/"log2" the candidate features
# at each split are drawn at random, so an unseeded tree gives different
# cross-validation scores (and possibly a different winner) on every run.
decision_tree_model = DecisionTreeClassifier(random_state=42)
random_decision_tree = RandomizedSearchCV(
    estimator=decision_tree_model,
    param_distributions=params_decision_tree,
    n_iter=30,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1
)
random_decision_tree.fit(X_train, y_train)
print("DecisionTree Best Parameters:", random_decision_tree.best_params_)

# Score the refitted best tree on the hold-out split.
y_test_pred_decision_tree = random_decision_tree.best_estimator_.predict(X_test)
test_accuracy_decision_tree = accuracy_score(y_test, y_test_pred_decision_tree)
print(f"DecisionTree Test Accuracy: {test_accuracy_decision_tree:.2f}")

DecisionTree Best Parameters: {'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 5, 'criterion': 'entropy'}
DecisionTree Test Accuracy: 0.97


In [8]:
# Keep a handle on the tuned decision tree for the stacking ensemble.
# The randomized search has already refitted `best_estimator_` on
# X_train / y_train (refit defaults to True), so it is used as-is: fitting it
# again is redundant and, for a tree without a fixed seed, could produce a
# different model from the one the search selected.
best_dt_model = random_decision_tree.best_estimator_
y_test_pred_dt = best_dt_model.predict(X_test)
test_accuracy_dt = accuracy_score(y_test, y_test_pred_dt)
print(f"DecisionTree Test Accuracy: {test_accuracy_dt:.2f}")

DecisionTree Test Accuracy: 0.97


In [9]:
from sklearn.preprocessing import LabelEncoder

# Build the model-ready copy of the test frame.
test_encoded = test.copy()
# Integer-encode any column that is still of object dtype.
for col in test_encoded.select_dtypes(include=['object']).columns:
    test_encoded[col] = LabelEncoder().fit_transform(test_encoded[col])

# Fill whatever is still missing in BOTH frames. Previously only `test` was
# filled, and only after the copy had been taken, so `test_encoded` (the frame
# the cross-validation loop predicts on) never received the fill. Rebinding
# instead of using inplace=True keeps the cell safe to re-run.
test_encoded = test_encoded.fillna(0)
test = test.fillna(0)



In [10]:
from xgboost import XGBClassifier
# Settings for an XGBoost classifier: many shallow-learning-rate rounds with
# row and column subsampling.
xgb_meta_settings = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.4,
    'colsample_bytree': 0.5,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'random_state': 42,
}
xgb_meta = XGBClassifier(**xgb_meta_settings)


In [11]:
# Base learners of the stack as (name, estimator) pairs, in insertion order.
base_learners = {
    'dt': best_dt_model,
    'catboost': best_catboost_model,
}
estimators = list(base_learners.items())


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve

# Optimized Logistic Regression Meta Model
# Logistic-regression meta-model that combines the base learners' outputs.
log_meta_settings = {
    'solver': 'liblinear',        # supports both the L1 and the L2 penalty
    'random_state': 42,
    'class_weight': 'balanced',   # compensate for class imbalance
    'max_iter': 1000,             # generous cap so the solver can converge
}
log_meta = LogisticRegression(**log_meta_settings)
def find_optimal_threshold(y_true, y_pred_proba):
    """Pick the ROC threshold with the largest gap between TPR and FPR.

    Args:
        y_true: True binary labels.
        y_pred_proba: Predicted scores/probabilities for the positive class.

    Returns:
        The entry of the thresholds returned by ``roc_curve`` at which
        ``tpr - fpr`` is largest (the first such entry on ties).
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y_true, y_pred_proba)
    rate_gap = true_pos_rate - false_pos_rate
    return cutoffs[np.argmax(rate_gap)]

# Stack the tuned base learners under the logistic-regression meta-model.
# passthrough=True hands the original features to the meta-model in addition
# to the base learners' predictions.
stacking_clf = StackingClassifier(
    estimators,
    final_estimator=log_meta,
    cv=5,
    passthrough=True,
)

# Tune only the meta-model: regularisation strength and penalty type.
param_grid = {
    'final_estimator__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'final_estimator__penalty': ['l1', 'l2'],
}
meta_search_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    stacking_clf,
    param_grid,
    scoring='roc_auc',
    cv=meta_search_cv,
    n_jobs=-1,
)

# Search on the full training data and keep the refitted winner.
grid_search.fit(X, y)
best_stacking_clf = grid_search.best_estimator_

print(
    f"Best parameters: {grid_search.best_params_}",
    f"Best ROC AUC: {grid_search.best_score_:.4f}",
    sep="\n",
)

# 5-fold evaluation of the tuned stacking configuration, accumulating the
# average of the per-fold test-set probabilities along the way.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []
accuracy_lst = []
test_predict = np.zeros(len(test_encoded))
optimal_thresholds = []

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"\n✅ Training Fold {fold + 1}...")
    # Fold-local names: the hold-out split (X_train / y_train) is left intact.
    X_fold_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_fold_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit an unfitted copy for this fold. Refitting `best_stacking_clf` itself
    # would overwrite the model that the grid search refitted on all of X, y,
    # leaving it trained on the last fold's subset only.
    fold_clf = clone(best_stacking_clf)
    fold_clf.fit(X_fold_train, y_fold_train)

    y_pred_proba = fold_clf.predict_proba(X_val)[:, 1]
    # NOTE: the threshold is chosen on the same validation fold that the
    # accuracy below is measured on, so that accuracy is optimistic.
    optimal_threshold = find_optimal_threshold(y_val, y_pred_proba)
    optimal_thresholds.append(optimal_threshold)
    y_pred_label = (y_pred_proba >= optimal_threshold).astype(int)

    auc = roc_auc_score(y_val, y_pred_proba)
    acc_score = accuracy_score(y_val, y_pred_label)

    print(f"Fold {fold + 1} AUC: {auc:.4f} | Accuracy: {acc_score:.4f} | Optimal Threshold: {optimal_threshold:.4f}")
    auc_scores.append(auc)
    accuracy_lst.append(acc_score)
    # Each fold contributes 1/n_splits of the averaged test probability.
    test_predict += fold_clf.predict_proba(test_encoded)[:, 1] / skf.n_splits

# Turn the averaged probabilities into labels with the mean per-fold threshold.
mean_optimal_threshold = np.mean(optimal_thresholds)
final_test_predictions = (test_predict >= mean_optimal_threshold).astype(int)

Best parameters: {'final_estimator__C': 10, 'final_estimator__penalty': 'l1'}
Best ROC AUC: 0.9675

✅ Training Fold 1...
Fold 1 AUC: 0.9692 | Accuracy: 0.9687 | Optimal Threshold: 0.2134

✅ Training Fold 2...
Fold 2 AUC: 0.9665 | Accuracy: 0.9676 | Optimal Threshold: 0.2073

✅ Training Fold 3...
Fold 3 AUC: 0.9614 | Accuracy: 0.9665 | Optimal Threshold: 0.4139

✅ Training Fold 4...
Fold 4 AUC: 0.9697 | Accuracy: 0.9698 | Optimal Threshold: 0.2346

✅ Training Fold 5...
Fold 5 AUC: 0.9707 | Accuracy: 0.9714 | Optimal Threshold: 0.4377


In [13]:
# Read the sample submission; it serves as the template for the output file.
sample_submission_path = '/kaggle/input/playground-series-s5e7/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)

In [14]:
# Build the submission from the fold-averaged, thresholded test predictions
# produced by the cross-validation loop (`final_test_predictions`). They were
# computed but never used; a fresh single-model `predict` call was submitted
# instead.
prediction = final_test_predictions
assert len(prediction) == len(test), "expected exactly one prediction per test row"

# Integer label -> class name. The order matches LabelEncoder's sorted classes
# for the two target values ('Extrovert' < 'Introvert').
mapping_dict = {0: 'Extrovert', 1: 'Introvert'}
prediction_string = np.array([mapping_dict[val] for val in prediction])

submission['id'] = test['id']
submission['Personality'] = prediction_string
submission.to_csv("submission.csv", index=False)

In [15]:
# Preview the first five rows of the submission as a sanity check.
submission.head(n=5)

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
