In [100]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


In [101]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,roc_auc_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier
train=pd.read_csv("/kaggle/input/playground-series-s5e7/train.csv")
test=pd.read_csv("/kaggle/input/playground-series-s5e7/test.csv")

In [102]:
numeric_cols_train = train.select_dtypes(include=np.number).columns
medians_train = train[numeric_cols_train].median()
train[numeric_cols_train] = train[numeric_cols_train].fillna(medians_train)
numeric_cols_test = test.select_dtypes(include=np.number).columns
medians_test = test[numeric_cols_test].median()
test[numeric_cols_test] = test[numeric_cols_test].fillna(medians_test)
le=LabelEncoder()
train['Stage_fear']=le.fit_transform(train['Stage_fear'])
train['Personality']=le.fit_transform(train['Personality'])
train['Drained_after_socializing']=le.fit_transform(train['Drained_after_socializing'])
test['Stage_fear']=le.fit_transform(test['Stage_fear'])
test['Drained_after_socializing']=le.fit_transform(test['Drained_after_socializing'])

In [103]:
y=train['Personality']
X=train.drop('Personality',axis=1)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.24)

In [104]:
params_catboost = {
    "iterations": [100,150],
    "learning_rate": [0.001, 0.05, 0.1],
    "depth": [ 6, 8, 10],
    "l2_leaf_reg": [1, 3, 5],
    "border_count": [16,32]
}
catboost_model = CatBoostClassifier(verbose=0)  
grid_catboost = GridSearchCV(estimator=catboost_model, param_grid=params_catboost, scoring='accuracy', cv=5, n_jobs=-1)
grid_catboost.fit(X_train, y_train)

In [105]:
best_catboost_model = grid_catboost.best_estimator_
best_catboost_model.fit(X_train,y_train)
y_test_pred_cat = best_catboost_model.predict(X_test)
test_accuracy_cat = accuracy_score(y_test, y_test_pred_cat)
print(f"CatBoost Test Accuracy: {test_accuracy_cat:.2f}")

CatBoost Test Accuracy: 0.97


In [106]:
params_decision_tree = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 10,  None],
    "min_samples_split": [2, 5, 7],
    "min_samples_leaf": [1, 2, 3],
    "max_features": [None, "sqrt", "log2"]
}
decision_tree_model = DecisionTreeClassifier()
random_decision_tree = RandomizedSearchCV(
    estimator=decision_tree_model, 
    param_distributions=params_decision_tree, 
    n_iter=30, 
    scoring='accuracy', 
    cv=5, 
    random_state=42, 
    n_jobs=-1
)
random_decision_tree.fit(X_train, y_train)
print("DecisionTree Best Parameters:", random_decision_tree.best_params_)
y_test_pred_decision_tree = random_decision_tree.best_estimator_.predict(X_test)
test_accuracy_decision_tree = accuracy_score(y_test, y_test_pred_decision_tree)
print(f"DecisionTree Test Accuracy: {test_accuracy_decision_tree:.2f}")

DecisionTree Best Parameters: {'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_depth': 5, 'criterion': 'entropy'}
DecisionTree Test Accuracy: 0.96


In [107]:
best_dt_model = random_decision_tree.best_estimator_
best_dt_model.fit(X_train, y_train)
y_test_pred_dt = best_dt_model.predict(X_test)
test_accuracy_dt = accuracy_score(y_test, y_test_pred_dt)
print(f"DecisionTree Test Accuracy: {test_accuracy_dt:.2f}")

DecisionTree Test Accuracy: 0.97


In [108]:
from sklearn.preprocessing import LabelEncoder

test_encoded = test.copy()
for col in test_encoded.select_dtypes(include=['object']).columns:
    test_encoded[col] = LabelEncoder().fit_transform(test_encoded[col])
test.fillna(0,inplace=True)    



In [109]:
from xgboost import XGBClassifier
xgb_meta = XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)


In [120]:
estimators = [
    ('dt', best_dt_model),               
    ('catboost', best_catboost_model)    
]


In [116]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=xgb_meta,
    passthrough=True,
    cv=5
)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []
accuracy_lst = []
test_predict = np.zeros(len(test))  
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"\n✅ Training Fold {fold + 1}...")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    stacking_clf.fit(X_train, y_train)
    y_pred_proba = stacking_clf.predict_proba(X_val)[:, 1]
    y_pred_label = (y_pred_proba >= 0.5).astype(int)
    auc = roc_auc_score(y_val, y_pred_proba)
    acc_score = accuracy_score(y_val, y_pred_label)
    print(f"Fold {fold + 1} AUC: {auc:.4f} | Accuracy: {acc_score:.4f}")
    auc_scores.append(auc)
    accuracy_lst.append(acc_score)
    test_predict += stacking_clf.predict_proba(test)[:, 1] / skf.n_splits
    


✅ Training Fold 1...
Fold 1 AUC: 0.9731 | Accuracy: 0.9700

✅ Training Fold 2...
Fold 2 AUC: 0.9694 | Accuracy: 0.9676

✅ Training Fold 3...
Fold 3 AUC: 0.9629 | Accuracy: 0.9660

✅ Training Fold 4...
Fold 4 AUC: 0.9694 | Accuracy: 0.9700

✅ Training Fold 5...
Fold 5 AUC: 0.9715 | Accuracy: 0.9717


In [117]:
submission=pd.read_csv('/kaggle/input/playground-series-s5e7/sample_submission.csv')

In [118]:
prediction=stacking_clf.predict(test)
prediction

array([0, 1, 0, ..., 0, 0, 1])

In [121]:
prediction=stacking_clf.predict(test)
mapping_dict = {0: 'Extrovert', 1: 'Introvert'}
prediction_string=prediction_string = np.array([mapping_dict[val] for val in prediction])
prediction_string
submission['id']=test['id']
submission['Personality']=prediction_string
submission.to_csv("submission.csv", index=False)

In [122]:
submission.head()

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
