In [None]:
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Lift pandas' display limits in a single call so wide frames print in full:
#   display.max_columns  -> None: show every column
#   display.width        -> None: let pandas adjust the display width automatically
#   display.max_colwidth -> None: never truncate the content of a cell
pd.set_option(
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)

In [None]:
# Read the training and test splits from the parent directory, then preview
# the first five training rows as the cell's displayed output.
train_data = pd.read_csv(filepath_or_buffer="../train.csv")
test_data = pd.read_csv(filepath_or_buffer="../test.csv")
train_data.head(n=5)

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [None]:
# Pairwise Pearson correlations between the predictors.
# The id and target columns are removed when present; the remaining
# non-numeric columns are one-hot encoded with their first level dropped.
train_corr = train_data.drop(['id', 'Personality'], axis='columns', errors='ignore')
train_corr_encoded = train_corr.pipe(pd.get_dummies, drop_first=True)
correlation_matrix = train_corr_encoded.corr(method='pearson')
correlation_matrix

Unnamed: 0,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Stage_fear_Yes,Drained_after_socializing_Yes
Time_spent_Alone,1.0,-0.628806,-0.640884,-0.598014,-0.611544,0.740421,0.743776
Social_event_attendance,-0.628806,1.0,0.585224,0.566675,0.566679,-0.675029,-0.677429
Going_outside,-0.640884,0.585224,1.0,0.549864,0.579305,-0.678779,-0.683489
Friends_circle_size,-0.598014,0.566675,0.549864,1.0,0.522272,-0.635122,-0.639369
Post_frequency,-0.611544,0.566679,0.579305,0.522272,1.0,-0.645127,-0.65039
Stage_fear_Yes,0.740421,-0.675029,-0.678779,-0.635122,-0.645127,1.0,0.775335
Drained_after_socializing_Yes,0.743776,-0.677429,-0.683489,-0.639369,-0.65039,0.775335,1.0


In [None]:
# Feature engineering: one-hot encode the predictors, build pairwise
# interaction terms inside two groups of columns, and compress each group
# (raw columns + interactions) into a single principal component.
from itertools import combinations

# Names of the encoded predictor columns. Not referenced again in this cell;
# kept because other cells may rely on it.
selected_features = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
    'Stage_fear_Yes',
    'Drained_after_socializing_Yes'
]

# Encode every column except the target, then re-attach the target unchanged
# so it is never turned into dummy columns.
personality = train_data['Personality']
df = pd.get_dummies(train_data.drop(columns='Personality'), drop_first=True)
df['Personality'] = personality

# Group 1 features and their pairwise interactions.
group1_features = ['Time_spent_Alone', 'Stage_fear_Yes', 'Drained_after_socializing_Yes']

df_group1 = df[group1_features].copy()
df_group1['interaction_Alone_StageFear'] = df['Time_spent_Alone'] * df['Stage_fear_Yes']
df_group1['interaction_Alone_Drained'] = df['Time_spent_Alone'] * df['Drained_after_socializing_Yes']
df_group1['interaction_StageFear_Drained'] = df['Stage_fear_Yes'] * df['Drained_after_socializing_Yes']

# Group 2 features and every pairwise product between them.
group2_features = ['Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']

df_group2 = df[group2_features].copy()
for f1, f2 in combinations(group2_features, 2):
    col_name = f'interaction_{f1}_{f2}'
    df_group2[col_name] = df[f1] * df[f2]

# Mean-impute after building the interactions: a product with a missing
# operand is NaN and is therefore replaced by the mean of that interaction column.
df_group1_filled = df_group1.fillna(df_group1.mean())
df_group2_filled = df_group2.fillna(df_group2.mean())

# Standardise each group so no single column dominates its principal component.
scaler1 = StandardScaler()
X1_scaled = scaler1.fit_transform(df_group1_filled)

scaler2 = StandardScaler()
X2_scaled = scaler2.fit_transform(df_group2_filled)

# Reduce each group to one component. random_state pins the result in case the
# automatically selected SVD solver is the randomized one; [:, 0] flattens the
# (n_samples, 1) output into a plain column.
pca1 = PCA(n_components=1, random_state=42)
df['PCA_group1'] = pca1.fit_transform(X1_scaled)[:, 0]

pca2 = PCA(n_components=1, random_state=42)
df['PCA_group2'] = pca2.fit_transform(X2_scaled)[:, 0]

# Sum of the two group components. The previous version added PCA_group1 to
# itself, which only doubled that column and ignored group 2 entirely.
df['pca_sum'] = df["PCA_group1"] + df["PCA_group2"]


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

def _is_id_column(column_name):
    """Return True when ``column_name`` names an identifier column.

    A column counts as an identifier when its lower-cased name is exactly
    ``id`` or carries ``id`` as an underscore-delimited prefix or suffix
    (``id_*`` / ``*_id``). A plain substring test is deliberately avoided:
    it would also match feature names such as ``Going_outside``.
    """
    lowered = str(column_name).lower()
    return lowered == 'id' or lowered.startswith('id_') or lowered.endswith('_id')


def train_stacked_model_with_kfold(df: pd.DataFrame, n_splits: int = 5) -> tuple:
    """Cross-validate an XGBoost + random-forest stack and collect out-of-fold predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Personality`` target column; every other column that
        is not an identifier column is used as a feature. The input frame is
        not modified (the function works on a copy).
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    tuple
        ``(model, df)`` in that order:

        * ``model`` - the ``StackingClassifier`` fitted during the final fold.
          It has only seen that fold's training rows, not the full data set.
        * ``df`` - the working copy without identifier columns, with
          ``Personality`` as the original labels and a new
          ``Predicted_Personality`` column holding each row's out-of-fold
          prediction.

    Raises
    ------
    KeyError
        If ``df`` has no ``Personality`` column.
    """
    df = df.copy()

    # 1. Drop identifier columns. Matching is exact / underscore-delimited so
    #    that features merely containing the letters "id" are kept.
    df = df.drop(columns=[col for col in df.columns if _is_id_column(col)], errors='ignore')

    # 2. Encode target
    le = LabelEncoder()
    df['Personality'] = le.fit_transform(df['Personality'])  # e.g., 0: Extrovert, 1: Introvert

    # 3. Features and target. Missing values are imputed per fold (below)
    #    rather than here, so held-out rows never influence the fill values.
    X = df.drop(columns=['Personality'])
    y = df['Personality']

    # 4. Prepare stratified folds
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_preds = np.zeros(len(df), dtype=int)

    print("=== Cross-Validated Performance ===")
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Mean-impute both partitions with statistics from the training rows only.
        fold_means = X_train.mean(numeric_only=True)
        X_train = X_train.fillna(fold_means)
        X_test = X_test.fillna(fold_means)

        # 5. Base models. `use_label_encoder` is no longer passed: current
        #    XGBoost ignores it and warns on every fit.
        xgb = XGBClassifier(eval_metric='logloss', random_state=42)
        rf = RandomForestClassifier(random_state=42)

        # 6. Stacking model; passthrough=True also feeds the raw features to
        #    the final logistic regression alongside the base-model outputs.
        model = StackingClassifier(
            estimators=[
                ('xgb', xgb),
                ('rf', rf)
            ],
            final_estimator=LogisticRegression(),
            passthrough=True
        )

        # 7. Train and predict; store predictions at the held-out positions.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        all_preds[test_idx] = y_pred

        print(f"\nFold {fold} Classification Report:")
        print(classification_report(y_test, y_pred))

    # 8. Add predictions and restore the original string labels.
    df['Predicted_Personality'] = le.inverse_transform(all_preds)
    df['Personality'] = le.inverse_transform(df['Personality'])

    print("\n=== Sample Predictions ===")
    print(df[['Personality', 'Predicted_Personality']].head())

    # 9. Return last trained model for external prediction use
    return model, df


In [None]:
# train_stacked_model_with_kfold returns (model, frame) in that order, so the
# fitted classifier goes into `model` and the frame with out-of-fold
# predictions into `result`.
# To inspect the misclassified rows afterwards:
#   result[result["Personality"] != result["Predicted_Personality"]]
model, result = train_stacked_model_with_kfold(df)

=== Cross-Validated Performance ===


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Fold 1 Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2740
           1       0.94      0.94      0.94       965

    accuracy                           0.97      3705
   macro avg       0.96      0.96      0.96      3705
weighted avg       0.97      0.97      0.97      3705



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Fold 2 Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2740
           1       0.94      0.93      0.94       965

    accuracy                           0.97      3705
   macro avg       0.96      0.95      0.96      3705
weighted avg       0.97      0.97      0.97      3705



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Fold 3 Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2740
           1       0.94      0.93      0.93       965

    accuracy                           0.96      3705
   macro avg       0.96      0.95      0.95      3705
weighted avg       0.96      0.96      0.96      3705



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Fold 4 Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2740
           1       0.95      0.93      0.94       965

    accuracy                           0.97      3705
   macro avg       0.96      0.96      0.96      3705
weighted avg       0.97      0.97      0.97      3705



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Fold 5 Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2739
           1       0.94      0.95      0.95       965

    accuracy                           0.97      3704
   macro avg       0.96      0.96      0.96      3704
weighted avg       0.97      0.97      0.97      3704


=== Sample Predictions ===
  Personality Predicted_Personality
0   Extrovert             Extrovert
1   Extrovert             Extrovert
2   Introvert             Introvert
3   Extrovert             Extrovert
4   Extrovert             Extrovert
