In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

In [2]:

# Load data
data = pd.read_csv('Final_GP_Dataset5.csv')


In [3]:
data.drop(['age', 'Work environment', 'Gender','Department','employment status','Years of experience'], axis=1, inplace=True)


In [4]:
# Replace special JSON characters in column names
data.columns = [col.replace('{', '').replace('}', '').replace('[', '').replace(']', '').replace(':', '').replace(',', '') for col in data.columns]


In [5]:
# Preprocess data
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le


In [6]:
data.dtypes

Unnamed 0                                                            int64
Do you often feel overwhelmed by your workload?                      int64
Is your workload generally manageable?                               int64
Do you frequently work beyond your scheduled hours?                  int64
Do you feel supported by your manager and team?                      int64
Are you satisfied with the resources and help available at work?     int64
Do you often feel isolated when dealing with work challenges?        int64
Do you frequently think about work in your personal time?            int64
Do you feel you have a healthy work-life balance?                    int64
Does your work often interfere with personal or family time?         int64
Do you regularly feel physically or emotionally exhausted?           int64
Do you experience symptoms like sleep issues anxiety or low mood?    int64
Do you regularly engage in physical or leisure activities?           int64
How do you feel?         

In [7]:
# Define features and target variable
X = data.drop('Do you regularly feel physically or emotionally exhausted?', axis=1)
y = data['Do you regularly feel physically or emotionally exhausted?']

In [8]:
# Function to get user input for each feature
def get_user_input():
    user_input = {}
    for col in X.columns:
        user_input[col] = input(f"Enter value for {col}: ")
    return pd.DataFrame([user_input])

In [9]:

# Get user input
user_data = get_user_input()

Enter value for Unnamed 0: 1
Enter value for Do you often feel overwhelmed by your workload?: 2
Enter value for Is your workload generally manageable?: 2
Enter value for Do you frequently work beyond your scheduled hours?: 2
Enter value for Do you feel supported by your manager and team?: 2
Enter value for Are you satisfied with the resources and help available at work?: 2
Enter value for Do you often feel isolated when dealing with work challenges?: 2
Enter value for Do you frequently think about work in your personal time?: 2
Enter value for Do you feel you have a healthy work-life balance?: 2
Enter value for Does your work often interfere with personal or family time?: 2
Enter value for Do you experience symptoms like sleep issues anxiety or low mood?: 2
Enter value for Do you regularly engage in physical or leisure activities?: 2
Enter value for How do you feel?: 2


In [10]:
# Exclude non-numeric columns from PCA
numeric_cols = X.select_dtypes(include=['number']).columns
X_numeric = X[numeric_cols]
user_data_numeric = user_data[numeric_cols]

In [11]:
# PCA for dimensionality reduction
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
user_data_pca = pca.transform(user_data)

In [12]:
# Base models
estimators = [
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('cat', CatBoostClassifier(verbose=0)),
    ('lgb', LGBMClassifier())
]

In [13]:
# Stacking model
stack_model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(random_state=42))
stack_model.fit(X_pca, y)
y_pred = stack_model.predict(user_data_pca)

print(f'Predicted class: {y_pred}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 1
[LightGBM] [Info] Start training from score -1.666008
[LightGBM] [Info] Start training from score -1.634756
[LightGBM] [Info] Start training from score -1.518684
[LightGBM] [Info] Start training from score -1.647659
[LightGBM] [Info] Start training from score -1.587187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 1
[LightGBM] [Info] Start training from score -1.664026
[LightGBM] [Info] Start training from sc