# Stacking with Xgboost, Catboost and LGB with Random forest as meta model without PCA

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data
data = pd.read_csv('Final_GP_Dataset5.csv')

# Replace special JSON characters in column names
data.columns = [col.replace('{', '').replace('}', '').replace('[', '').replace(']', '').replace(':', '').replace(',', '') for col in data.columns]

# Preprocess data
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Define features and target variable
X = data.drop('Do you regularly feel physically or emotionally exhausted?', axis=1)
y = data['Do you regularly feel physically or emotionally exhausted?']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base models
estimators = [
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('cat', CatBoostClassifier(verbose=0)),
    ('lgb', LGBMClassifier())
]

# Stacking model without PCA
stack_model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(random_state=42))
stack_model.fit(X_train, y_train)
y_pred = stack_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Accuracy without PCA: {accuracy}')
print(f'Precision without PCA: {precision}')
print(f'Recall without PCA: {recall}')
print(f'F1 Score without PCA: {f1}')


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 19
[LightGBM] [Info] Start training from score -1.707878
[LightGBM] [Info] Start training from score -1.618857
[LightGBM] [Info] Start training from score -1.551759
[LightGBM] [Info] Start training from score -1.593934
[LightGBM] [Info] Start training from score -1.581701
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 1280, number of used features: 19
[LightGBM] [Info] Start training from score -1.707878
[LightGBM] [Info] Start training from 

# Stacking with Xgboost, Catboost and LGB with Random forest as meta model with PCA

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data
data = pd.read_csv('Final_GP_Dataset5.csv')

# Replace special JSON characters in column names
data.columns = [col.replace('{', '').replace('}', '').replace('[', '').replace(']', '').replace(':', '').replace(',', '') for col in data.columns]

# Preprocess data
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Define features and target variable
X = data.drop('Do you regularly feel physically or emotionally exhausted?', axis=1)
y = data['Do you regularly feel physically or emotionally exhausted?']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base models
estimators = [
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('cat', CatBoostClassifier(verbose=0)),
    ('lgb', LGBMClassifier())
]

# Stacking model without PCA
stack_model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(random_state=42))
stack_model.fit(X_train, y_train)
y_pred = stack_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Accuracy without PCA: {accuracy}')
print(f'Precision without PCA: {precision}')
print(f'Recall without PCA: {recall}')
print(f'F1 Score without PCA: {f1}')


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 19
[LightGBM] [Info] Start training from score -1.707878
[LightGBM] [Info] Start training from score -1.618857
[LightGBM] [Info] Start training from score -1.551759
[LightGBM] [Info] Start training from score -1.593934
[LightGBM] [Info] Start training from score -1.581701
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003631 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 1280, number of used features: 19
[LightGBM] [Info] Start training from score -1.707878
[LightGBM] [Info] Start training from 

 # Stacking with Xgboost, Catboost and LGB with Bagging with Support Vector machine as meta model without PCA

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlxtend.classifier import StackingClassifier as MLXStackingClassifier

# Load and preprocess data
data = pd.read_csv('Final_GP_Dataset5.csv')
data.columns = [col.replace('{', '').replace('}', '').replace('[', '').replace(']', '').replace(':', '').replace(',', '') for col in data.columns]

# Label encoding for all categorical features
for column in data.select_dtypes(include=['object']).columns:
    data[column] = LabelEncoder().fit_transform(data[column])

# Adjusting label encoding for the target to start from 0
target_col = 'Do you regularly feel physically or emotionally exhausted?'
data[target_col] = LabelEncoder().fit_transform(data[target_col])

# Define features and target
X = data.drop(target_col, axis=1)
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base models
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
cat = CatBoostClassifier(verbose=0)
lgb = LGBMClassifier()

# Meta-model
svm = SVC()

# Stacking model using MLXtend
stack = MLXStackingClassifier(classifiers=[xgb, cat, lgb], meta_classifier=svm, use_probas=False, average_probas=False)
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 19
[LightGBM] [Info] Start training from score -1.707878
[LightGBM] [Info] Start training from score -1.618857
[LightGBM] [Info] Start training from score -1.551759
[LightGBM] [Info] Start training from score -1.593934
[LightGBM] [Info] Start training from score -1.581701
Accuracy: 0.2525
Precision: 0.28438980737718467
Recall: 0.2550221392002214
F1 Score: 0.24462004809895013


#  Stacking with Xgboost, Catboost and LGB with Bagging with Support Vector machine as meta model with PCA

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlxtend.classifier import StackingClassifier as MLXStackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Load and preprocess data
data = pd.read_csv('Final_GP_Dataset5.csv')
data.columns = [col.replace('{', '').replace('}', '').replace('[', '').replace(']', '').replace(':', '').replace(',', '') for col in data.columns]

# Label encoding for all categorical features
for column in data.select_dtypes(include=['object']).columns:
    data[column] = LabelEncoder().fit_transform(data[column])

# Adjusting label encoding for the target to start from 0
target_col = 'Do you regularly feel physically or emotionally exhausted?'
data[target_col] = LabelEncoder().fit_transform(data[target_col])

# Define features and target
X = data.drop(target_col, axis=1)
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# PCA for dimensionality reduction
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Base models
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
cat = CatBoostClassifier(verbose=0)
lgb = LGBMClassifier()

# Meta-model
svm = SVC()

# Stacking model using MLXtend with PCA
stack_pca = MLXStackingClassifier(classifiers=[xgb, cat, lgb], meta_classifier=svm, use_probas=False, average_probas=False)
stack_pca.fit(X_train_pca, y_train)
y_pred_pca = stack_pca.predict(X_test_pca)

# Calculate metrics
accuracy_pca = accuracy_score(y_test, y_pred_pca)
precision_pca = precision_score(y_test, y_pred_pca, average='macro')
recall_pca = recall_score(y_test, y_pred_pca, average='macro')
f1_pca = f1_score(y_test, y_pred_pca, average='macro')

print(f'Accuracy with PCA: {accuracy_pca}')
print(f'Precision with PCA: {precision_pca}')
print(f'Recall with PCA: {recall_pca}')
print(f'F1 Score with PCA: {f1_pca}')


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 1
[LightGBM] [Info] Start training from score -1.707878
[LightGBM] [Info] Start training from score -1.618857
[LightGBM] [Info] Start training from score -1.551759
[LightGBM] [Info] Start training from score -1.593934
[LightGBM] [Info] Start training from score -1.581701
Accuracy with PCA: 0.225
Precision with PCA: 0.22648771304168172
Recall with PCA: 0.2318022692680227
F1 Score with PCA: 0.22560674558015864
