# Feature Engineering

## 1. Import Packages and Data Loading

In [458]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")
pd.pandas.set_option("display.max_columns", None)

In [459]:
df = pd.read_csv(r"marital_satisfaction_data_cleaned.csv")
df.head()

Unnamed: 0,country,sex,age,marriage_duration,num_children,num_brought_up_children,education,physio,religion,religiosity,safety,love1,happy,esteem1,love2,love3,esteem2,sact,love4,love5,ms,globe1,globe2,globe3,globe4,globe5,globe6,globe7,globe8
0,Brazil,1.0,21.0,2.0,0,0,5,3,1,4,4,5,5,5,5,4,4,5,5,5,7,7,7,7,7,7,7,7,7
1,Brazil,1.0,29.0,3.0,1,0,5,3,1,6,5,5,5,5,5,5,5,5,4,5,6,6,7,7,7,7,7,7,7
2,Brazil,1.0,30.0,7.0,0,0,5,3,1,4,4,5,5,4,5,5,5,5,3,5,7,6,7,6,7,7,7,7,7
3,Brazil,1.0,30.0,7.0,1,1,5,3,1,6,7,5,5,5,5,5,5,4,4,5,6,5,7,7,6,7,7,7,7
4,Brazil,1.0,28.0,9.0,0,0,4,2,1,5,7,5,4,5,5,5,5,5,4,5,6,5,6,5,6,7,7,7,7


In [460]:
df_feature = df.copy()

## 2. Feature Engineering

### 2.1 Feature Extraction

**Group country into continent**

In [461]:
df_feature["country"].unique()

array(['Brazil ', 'Bulgaria', 'Canada', 'China', 'Croatia', 'Estonia',
       'Germany', 'Ghana', 'Greece', 'HongKong', 'Hungary', 'India',
       'Indonesia', 'Iran ', 'Italy', 'Kazakhstan', 'Kenia', 'Malaysia',
       'Mexico', 'Nigeria ', 'Pakistan', 'Poland ', 'Portugal', 'Romania',
       'Russia', 'Saudi Arabia', 'Slovakia', 'South Korea', 'Spain ',
       'Switzerland', 'Turkey ', 'U.K.', 'Uganda'], dtype=object)

In [462]:
# Mapping dictionary
country_to_continent = {
    'Brazil': 'South America',
    'Bulgaria': 'Europe',
    'Canada': 'North America',
    'China': 'Asia',
    'Croatia': 'Europe',
    'Estonia': 'Europe',
    'Germany': 'Europe',
    'Ghana': 'Africa',
    'Greece': 'Europe',
    'HongKong': 'Asia',
    'Hungary': 'Europe',
    'India': 'Asia',
    'Indonesia': 'Asia',
    'Iran': 'Asia',
    'Italy': 'Europe',
    'Kazakhstan': 'Asia',
    'Kenia': 'Africa',
    'Malaysia': 'Asia',
    'Mexico': 'North America',
    'Nigeria': 'Africa',
    'Pakistan': 'Asia',
    'Poland': 'Europe',
    'Portugal': 'Europe',
    'Romania': 'Europe',
    'Russia': 'Europe',
    'Saudi Arabia': 'Asia',
    'Slovakia': 'Europe',
    'South Korea': 'Asia',
    'Spain': 'Europe',
    'Switzerland': 'Europe',
    'Turkey': 'Asia',
    'U.K.': 'Europe',
    'Uganda': 'Africa',
}

In [463]:
df_feature["country"] = df_feature["country"].str.strip()
df_feature["continent"] = df_feature["country"].map(country_to_continent)
df_feature = df_feature.drop(columns=["country", "continent", "religion"])
df_feature.head()

Unnamed: 0,sex,age,marriage_duration,num_children,num_brought_up_children,education,physio,religiosity,safety,love1,happy,esteem1,love2,love3,esteem2,sact,love4,love5,ms,globe1,globe2,globe3,globe4,globe5,globe6,globe7,globe8
0,1.0,21.0,2.0,0,0,5,3,4,4,5,5,5,5,4,4,5,5,5,7,7,7,7,7,7,7,7,7
1,1.0,29.0,3.0,1,0,5,3,6,5,5,5,5,5,5,5,5,4,5,6,6,7,7,7,7,7,7,7
2,1.0,30.0,7.0,0,0,5,3,4,4,5,5,4,5,5,5,5,3,5,7,6,7,6,7,7,7,7,7
3,1.0,30.0,7.0,1,1,5,3,6,7,5,5,5,5,5,5,4,4,5,6,5,7,7,6,7,7,7,7
4,1.0,28.0,9.0,0,0,4,2,5,7,5,4,5,5,5,5,5,4,5,6,5,6,5,6,7,7,7,7


### 2.2 Drop Redundant Features

In [464]:
# Your grouped columns
groups = {
    "love": ['love1', 'love2', 'love3', 'love4', 'love5'],
    "esteem": ['esteem1', 'esteem2'],
    "globe": ['globe1', 'globe2', 'globe3', 'globe4', 'globe5', 'globe6', 'globe7', 'globe8']
}

to_drop = []
selected = {}

# For each group, compute Spearman correlation with 'ms'
for name, cols in groups.items():
    corrs = {}

    for col in cols:
        # Drop NaNs for pairwise correlation
        valid_data = df_feature[[col, 'ms']]
        corr = valid_data[col].corr(valid_data['ms'], method='spearman')
        corrs[col] = abs(corr)  # use absolute correlation

    # Sort and select best
    sorted_corrs = sorted(corrs.items(), key=lambda x: x[1], reverse=True)
    best = sorted_corrs[0][0]
    print(f"Best variable for {name} (correlation with ms): {best} → {sorted_corrs[0][1]:.4f}")
    
    selected[name] = best

    # Drop all others
    to_drop.extend([col for col in cols if col != best])

# Drop other columns
df_feature.drop(columns=to_drop, inplace=True)

# Optional: Rename selected columns to group name
df_feature.rename(columns={v: k for k, v in selected.items()}, inplace=True)

df_feature.head()


Best variable for love (correlation with ms): love4 → 0.5133
Best variable for esteem (correlation with ms): esteem1 → 0.4944
Best variable for globe (correlation with ms): globe1 → 0.1117


Unnamed: 0,sex,age,marriage_duration,num_children,num_brought_up_children,education,physio,religiosity,safety,happy,esteem,sact,love,ms,globe
0,1.0,21.0,2.0,0,0,5,3,4,4,5,5,5,5,7,7
1,1.0,29.0,3.0,1,0,5,3,6,5,5,5,5,4,6,6
2,1.0,30.0,7.0,0,0,5,3,4,4,5,4,5,3,7,6
3,1.0,30.0,7.0,1,1,5,3,6,7,5,5,4,4,6,5
4,1.0,28.0,9.0,0,0,4,2,5,7,4,5,5,4,6,5


### 2.2 Column Transformer

In [465]:
X = df_feature.drop("ms", axis=1)
y = df_feature["ms"]

In [466]:
# Create Column Transformer with 3 types of transformers
ordinal_columns = ["education", "religiosity", "physio", "safety", "happy", "sact", "love", "esteem", "globe"]

num_features = ['age']
# label_columns = ['country','continent', 'religion']
transform_columns= ['num_children','num_brought_up_children']

from sklearn.preprocessing import OneHotEncoder, StandardScaler,OrdinalEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

numeric_transformer = StandardScaler()

transform_pipe = Pipeline(steps=[
    ('power', PowerTransformer(method='yeo-johnson'))
])

ordinal_transformer = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Combine into ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('power', transform_pipe, transform_columns),
        ('ordinal', ordinal_transformer, ordinal_columns),
        # ('labels', ordinal_transformer, label_columns)  # treat nominal as ordinal to keep RFE clean
    ]
)

In [467]:
X = preprocessor.fit_transform(X)

In [468]:
all_columns = num_features + transform_columns + ordinal_columns
# all_columns = num_features + transform_columns + ordinal_columns + label_columns

X = pd.DataFrame(X, columns=all_columns)
X = X.drop(columns=["happy"])

for col in X.columns:
    print(col, ":")
    print(X[col].unique())

age :
[-1.72207582 -1.02311165 -0.93574113 -1.11048217 -0.14940645 -0.06203593
  0.20007564 -0.58625905  0.98641032 -0.67362957 -0.76100009  0.63692824
  0.54955772 -1.45996426 -0.32414749 -0.84837061 -1.89681686 -1.28522322
  1.68537449  0.11270512 -1.37259374 -1.1978527  -0.49888853  1.16115136
 -0.23677697 -1.54733478  1.42326293 -1.80944634  1.59800397  1.07378084
 -1.6347053   0.0253346  -0.41151801  0.37481668  0.4621872   1.24852189
  1.51063345  0.72429876  1.94748605  0.28744616  0.81166928  2.20959761
  2.03485657  2.12222709  2.38433866  1.77274501  0.8990398   1.86011553
  2.5590797   1.33589241  2.29696813  3.69489647  2.47170918  3.60752595
  2.90856178  3.17067334  3.4327849   3.25804386  2.9959323   3.34541438
  2.73382074  2.64645022  2.82119126  4.13174907 -1.98418738  3.08330282
  3.95700803  4.04437855  3.86963751  3.52015542  3.78226699 -2.0715579
 -0.19309171]
num_children :
[-1.74321443 -0.49712714  0.34679309  1.00378045  2.43670241  1.54948932
  3.15365802  2.0

In [469]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = pd.DataFrame(y, columns=["ms"])

### 2.3 Data Splitting

In [470]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [471]:
X_train

Unnamed: 0,age,num_children,num_brought_up_children,education,religiosity,physio,safety,sact,love,esteem,globe
1095,0.636928,0.346793,0.625567,2.0,3.0,2.0,2.0,4.0,4.0,4.0,3.0
3882,-1.197853,-1.743214,-1.389057,4.0,2.0,0.0,1.0,4.0,4.0,4.0,6.0
3662,0.724299,3.153658,-1.389057,3.0,5.0,2.0,2.0,4.0,4.0,4.0,5.0
2899,-0.761000,-1.743214,-1.389057,3.0,3.0,3.0,4.0,2.0,3.0,2.0,6.0
1630,-0.498889,1.549489,1.573808,3.0,6.0,2.0,4.0,4.0,2.0,4.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...
3772,0.200076,0.346793,0.625567,2.0,3.0,2.0,4.0,2.0,2.0,2.0,6.0
5191,1.510633,0.346793,-1.389057,2.0,3.0,3.0,3.0,4.0,4.0,4.0,5.0
5226,1.073781,-0.497127,-1.389057,2.0,2.0,2.0,1.0,3.0,3.0,3.0,6.0
5390,-0.062036,1.003780,1.158685,4.0,2.0,2.0,1.0,4.0,3.0,4.0,4.0


In [472]:
len(X_train.columns)

11

### 2.4 RFE

https://www.analyticsvidhya.com/blog/2023/11/train-test-validation-split/

In [473]:
# # Prepare a result list
# results = []

# # Add all features first (without RFE)
# clf = RandomForestClassifier(random_state=42)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_val)
# acc = accuracy_score(y_val, y_pred)
# results.append({
#     "Top-K Features": "All ({})".format(X.shape[1]),
#     "Accuracy": acc,
#     "Features": list(X.columns)
# })

# # RFE for top-k features
# for k in [15, 10, 8, 6, 5, 3]:
#     if k > X.shape[1]:  # Skip if k exceeds the number of features
#         continue

#     selector = RFE(RandomForestClassifier(random_state=42), n_features_to_select=k)
#     selector.fit(X, y)

#     selected_features = X.columns[selector.support_]

#     X_train_k, X_val_k, y_train_k, y_val_k = train_test_split(
#         X[selected_features], y, test_size=0.2, random_state=42
#     )
#     clf_k = RandomForestClassifier(random_state=42)
#     clf_k.fit(X_train_k, y_train_k)
#     y_pred_k = clf_k.predict(X_val_k)
#     acc_k = accuracy_score(y_val_k, y_pred_k)

#     results.append({
#         "Top-K Features": k,
#         "Accuracy": acc_k,
#         "Features": list(selected_features)
#     })

# # Sort results by accuracy descending
# results = sorted(results, key=lambda x: x["Accuracy"], reverse=True)

# # Print out results line by line
# for res in results:
#     print(f"Top-K Features: {res['Top-K Features']}")
#     print(f"Accuracy: {res['Accuracy']:.4f}")
#     print(f"Features: {res['Features']}")
#     print("-" * 50)

**Select best reduced feature k:**

Top-K Features: 10

Accuracy: 0.4737

Features: ['age', 'num_children', 'num_brought_up_children', 'education', 'religiosity', 'physio', 'safety', 'sact', 'love', 'globe']

--------------------------------------------------

Top-K Features: All (11)

Accuracy: 0.4678

Features: ['age', 'num_children', 'num_brought_up_children', 'education', 'religiosity', 'physio', 'safety', 'sact', 'love', 'esteem', 'globe']

--------------------------------------------------

Top-K Features: 8

Accuracy: 0.4510

Features: ['age', 'num_children', 'num_brought_up_children', 'religiosity', 'safety', 'sact', 'love', 'globe']

--------------------------------------------------

Top-K Features: 6

Accuracy: 0.4091

Features: ['age', 'num_brought_up_children', 'religiosity', 'safety', 'love', 'globe']

--------------------------------------------------

Top-K Features: 5

Accuracy: 0.3835

Features: ['age', 'num_brought_up_children', 'religiosity', 'safety', 'love']

--------------------------------------------------

Top-K Features: 3

Accuracy: 0.3331

Features: ['age', 'num_brought_up_children', 'safety']

--------------------------------------------------


In [474]:
reduced_features = ['age', 'num_brought_up_children', 'religiosity', 'safety', 'love', 'globe']
# reduced_features = X_train.columns
X_train = X_train[reduced_features]
X_val = X_val[reduced_features]
X_test = X_test[reduced_features]

In [475]:
X_train.head()

Unnamed: 0,age,num_brought_up_children,religiosity,safety,love,globe
1095,0.636928,0.625567,3.0,2.0,4.0,3.0
3882,-1.197853,-1.389057,2.0,1.0,4.0,6.0
3662,0.724299,-1.389057,5.0,2.0,4.0,5.0
2899,-0.761,-1.389057,3.0,4.0,3.0,6.0
1630,-0.498889,1.573808,6.0,4.0,2.0,6.0


In [426]:
# ohe_columns = ['country', 'religion']

In [476]:
X_val.head()

Unnamed: 0,age,num_brought_up_children,religiosity,safety,love,globe
1803,0.89904,1.158685,6.0,1.0,4.0,5.0
5253,-0.67363,-0.121882,3.0,2.0,4.0,6.0
472,-0.149406,0.625567,3.0,2.0,3.0,6.0
5931,-0.498889,0.625567,0.0,0.0,3.0,5.0
3950,0.724299,0.625567,2.0,5.0,1.0,5.0


### 2.5 SMOTE

In [477]:
# 1. Train without SMOTE
clf_no_smote = RandomForestClassifier(random_state=42)
clf_no_smote.fit(X_train, y_train)
y_pred_no_smote = clf_no_smote.predict(X_val)
acc_no_smote = accuracy_score(y_val, y_pred_no_smote)

print(f"Accuracy without SMOTE: {acc_no_smote:.4f}")

# 2. Train with SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

clf_smote = RandomForestClassifier(random_state=42)
clf_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = clf_smote.predict(X_val)
acc_smote = accuracy_score(y_val, y_pred_smote)

print(f"Accuracy with SMOTE: {acc_smote:.4f}")

Accuracy without SMOTE: 0.4167
Accuracy with SMOTE: 0.4062


**Pick wihout smote**

### 2.6 Feature Scalling

In [478]:
X_train.columns

Index(['age', 'num_brought_up_children', 'religiosity', 'safety', 'love',
       'globe'],
      dtype='object')

In [479]:
# --- Step 1: Define binning schemes (hierarchical, from 0-based) ---
binning_steps = {
    '7-class': {i: i for i in range(7)},  # raw
    '5-class': {0:0, 1:1, 2:1, 3:2, 4:3, 5:3, 6:4},
    '3-class': {0:0, 1:0, 2:1, 3:2, 4:2, 5:2, 6:2},  # extended mapping to avoid unmapped error
    'binary':  {0:0, 1:0, 2:1, 3:1, 4:1, 5:1, 6:1}   # extended mapping to avoid unmapped error
}

# --- Step 2: Evaluation function ---
def evaluate_model(X_train, y_train, X_val, y_val, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred)

# --- Step 3: Evaluate sequentially and update labels ---
results = []
y_train_versions = {}
y_val_versions = {}
y_test_versions = {}

# Start with original y_train, y_val
y_train_base = y_train["ms"].copy()
y_val_base = y_val["ms"].copy()
y_test_base = y_test["ms"].copy()

for scheme_name in ['7-class', '5-class', '3-class', 'binary']:
    bin_map = binning_steps[scheme_name]

    # Map train and val labels based on current labels (which update each iteration)
    y_train_binned = y_train_base.map(bin_map)
    y_val_binned = y_val_base.map(bin_map)
    y_test_binned = y_test_base.map(bin_map)

    y_train_binned = y_train_binned.astype(int)
    y_val_binned = y_val_binned.astype(int)
    y_test_binned = y_test_binned.astype(int)

    # Train and evaluate model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    acc_val = evaluate_model(X_train, y_train_binned, X_val, y_val_binned, model)
    acc_test = evaluate_model(X_train, y_train_binned, X_test, y_test_binned, model)

    # Store results
    results.append({
        'Scheme': scheme_name,
        'Num_Classes': len(set(bin_map.values())),
        'Validation Accuracy': acc_val,
        'Test Accuracy': acc_test
    })

    # Store the binned versions after evaluation
    y_train_versions[scheme_name] = y_train_binned
    y_val_versions[scheme_name] = y_val_binned
    y_test_versions[scheme_name] = y_test_binned

    # Update base labels for next iteration
    y_train_base = y_train_binned
    y_val_base = y_val_binned
    y_test_base = y_test_binned

# Final: show sorted results
results_df = pd.DataFrame(results).sort_values('Validation Accuracy', ascending=False)
print(results_df)

    Scheme  Num_Classes  Validation Accuracy  Test Accuracy
3   binary            2             0.828598       0.852273
2  3-class            3             0.815341       0.838068
1  5-class            5             0.522727       0.533144
0  7-class            7             0.416667       0.410985


In [480]:
y_train = y_train_versions['binary']
y_val = y_val_versions['binary']
y_test = y_test_versions['binary']

In [481]:
X_val

Unnamed: 0,age,num_brought_up_children,religiosity,safety,love,globe
1803,0.899040,1.158685,6.0,1.0,4.0,5.0
5253,-0.673630,-0.121882,3.0,2.0,4.0,6.0
472,-0.149406,0.625567,3.0,2.0,3.0,6.0
5931,-0.498889,0.625567,0.0,0.0,3.0,5.0
3950,0.724299,0.625567,2.0,5.0,1.0,5.0
...,...,...,...,...,...,...
3551,-1.285223,-1.389057,4.0,2.0,4.0,6.0
6425,0.636928,-0.121882,3.0,1.0,4.0,5.0
6511,-0.935741,-1.389057,3.0,4.0,2.0,6.0
4567,0.025335,1.158685,4.0,1.0,2.0,3.0


In [482]:
y_val

1803    1
5253    1
472     1
5931    1
3950    0
       ..
3551    1
6425    1
6511    1
4567    1
465     1
Name: ms, Length: 1056, dtype: int64

In [483]:
X_train.to_csv('X_train.csv', index=False)
X_val.to_csv('X_val.csv', index=False)
X_test.to_csv('X_test.csv', index=False)

y_train.to_csv('y_train.csv', index=False)
y_val.to_csv('y_val.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

In [484]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Optional: XGBoost
try:
    from xgboost import XGBClassifier
    xgb_installed = True
except ImportError:
    xgb_installed = False

models = [
    {
        "name": "LogisticRegression",
        "estimator": LogisticRegression(max_iter=1000),
        "param_grid": {
            "C": [0.05, 0.1, 1, 5, 10],  # more granularity
            "penalty": ['l2'],  
            "solver": ['lbfgs']
        }
    },
    {
        "name": "RandomForest",
        "estimator": RandomForestClassifier(random_state=42),
        "param_grid": {
            "n_estimators": [50, 75, 100],  # add 75
            "max_depth": [None, 8, 12],     # add 8 and 12
            "min_samples_split": [2, 4, 6], # add 4 and 6
            "max_features": ['sqrt', 'log2'] # add log2 option
        }
    },
    {
        "name": "GradientBoosting",
        "estimator": GradientBoostingClassifier(random_state=42),
        "param_grid": {
            "n_estimators": [50, 75, 100],
            "learning_rate": [0.01, 0.05, 0.1], # added 0.05
            "max_depth": [3, 4, 5]               # added 4
        }
    },
    {
        "name": "KNeighbors",
        "estimator": KNeighborsClassifier(),
        "param_grid": {
            "n_neighbors": [3, 4, 5],   # added 4
            "weights": ['uniform', 'distance']  # added distance weighting
        }
    },
    {
        "name": "SVC",
        "estimator": SVC(),
        "param_grid": {
            "C": [0.5, 1, 5, 10],   # added 0.5 and 5
            "kernel": ["linear", "rbf"],
            "gamma": ['scale', 'auto']  # added auto
        }
    },
    {
        "name": "GaussianNB",
        "estimator": GaussianNB(),
        "param_grid": {}
    },
    {
        "name": "DecisionTree",
        "estimator": DecisionTreeClassifier(random_state=42),
        "param_grid": {
            "max_depth": [None, 8, 12],
            "min_samples_split": [2, 4, 6]
        }
    },
    {
        "name": "AdaBoost",
        "estimator": AdaBoostClassifier(random_state=42),
        "param_grid": {
            "n_estimators": [50, 75, 100],
            "learning_rate": [0.01, 0.05, 0.1]
        }
    }
]

if xgb_installed:
    models.append({
        "name": "XGBoost",
        "estimator": XGBClassifier(eval_metric='mlogloss', random_state=42),
        "param_grid": {
            "n_estimators": [50, 75, 100],
            "learning_rate": [0.01, 0.05, 0.1],
            "max_depth": [3, 4, 5]
        }
    })

# Store results
results = []

# Loop over each model
for m in models:
    print(f"Training {m['name']}...")
    grid = GridSearchCV(estimator=m["estimator"],
                        param_grid=m["param_grid"],
                        cv=5,
                        scoring="accuracy",
                        n_jobs=-1)
    grid.fit(X_train, y_train.ravel())
    best_model = grid.best_estimator_

    # Evaluate on validation set
    y_val_pred = best_model.predict(X_val)
    val_acc = accuracy_score(y_val, y_val_pred)
    y_test_pred = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)

    results.append({
        "Model": m["name"],
        "Best Params": grid.best_params_,
        "Validation Accuracy": val_acc,
        "Test Accuracy": test_acc
    })

# Show results
results_df = pd.DataFrame(results).sort_values(by="Test Accuracy", ascending=False)
print(results_df.to_string(index=False))

Training LogisticRegression...
Training RandomForest...
Training GradientBoosting...
Training KNeighbors...
Training SVC...
Training GaussianNB...
Training DecisionTree...
Training AdaBoost...
Training XGBoost...
             Model                                                                           Best Params  Validation Accuracy  Test Accuracy
LogisticRegression                                       {'C': 0.05, 'penalty': 'l2', 'solver': 'lbfgs'}             0.839962       0.867424
      RandomForest {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_split': 4, 'n_estimators': 100}             0.840909       0.865530
        GaussianNB                                                                                    {}             0.839015       0.862689
           XGBoost                           {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 50}             0.834280       0.862689
               SVC                                            {'C': 1, 'gamma': 'a

In [498]:
sample_input = pd.DataFrame([{
    'age': 35,
    'num_brought_up_children': 2,
    'religiosity': 3,
    'safety': 4,
    'love': 3,
    'globe': 3
}])

prediction = best_model.predict(sample_input)[0]
proba = best_model.predict_proba(sample_input)

print("Predicted class:", prediction)
print("Prediction probabilities:", proba)

Predicted class: 1
Prediction probabilities: [[0.09200621 0.9079938 ]]
