In [1]:
import pandas as pd
from scipy import io
import numpy as np

In [2]:
# Location of the ARFF dataset, resolved relative to the working directory.
ARFF_PATH = 'phpMawTba.arff'

# Parse the ARFF file; element 0 of the result is later used as the records.
arff_file = io.arff.loadarff(ARFF_PATH)

In [3]:
# Build the working frame from the first element of the loaded ARFF result.
adult_census = pd.DataFrame(arff_file[0])

# The object-dtype columns hold bytes; decode them to UTF-8 text in one pass
# by reshaping to a single long Series, decoding, and reshaping back.
object_columns = adult_census.select_dtypes(include=[object])
stacked_bytes = object_columns.stack()
stacked_text = stacked_bytes.str.decode('utf-8')
str_df = stacked_text.unstack()

# Write every decoded column back over its bytes counterpart.
for column_name, decoded_values in str_df.items():
    adult_census[column_name] = decoded_values

In [5]:
# Column to predict.
target_name = "class"

# Columns removed from the feature frame: the target itself plus
# "education-num" (presumably redundant with another education column —
# TODO confirm against the dataset description).
columns_to_drop = [target_name, "education-num"]

data = adult_census.drop(columns=columns_to_drop)
target = adult_census[target_name]

In [6]:
from sklearn.compose import make_column_selector as selector

# Pick out the object-dtype (categorical) columns by dtype rather than by name.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

# Feature frame restricted to the categorical columns only.
data_categorical = data.loc[:, categorical_columns]

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression

# Integer-encode each category; categories unseen at fit time are mapped to -1
# instead of raising.
ordinal_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
# Iteration cap set to 500 for the solver.
classifier = LogisticRegression(max_iter=500)

# Encoder followed by the linear classifier.
model = make_pipeline(ordinal_encoder, classifier)

In [17]:
from sklearn.model_selection import cross_validate

# Cross-validate the pipeline on the categorical features; error_score="raise"
# makes a failing fit surface as an exception rather than a NaN score.
cv_results = cross_validate(
    estimator=model,
    X=data_categorical,
    y=target,
    error_score="raise",
)

# Per-fold test scores, summarised as mean ± standard deviation.
scores = cv_results["test_score"]
print(f"The accuracy is: {scores.mean():.3f} ± {scores.std():.3f}")

The accuracy is: 0.755 ± 0.002


In [19]:
from sklearn.preprocessing import OneHotEncoder

# Same classifier as before, but fed one-hot encoded categories; categories
# unseen at fit time are ignored instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
classifier = LogisticRegression(max_iter=500)
model = make_pipeline(one_hot_encoder, classifier)

# Cross-validate on the categorical features; error_score="raise" makes a
# failing fit surface as an exception rather than a NaN score.
cv_results = cross_validate(
    estimator=model,
    X=data_categorical,
    y=target,
    error_score="raise",
)

# Per-fold test scores, summarised as mean ± standard deviation.
scores = cv_results["test_score"]
print(f"The accuracy is: {scores.mean():.3f} ± {scores.std():.3f}")

The accuracy is: 0.833 ± 0.003


We get a higher score with the OneHotEncoder (0.833) than with the OrdinalEncoder (0.755). The OrdinalEncoder maps each category to an integer, so a linear model treats the categories as if they were ordered; when that order is arbitrary, as it is here, the result is worse. We should only use OrdinalEncoder with linear models when the categorical features have a meaningful ordering.