In [1]:
import pandas as pd
from scipy import io
import numpy as np

In [2]:
# Read the ARFF dataset from a path relative to the working directory.
# The result is indexed below as arff_file[0] to obtain the record data.
arff_file = io.arff.loadarff('phpMawTba.arff')

In [3]:
adult_census = pd.DataFrame(arff_file[0])

# convert bytes columns to strings
#
# Each object-dtype column is decoded directly with the vectorised string
# accessor. This avoids reshaping the whole frame into one long MultiIndex
# Series via stack()/unstack(), which is slower and depends on the legacy
# stack() behaviour (implicit NaN dropping) that newer pandas is replacing.
str_df = adult_census.select_dtypes([object]).apply(
    lambda column: column.str.decode('utf-8')
)

# Write the decoded columns back; numeric columns are left untouched.
for col in str_df:
    adult_census[col] = str_df[col]

In [5]:
# Split the frame into the label to predict and the feature columns.
# "fnlwgt" is removed from the features together with the label column.
target_name = "class"
excluded_columns = [target_name, "fnlwgt"]

target = adult_census[target_name]
data = adult_census.drop(columns=excluded_columns)

In [6]:
# Build dtype-based column selectors: object-dtype columns are treated as
# categorical, every other dtype as numerical.

from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# Applying a selector to the feature frame yields the matching column names.
numerical_columns, categorical_columns = (
    numerical_columns_selector(data),
    categorical_columns_selector(data),
)

In [7]:
# One preprocessor per column kind.

from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numerical features are standardised.
numerical_preprocessor = StandardScaler()

# Categorical features are one-hot encoded; handle_unknown="ignore" keeps
# transform from raising on categories not seen during fit.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [8]:
# Route each group of columns to its own preprocessor.

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        # (step name, transformer, columns it is applied to)
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ],
)

In [9]:
# Chain preprocessing and a logistic-regression classifier into one
# estimator, so both are fitted and applied together.

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

linear_classifier = LogisticRegression(max_iter=500)
model = make_pipeline(preprocessor, linear_classifier)

# Last expression of the cell: displays the pipeline.
model

In [10]:
# Hold out part of the data for evaluation; the fixed random_state makes the
# split reproducible across runs.

from sklearn.model_selection import train_test_split

splits = train_test_split(data, target, random_state=42)
data_train, data_test, target_train, target_test = splits

In [11]:
# Fit the whole pipeline on the training split. Binding the return value to
# `_` keeps the cell from displaying the fitted estimator.
_ = model.fit(data_train, target_train)

In [12]:
# Preview the first rows of the held-out features.
data_test.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
7762,56.0,Private,HS-grad,9.0,Divorced,Other-service,Unmarried,White,Female,0.0,0.0,40.0,United-States
23881,25.0,Private,HS-grad,9.0,Married-civ-spouse,Transport-moving,Own-child,Other,Male,0.0,0.0,40.0,United-States
30507,43.0,Private,Bachelors,13.0,Divorced,Prof-specialty,Not-in-family,White,Female,14344.0,0.0,40.0,United-States
28911,32.0,Private,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,0.0,40.0,United-States
19484,39.0,Private,Bachelors,13.0,Married-civ-spouse,Sales,Wife,White,Female,0.0,0.0,30.0,United-States


In [13]:
# Predict on the full test split and show the first five predicted labels.
model.predict(data_test)[:5]

array(['<=50K', '<=50K', '>50K', '<=50K', '>50K'], dtype=object)

In [14]:
# True labels of the first five test rows, for comparison with the predictions.
target_test.iloc[:5]

7762     <=50K
23881    <=50K
30507     >50K
28911    <=50K
19484    <=50K
Name: class, dtype: object

In [15]:
# Score the fitted pipeline on the held-out test split.
model.score(data_test, target_test)

0.8576693145524527

In [16]:
# Evaluate the pipeline with 5-fold cross-validation on the full dataset
# rather than relying on a single train/test split.

from sklearn.model_selection import cross_validate

cv_results = cross_validate(estimator=model, X=data, y=target, cv=5)

# Display the timing and score arrays returned for each fold.
cv_results

{'fit_time': array([0.16946626, 0.1593709 , 0.17164207, 0.16381788, 0.15777302]),
 'score_time': array([0.01809192, 0.01810908, 0.01815009, 0.01874709, 0.01783514]),
 'test_score': array([0.85136657, 0.8498311 , 0.84766585, 0.85288698, 0.85554873])}

In [17]:
# Summarise the per-fold test scores as mean ± standard deviation.
scores = cv_results["test_score"]
mean_score, score_spread = scores.mean(), scores.std()
print(f"The mean cross_validation accuracy is: {mean_score:.3f} ± {score_spread:.3f}")

The mean cross_validation accuracy is: 0.851 ± 0.003


In [19]:
# Try a more complex gradient-boosting tree model to see whether it achieves
# higher predictive performance than the linear pipeline.
#
# NOTE: this cell rebinds `categorical_preprocessor`, `preprocessor` and
# `model`, replacing the objects built for the logistic-regression pipeline.
# NOTE(review): no random_state is passed to the classifier; consider fixing
# one if exact reproducibility of the score matters.

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

# Categories are mapped to integer codes; categories unseen during fit are
# encoded as -1 instead of raising.
categorical_preprocessor = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Only the categorical columns are transformed; the remaining columns are
# passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, categorical_columns),
    ],
    remainder="passthrough",
)

gradient_boosting = HistGradientBoostingClassifier()
model = make_pipeline(preprocessor, gradient_boosting)

In [20]:
%%time
# Fit the current `model` pipeline on the training split and report how long
# it takes. Binding to `_` keeps the fitted estimator from being displayed.
_ = model.fit(data_train, target_train)

CPU times: user 3.3 s, sys: 1.22 s, total: 4.52 s
Wall time: 504 ms


In [21]:
# Score the currently fitted `model` on the same held-out test split.
model.score(data_test, target_test)

0.8799443125051183