# Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline
from xgboost import XGBClassifier

import joblib
import numpy
import seaborn

# Load the Titanic passenger dataset through seaborn.
df = seaborn.load_dataset("titanic")

# Feature columns, grouped by the kind of domain decorator applied to them below.
continuous_cols = ["age", "fare", "parch", "sibsp"]
categorical_cols = ["embarked", "pclass", "sex", "deck"]

X = df[continuous_cols + categorical_cols]
y = df["survived"]

# One single-column step per feature: continuous features are cast to float32,
# categorical features to the pandas "category" dtype.
continuous_steps = [
    (continuous_col, ContinuousDomain(dtype = numpy.float32), [continuous_col])
    for continuous_col in continuous_cols
]
categorical_steps = [
    (categorical_col, CategoricalDomain(dtype = "category"), [categorical_col])
    for categorical_col in categorical_cols
]

transformer = ColumnTransformer(continuous_steps + categorical_steps)
# Request DataFrame output instead of a plain array; presumably needed so the
# "category" dtype survives for the classifier's enable_categorical = True mode.
transformer.set_output(transform = "pandas")

classifier = XGBClassifier(enable_categorical = True, random_state = 42)

pipeline = PMMLPipeline([
    ("transformer", transformer),
    ("classifier", classifier)
])
pipeline.fit(X, y)

# Persist the fitted pipeline; the file stem matches the PMML export
# ("XGBoostTitanic") so both artifacts are recognisably the same model.
joblib.dump(pipeline, "resources/XGBoostTitanic.pkl")

# Dataset

In [None]:
# Save the feature columns (without the row index) as an Excel workbook.
X.to_excel(excel_writer = "resources/Titanic.xlsx", index = False)

# PMML

In [None]:
from sklearn2pmml import sklearn2pmml

# Export the fitted pipeline from the first cell to a PMML file.
sklearn2pmml(pipeline, "resources/XGBoostTitanic.pmml")