In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn import set_config

# Ask scikit-learn to display estimators as diagrams in notebook output.
set_config(display="diagram")

## Load Data

In [7]:
# Load the penguin table; the three label-like columns are read directly
# as pandas categoricals.
penguins = pd.read_csv(
    "penguins.csv",
    dtype=dict.fromkeys(["species", "island", "gender"], "category"),
)

In [8]:
# Feature matrix: every column except the target.
X = penguins.drop(columns=["species"])

In [9]:
# Target: integer codes of the categorical species column.
y = penguins["species"].cat.codes

## Create Pipeline

In [10]:
# Ordinal-encode the categorical columns and pass the numeric measurements
# through unchanged.
ct = ColumnTransformer(
    transformers=[
        ("category", OrdinalEncoder(), ["island", "gender"]),
        (
            "numerical",
            "passthrough",
            [
                "culmen_length_mm",
                "culmen_depth_mm",
                "flipper_length_mm",
                "body_mass_g",
            ],
        ),
    ]
)

In [11]:
# Preprocessing followed by a random forest. The forest is seeded (matching
# the random_state=0 used for the train/test split) so that refitting on a
# fresh kernel produces the same model, score, and serialized artifact.
clf = Pipeline([
    ("preprocess", ct),
    ("random_forest", RandomForestClassifier(random_state=0))
])

In [18]:
# Display the pipeline structure as the cell output.
clf

## Train and Evaluate Model

In [20]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out a test set, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [27]:
# Fit the pipeline on the training split. The trailing semicolon suppresses
# the estimator repr without rebinding `_`, which IPython uses to hold the
# previous cell's output.
clf.fit(X_train, y_train);

In [28]:
# Score the fitted pipeline on the held-out split.
clf.score(X=X_test, y=y_test)

0.9761904761904762

## Serialize Model

In [29]:
import joblib

In [30]:
# Persist the fitted pipeline to disk.
joblib.dump(clf, filename="penguin_clf.joblib")

['penguin_clf.joblib']