In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path
import sklearn

# Ask scikit-learn to use its diagram representation when estimators are displayed.
sklearn.set_config(display="diagram")

## Load Data

In [5]:
# Feature columns, grouped by how the preprocessing step handles them.
categorical_columns = ["island", "gender"]
numerical_columns = [
    "culmen_length_mm",
    "culmen_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]

# Full feature list: categorical columns first, then the numerical ones.
feature_names = [*categorical_columns, *numerical_columns]

In [6]:
# Read the penguin dataset from the CSV file inside the local "media" directory.
penguins_csv = Path("media") / "penguins.csv"
penguins = pd.read_csv(penguins_csv)

In [7]:
# Separate the target column ('species') from the feature columns used as model input.
y = penguins["species"]
X = penguins[feature_names]

## Create Pipeline

In [8]:
# Column-wise preprocessing: ordinal-encode the categorical columns and
# forward the numerical columns unchanged.
ct = ColumnTransformer([
    # With the default handle_unknown="error", the encoder raises as soon as
    # transform() meets a category it did not see during fit, so one unexpected
    # 'island' / 'gender' value would break scoring or any later prediction made
    # with the fitted pipeline. Unknown categories are mapped to -1 instead; the
    # codes learned during fit start at 0, so -1 cannot collide with them.
    # NOTE: these two parameters require scikit-learn >= 0.24.
    (
        "category",
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        categorical_columns,
    ),
    ("numerical", "passthrough", numerical_columns),
])

In [9]:
# End-to-end model: the column preprocessing followed by a random forest
# classifier whose random_state is fixed at 42.
clf = Pipeline(
    steps=[
        ("preprocess", ct),
        ("random_forest", RandomForestClassifier(random_state=42)),
    ]
)

In [10]:
# Bare expression as the cell's last line: the notebook displays the assembled
# pipeline (expected to render as a diagram given display='diagram' set earlier).
clf

## Train and Evaluate Model

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out a test split. Stratifying on y keeps the class proportions aligned
# between the two splits; random_state=0 makes the split repeatable.
# The split size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [13]:
# Fit the whole pipeline (preprocessing + random forest) on the training split.
# Assigning the result to `_` keeps the notebook from displaying the returned value.
_ = clf.fit(X_train, y_train)

In [14]:
# Score the fitted pipeline on the held-out split; as the last expression of
# the cell, the value is shown as the cell output.
clf.score(X_test, y_test)

0.9880952380952381

## Serialize Model

In [18]:
from pathlib import Path
import joblib

In [16]:
# Refit the same pipeline on the full dataset (X, y) rather than only the
# training split, so the object that gets serialized has been fitted on every row.
# NOTE(review): the held-out score computed earlier describes the fit on X_train
# only, not this final refit.
_ = clf.fit(X, y)

In [19]:
# Persist the fitted pipeline with joblib inside the "media" directory.
# The dump call stays the last expression so its return value is shown as output.
media_dir = Path("media")
model_path = media_dir / "penguin_clf.joblib"
joblib.dump(clf, model_path)

['media/penguin_clf.joblib']