In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn import set_config

# Show estimators as diagrams (rather than plain-text reprs) when a cell
# displays them.
set_config(display="diagram")

## Load Data

In [2]:
# Read the penguins dataset; the relative path is resolved against the
# notebook's working directory.
penguins = pd.read_csv(filepath_or_buffer="penguins.csv")

In [3]:
# Feature matrix: every column of the dataset except the target.
X = penguins.drop(columns=["species"])

In [5]:
# Encode the species names as integer class labels. The fitted encoder is kept
# in a variable so integer predictions can later be mapped back to species
# names with `label_encoder.inverse_transform(...)`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(penguins["species"])

## Create Pipeline

In [6]:
# Split the feature columns by how they are preprocessed: the categorical
# ones are ordinal-encoded, the numerical ones are forwarded unchanged.
categorical_columns = ['island', 'gender']
numerical_columns = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

ct = ColumnTransformer(
    transformers=[
        ("category", OrdinalEncoder(), categorical_columns),
        ("numerical", 'passthrough', numerical_columns),
    ]
)

In [7]:
# Full model: column preprocessing followed by a random forest.
# The forest is seeded (same seed as the train/test split) so that training,
# the reported test score, and the serialized model are reproducible when the
# notebook is re-run from a fresh kernel.
clf = Pipeline(
    steps=[
        ("preprocess", ct),
        ("random_forest", RandomForestClassifier(random_state=0)),
    ]
)

In [8]:
# Display the pipeline (not fitted yet at this point in the notebook) so its
# structure can be inspected.
clf

## Train and Evaluate Model

In [9]:
# NOTE(review): redundant — `train_test_split` is already imported in the
# import cell at the top of the notebook, so this cell can be removed.
from sklearn.model_selection import train_test_split

In [10]:
# Hold-out split with a fixed seed, stratified on the labels so both parts
# keep the class proportions; the test size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [11]:
# Fit the preprocessing steps and the classifier on the training split.
# The trailing semicolon suppresses the estimator repr without rebinding `_`,
# which IPython uses for the previous cell's output.
clf.fit(X_train, y_train);

In [12]:
# Score the fitted pipeline on the held-out test split (mean accuracy for a
# classifier).
clf.score(X=X_test, y=y_test)

0.9761904761904762

## Serialize Model

In [13]:
import joblib

In [14]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
# NOTE(review): joblib files are pickle-based — only load them from trusted
# sources.
joblib.dump(value=clf, filename="penguin_clf.joblib")

['penguin_clf.joblib']