### Introduction
This notebook provides an example of how to use the PAKKR library to build a training and validation pipeline on Fisher's iris dataset.

### Setup
Install the packages required for this example

In [None]:
%pip install numpy pandas scikit-learn

In [2]:
from typing import Callable, Dict, NamedTuple, List, Union, Tuple

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.model_selection import StratifiedShuffleSplit

from pakkr import returns, Pipeline

In [3]:
class IrisData(NamedTuple):
    """
    Immutable container for the parts of the iris dataset used by this pipeline.

    `load_iris_data` fills each field from the entry of the same name in the
    object returned by `sklearn.datasets.load_iris()`.
    """
    # Feature values; used as the body of the features DataFrame.
    data: np.ndarray
    # Per-sample class codes; translated to names via `target_names` by position.
    target: np.ndarray
    # Class names, looked up by the codes stored in `target`.
    target_names: np.ndarray
    # Column names applied to `data`.
    feature_names: List[str]

In [4]:
# Alias used in the pipeline's type annotation for the `test_size` argument.
TestSize = float

### Defining the steps

In [5]:
@returns(stratified_sampler=StratifiedShuffleSplit)
def initialise_sampler(test_size) -> Dict[str, StratifiedShuffleSplit]:
    """
    Builds a single-split stratified sampler and saves it into the meta.

    The sampler is returned under the "stratified_sampler" key so that a later
    step can consume it by that name.
    """
    sampler = StratifiedShuffleSplit(test_size=test_size, n_splits=1)
    return dict(stratified_sampler=sampler)

In [6]:
def load_iris_data() -> IrisData:
    """
    Loads scikit-learn's iris dataset and keeps only the entries declared on `IrisData`.
    """
    bunch = datasets.load_iris()
    # _fields is in declaration order, which is the positional order _make expects
    return IrisData._make(bunch[name] for name in IrisData._fields)

In [7]:
# The decorator tells PAKKR that this step returns two separate objects, not a single tuple of two objects
@returns(pd.DataFrame, pd.Series)
def convert_to_pandas(iris_data: IrisData) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Converts the raw iris arrays into pandas objects.

    Returns the features as a DataFrame with one column per feature name, and
    the labels as a Series holding class names instead of integer codes.
    """
    # position in target_names -> class name
    code_to_name = dict(enumerate(iris_data.target_names))
    labels = pd.Series(iris_data.target).map(code_to_name)
    features = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
    return features, labels

In [8]:
@returns(pd.DataFrame, pd.Series, test_features=pd.DataFrame, test_labels=pd.Series)
def create_train_test_split(
    features: pd.DataFrame, labels: pd.Series, stratified_sampler: StratifiedShuffleSplit
) -> Tuple[pd.DataFrame, pd.Series, Dict[str, Union[pd.DataFrame, pd.Series]]]:
    """
    Splits the dataset into training and testing sets.
    Saves the test set into the meta to be consumed by a later step.

    Args:
        features: feature table, one row per sample.
        labels: label for each row of `features`, in the same order.
        stratified_sampler: sampler whose first split is used.

    Returns:
        The training features, the training labels, and a dict holding the
        held-out rows under "test_features" and "test_labels".
    """
    # Only the first split produced by the sampler is consumed.
    train_idx, test_idx = next(stratified_sampler.split(features, labels))
    # split() yields integer positions, not index labels, so rows are selected
    # with .iloc; .loc would only be correct for a default 0..n-1 index.
    return (
        features.iloc[train_idx], labels.iloc[train_idx],
        {"test_features": features.iloc[test_idx], "test_labels": labels.iloc[test_idx]}
    )

In [9]:
def train_model(features: pd.DataFrame, labels: pd.Series, clf: BaseEstimator) -> BaseEstimator:
    """
    Extracts clf from meta and fits to training data

    Args:
        features: training features.
        labels: training labels matching `features`.
        clf: estimator to fit.

    Returns:
        The same `clf` object that was passed in, after `fit` has been called on it.
    """
    clf.fit(features, labels)
    # Return the estimator object itself rather than whatever fit() returns.
    return clf

In [10]:
def validate_model(clf: BaseEstimator, test_features: pd.DataFrame, test_labels: pd.Series) -> float:
    """
    Extracts test data from meta and scores the classifier

    Args:
        clf: estimator to evaluate (the value returned by the preceding step).
        test_features: held-out features.
        test_labels: held-out labels matching `test_features`.

    Returns:
        The result of `clf.score(test_features, test_labels)`.
        NOTE(review): for scikit-learn classifiers this is presumably mean accuracy; confirm for the estimator used.
    """
    return clf.score(test_features, test_labels)

### Constructing the pipeline object

In [11]:
# Chain the steps into one callable. As used in this notebook it is invoked with
# `clf` and `test_size` keyword arguments and evaluates to the float produced by
# the last step, `validate_model`.
pipeline: Callable[[BaseEstimator, TestSize], float] = Pipeline(
    initialise_sampler,       # builds the sampler and saves it into the meta
    load_iris_data,           # loads the raw dataset
    convert_to_pandas,        # raw arrays -> (features, labels)
    create_train_test_split,  # passes the training set on, saves the test set into the meta
    train_model,              # fits `clf` on the training set
    validate_model            # scores the fitted `clf` on the test set
)

### Running the pipeline on a classifier

In [12]:
# The sampler is created without a random_state, so the split (and therefore
# the score shown below) can differ from run to run.
# NOTE(review): `multi_class` is deprecated in newer scikit-learn releases; confirm it is
# still accepted by the installed version.
clf = LogisticRegression(multi_class="ovr", penalty="l2", solver='lbfgs')
# `clf` and `test_size` are passed by keyword; presumably matched to step parameters of the same name.
pipeline(clf=clf, test_size=0.4)

0.8833333333333333