# Pandas output with scikit-learn's `set_output` API 

In [None]:
import sklearn

In [None]:
sklearn.__version__

## What is the `set_output` API?

In [None]:
import pandas as pd

In [None]:
# Tiny toy frame with two numeric columns, reused by the scaling and
# pipeline examples below.
X_df = pd.DataFrame(
    {
        "age": [30, 40, 50],
        "height": [120, 140, 160],
    }
)

In [None]:
X_df

### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the columns of X_df; fit_transform learns the scaling
# statistics and applies them in a single call.
# No output container has been configured yet, so the result is expected to
# be a NumPy array (hence the `_np` suffix).
scaler = StandardScaler()
X_out_np = scaler.fit_transform(X_df)

In [None]:
X_out_np

In [None]:
# Ask this scaler to return pandas DataFrames from transform / fit_transform
# from now on.
scaler.set_output(transform="pandas") 

In [None]:
# Same call as for X_out_np, but the scaler is now configured for pandas
# output.
X_out_df = scaler.fit_transform(X_df)

In [None]:
X_out_df

## What about a Pipeline?

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Chain standardization with polynomial feature expansion.
# Calling set_output on the pipeline configures the steps inside it, so the
# pipeline's transformed output comes back as a DataFrame.
pipe = make_pipeline(StandardScaler(), PolynomialFeatures())
pipe.set_output(transform="pandas")

In [None]:
# Fit every step of the pipeline on X_df and keep the transformed result.
X_trans = pipe.fit_transform(X_df)

In [None]:
X_trans

## Heterogeneous Data

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [None]:
# Fetch version 1 of the "titanic" dataset from OpenML (needs network access
# unless it is already cached). as_frame=True returns pandas objects and
# parser="pandas" selects the pandas-based parser.
titanic = fetch_openml("titanic", version=1, as_frame=True, parser="pandas")

In [None]:
# Separate the features from the target, then hold out a test split.
# stratify=y keeps the class proportions of y in both splits;
# random_state=0 makes the split reproducible.
X, y = titanic.data, titanic.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [None]:
X_train.head()

### Numerical Preprocessing

In [None]:
# Numeric columns: fill missing values with the column median, then
# standardize.
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

### Categorical Preprocessing

In [None]:
# Categorical columns are one-hot encoded.
# handle_unknown="ignore": categories not seen during fit do not raise at
# transform time.
# sparse_output=False: produce dense output, which is what pandas output via
# set_output needs.
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

### Combine them!

In [None]:
# Route each group of columns through its own preprocessing and concatenate
# the results.
# verbose_feature_names_out is a keyword argument of ColumnTransformer (it
# must sit outside the list of transformers); False keeps the output feature
# names free of the "<transformer name>__" prefix.
preprocessor = ColumnTransformer(
    [
        ("numerical", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ],
    verbose_feature_names_out=False,
)

In [None]:
preprocessor.fit_transform(X_train)

### `set_output`!

In [None]:
# Switch the ColumnTransformer to pandas output so that the transformed data
# comes back as a DataFrame.
preprocessor.set_output(transform="pandas")

In [None]:
# Refit and transform the training data, this time with pandas output
# configured on the preprocessor.
X_train_out = preprocessor.fit_transform(X_train)

In [None]:
X_train_out

### Predictive Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# End-to-end model: column-wise preprocessing followed by a logistic
# regression classifier.
log_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

In [None]:
# Configure pandas output on the pipeline so the classifier is fitted on data
# that carries column names (inspected further down via feature_names_in_).
log_reg.set_output(transform="pandas")

In [None]:
# Train the full pipeline on the training split. Assigning to `_` keeps the
# cell from displaying the value returned by fit.
_ = log_reg.fit(X_train, y_train)

## Feature names!

In [None]:
# Index the pipeline to grab its last step, i.e. the LogisticRegression
# registered under the name "classifier"; the bare name then displays it.
classifier = log_reg[-1]
classifier

In [None]:
classifier.feature_names_in_

In [None]:
classifier.coef_[0]

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the first row of fitted coefficients, labelled with
# the feature names seen by the classifier and sorted by value.
fig, ax = plt.subplots(figsize=(8, 4), dpi=300)
_ = (
    pd.Series(classifier.coef_[0], index=classifier.feature_names_in_)
    .sort_values()
    .plot.barh(ax=ax)
)

## Running on JupyterLite with Pyodide!

![](https://raw.githubusercontent.com/pyodide/pyodide/master/docs/_static/img/pyodide-logo.png)

## Run this notebook in your browser!

### This notebook is available on JupyterLite:
### https://thomasjpfan.github.io/sklearn-set_output-material/lab/index.html

![](https://raw.githubusercontent.com/thomasjpfan/sklearn-set_output-material/main/qrcode_thomasjpfan.github.io.png)