# Pandas output

In this notebook, we will learn about pandas output in scikit-learn.

<a href="https://colab.research.google.com/github/thomasjpfan/ml-workshop-intro-v2/blob/main/notebooks/05-pandas-output.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open and Execute in Google Colaboratory"></a>

In [None]:
# Install dependencies for Google Colab.
# Colab is detected by checking whether the `google.colab` module has been
# loaded; outside Colab the install step below is skipped.
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # Install the workshop requirements straight from the repository.
    %pip install -r https://raw.githubusercontent.com/thomasjpfan/ml-workshop-intro-v2/main/requirements.txt

In [None]:
import sklearn

# The `set_output` API used throughout this notebook was added in
# scikit-learn 1.2, so any release from 1.2 onwards is fine. Compare the
# (major, minor) numbers instead of matching the string prefix "1.2", which
# would reject newer releases such as 1.3.
assert tuple(int(part) for part in sklearn.__version__.split(".")[:2]) >= (1, 2), (
    "Please install scikit-learn 1.2 or newer"
)

## Load wine data set

In [None]:
from sklearn.datasets import load_wine

In [None]:
# Load the wine data set with as_frame=True, then pull out the feature
# table and the target.
wine = load_wine(as_frame=True)
X = wine.data
y = wine.target

In [None]:
from sklearn.model_selection import train_test_split

# Split the wine data, stratifying on the target and fixing the seed so
# the split is the same on every run. No test_size is given, so
# train_test_split's default is used.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [None]:
# Display the training features.
X_train

## Default Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler created with its default settings; no output format is configured.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features and transform them in one call.
X_scaled = scaler.fit_transform(X_train)

In [None]:
# Show the first five rows of the scaled output.
X_scaled[:5]

In [None]:
# Shape of the training features before scaling.
X_train.shape

In [None]:
# Shape of the scaled output, for comparison with X_train.shape.
X_scaled.shape

### Scaler with pandas output

In [None]:
# New scaler, this time configured so that its transform output is pandas.
# set_output is kept as the last statement so the estimator is displayed.
scaler = StandardScaler()
scaler.set_output(transform="pandas")

In [None]:
# Fit and transform the training features with the pandas-configured scaler.
X_scaled = scaler.fit_transform(X_train)

In [None]:
# Display the scaled result.
X_scaled

## In a pipeline (Default)

In [None]:
from sklearn.feature_selection import SelectPercentile
from sklearn.pipeline import make_pipeline

In [None]:
# Two-step pipeline: scale the features, then run SelectPercentile with
# percentile=50. No output format is configured here.
pipe = make_pipeline(StandardScaler(), SelectPercentile(percentile=50))

In [None]:
# Fit the pipeline on the training features and target, and transform the
# training features in the same call.
X_transformed = pipe.fit_transform(X_train, y_train)

In [None]:
# Show the first five rows of the pipeline output.
X_transformed[:5]

### Pipeline with pandas

In [None]:
# Configure the existing pipeline so that its transform output is pandas.
pipe.set_output(transform="pandas")

In [None]:
# Fit and transform again now that the output format has been changed.
X_transformed = pipe.fit_transform(X_train, y_train)

In [None]:
# Display the pipeline output.
X_transformed

## With Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Pipeline ending in a classifier: SelectPercentile with percentile=50
# feeds a random forest.
pipe = make_pipeline(
    SelectPercentile(percentile=50),
    # Seed the forest so the fitted model, its test score and its feature
    # importances are the same on every Restart & Run All, matching the
    # seeded train/test split.
    RandomForestClassifier(random_state=0),
)
# Ask for pandas output from the transform steps.
pipe.set_output(transform="pandas")

In [None]:
# Fit the whole pipeline on the training split; the trailing semicolon
# suppresses the cell output.
pipe.fit(X_train, y_train);

In [None]:
# Score the fitted pipeline on the held-out test split.
pipe.score(X_test, y_test)

In [None]:
# Index the pipeline to get its last step, the RandomForestClassifier.
pipe[-1]

### Get importances and the corresponding features

In [None]:
# Read the feature importances from the fitted classifier (the last step).
importances = pipe[-1].feature_importances_
importances

In [None]:
# Read the input feature names stored on the last pipeline step; they are
# used below to label the importances.
feature_names = pipe[-1].feature_names_in_
feature_names

In [None]:
import pandas as pd

# Pair each importance with its feature name and sort so the bars are ordered.
importances_series = pd.Series(importances, index=feature_names).sort_values()

# Label the figure so it can be read on its own when skimming the notebook.
ax = importances_series.plot.barh()
ax.set(
    title="Random forest feature importances",
    xlabel="Importance",
    ylabel="Feature",
);

## Exercise 1

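1. Load the `pc3` dataset from OpenML and split it into training and test sets.
2. Build a pipeline with a `StandardScaler` and a `LogisticRegression` that uses an L1 penalty, and configure the pipeline to output pandas DataFrames.
3. Fit the pipeline, score it on the test set, and plot the coefficients against the feature names.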

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression

In [None]:
# Fetch version 1 of the "pc3" dataset from OpenML using the pandas parser.
# NOTE(review): the variable is called `steel` although the dataset requested
# is "pc3" -- confirm the intended name; it is also used in the next cell.
steel = fetch_openml("pc3", parser="pandas", as_frame=True, version=1)

In [None]:
# Pull the features and target out of the fetched dataset. This rebinds the
# X and y names used for the wine data earlier in the notebook.
X, y = steel.data, steel.target

In [None]:
# Display the exercise features.
X

In [None]:
# Stratified, seeded split of the exercise data. This rebinds the split
# variables used for the wine data earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [None]:
# Scale the features, then fit an L1-penalised logistic regression using
# the liblinear solver.
log_reg = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty="l1", solver="liblinear"),
)
# Ask for pandas output from the transform step; left as the last statement
# so the pipeline is displayed.
log_reg.set_output(transform="pandas")

In [None]:
# Fit the pipeline on the training split; the trailing semicolon suppresses
# the cell output.
log_reg.fit(X_train, y_train);

In [None]:
# Score the fitted pipeline on the held-out test split.
log_reg.score(X_test, y_test)

In [None]:
# Take the coefficients from the last pipeline step (the LogisticRegression)
# and flatten them to one dimension so they can be used as Series values.
coefs = log_reg[-1].coef_.flatten()
# Input feature names stored on the same step, used to label the coefficients.
feature_names_in = log_reg[-1].feature_names_in_

In [None]:
# Label each coefficient with its feature name, then sort by value.
coefs_series = (
    pd.Series(coefs, index=feature_names_in)
    .sort_values()
)

In [None]:
# Horizontal bar chart of the sorted coefficients. The title and axis labels
# let the figure be read on its own when skimming the notebook.
ax = coefs_series.plot.barh(figsize=(12, 8))
ax.set(
    title="Logistic regression coefficients (L1 penalty)",
    xlabel="Coefficient",
    ylabel="Feature",
);