# Prototype of scikit-learn returning pandas
This is a demo of using the `set_output` API

<a href="https://colab.research.google.com/github/thomasjpfan/pandas-prototype-demo/blob/main/index.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open and Execute in Google Colaboratory"></a>

In [1]:
# Install dependencies for Google Colab.
# This demo needs a prototype scikit-learn build, so when running on Colab the
# pre-built wheel from the demo repository is installed into the kernel.
import sys
# True only when the `google.colab` module is already loaded in this interpreter.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # NOTE(review): the wheel filename is tagged cp37 / manylinux x86_64, so it
    # presumably only installs on a Python 3.7 Linux runtime -- confirm.
    %pip install https://github.com/thomasjpfan/pandas-prototype-demo/raw/main/scikit_learn-1.2.dev0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

## Output DataFrame from a Single Transformer

### Dense output

In [2]:
# Synthetic classification data; the fixed random_state keeps the demo reproducible.
X, y = make_classification(random_state=10)
# Name the columns feat0, feat1, ... by prefixing the default integer labels.
X_df = pd.DataFrame(X).add_prefix("feat")

In [3]:
ss = StandardScaler()

In [4]:
ss.set_output(transform="pandas_or_namedsparse")

In [5]:
ss.fit_transform(X_df)

Unnamed: 0,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,feat11,feat12,feat13,feat14,feat15,feat16,feat17,feat18,feat19
0,-0.880302,-2.396050,0.547377,-1.048385,-0.196439,0.324293,1.028296,-0.033961,1.076357,0.169339,0.046997,1.133734,1.030797,0.343421,0.274000,-0.154912,-0.788005,2.449116,-0.656519,1.192702
1,-0.485728,0.821375,0.773077,-0.621103,1.161165,0.646107,-0.095423,0.386085,0.740957,-0.624221,-0.588954,0.706193,0.657085,0.215775,0.048452,0.450214,1.234648,-0.234250,-0.516478,-0.212921
2,1.453271,-0.513932,-0.244360,1.165128,-0.095607,0.580571,-1.323653,1.536119,-0.322979,-2.396531,-1.826031,-0.801868,1.355394,-0.468105,0.353205,-1.129164,-0.255249,0.848987,-0.962493,0.920906
3,-0.865731,0.951799,-0.243617,-0.762207,1.219421,0.109731,-0.530999,-1.554566,0.396346,-0.027220,-0.452759,0.606532,-1.375904,1.041786,-0.247953,0.652500,-0.971440,0.168071,0.326901,-0.882753
4,1.350254,-0.438955,1.057077,1.253385,-0.121307,2.791162,1.745860,0.099857,-0.837173,-1.054647,-0.768671,-1.068157,0.550125,1.651482,0.308716,-0.454322,-1.944132,-0.305758,-0.276173,0.447272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.555185,0.025810,-1.661675,1.345118,0.105961,-1.061226,-1.553790,-0.426551,1.388786,-0.990824,2.366643,-1.043985,-0.368661,0.985907,-0.017246,0.208875,-2.236604,-3.430184,-0.674425,0.245093
96,0.044539,0.992958,-1.119549,0.946968,-0.262715,2.665849,-0.173737,-0.510857,0.606263,1.411877,-0.436193,-1.748067,0.551530,0.302790,0.753142,-0.562016,-0.128492,-0.866437,3.267251,1.289739
97,0.723716,1.683737,0.555751,0.922472,-0.045143,-0.629533,0.528512,2.971062,0.879717,0.017493,-1.664795,-1.046627,0.585939,-0.125043,-2.797255,0.147586,1.184328,-0.095402,0.758870,1.361339
98,0.752143,-0.378988,-2.240845,0.892482,1.354957,0.279515,-0.564973,-0.576898,0.760334,-1.888393,0.514482,-0.962487,1.005699,0.463654,0.088371,-0.454042,-0.940322,-0.295258,0.549095,-2.516950


### Sparse output

In [6]:
# Convert the array X into a SciPy CSR matrix, then wrap it in a sparse-backed
# DataFrame so that the input carries the feat0, feat1, ... column names.
X_csr = csr_matrix(X)
X_df_sp = pd.DataFrame.sparse.from_spmatrix(X_csr, columns=[f"feat{i}" for i in range(X.shape[1])])

# NOTE(review): with_mean=False is presumably required because StandardScaler
# cannot center sparse input without densifying it -- confirm against the docs.
ss_sp = StandardScaler(with_mean=False)
ss_sp.set_output(transform="pandas_or_namedsparse")

# Fit on the sparse DataFrame and keep the transformed result for inspection below.
X_sp_trans = ss_sp.fit_transform(X_df_sp)

In [7]:
X_sp_trans

<100x20 sparse matrix of type '<class 'numpy.float64'>'
	with 2000 stored elements in Compressed Sparse Row format>

The output is a custom sparse-matrix subclass that carries the column names. It is used instead of a pandas DataFrame for performance reasons (pandas's sparse extension arrays are not well suited for CSR matrices):

In [8]:
type(X_sp_trans)

sklearn.utils.output_container.NamedCSRMatrix

In [9]:
X_sp_trans.columns

array(['feat0', 'feat1', 'feat2', 'feat3', 'feat4', 'feat5', 'feat6',
       'feat7', 'feat8', 'feat9', 'feat10', 'feat11', 'feat12', 'feat13',
       'feat14', 'feat15', 'feat16', 'feat17', 'feat18', 'feat19'],
      dtype=object)

## Column Transformer with DataFrame output

A `ColumnTransformer` outputting a DataFrame:

In [10]:
# Load the Titanic dataset as pandas objects. This rebinds X and y, which
# previously held the synthetic classification data.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")),
           ("scaler", StandardScaler())]
)

# Categorical columns: one-hot encode; handle_unknown="ignore" is passed so that
# categories unseen during fit do not raise at transform time.
# NOTE(review): released scikit-learn (>= 1.2) names this parameter
# `sparse_output`; `sparse=False` is kept for the prototype wheel -- confirm
# before running against a released version.
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)

  warn(


In [11]:
# Route columns to the transformers defined above: "age" and "fare" go through
# the numeric pipeline, "embarked", "sex" and "pclass" through the one-hot
# encoder. Only these five columns are listed.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, ["age", "fare"]),
        ("cat", categorical_transformer, ["embarked", "sex", "pclass"]),
    ]
)

In [12]:
preprocessor.set_output(transform="pandas_or_namedsparse")

In [13]:
X_trans = preprocessor.fit_transform(X)

In [14]:
X_trans

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_nan,sex_female,sex_male,pclass_1.0,pclass_2.0,pclass_3.0
0,-0.039005,3.442584,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-2.215952,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,-2.131977,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.038512,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-0.349075,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1304,-1.163009,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1305,-0.116523,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1306,-0.232799,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1307,-0.194040,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


## Cross validation and feature selection based on column metadata

Showcasing how to create a custom transformer used for feature selection:

In [15]:
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator

class MyPandasFeatureSelector(TransformerMixin, BaseEstimator):
    """Toy feature selector that keeps every other column of a DataFrame.

    The selection relies only on column metadata (the column names), so the
    input to ``fit`` must be a DataFrame-like object exposing ``.iloc`` and
    ``.columns``.
    """

    def fit(self, X, y=None):
        """Record the input columns and choose which ones to keep.

        Parameters
        ----------
        X : DataFrame-like
            Must have an ``iloc`` attribute, otherwise an ``AssertionError``
            is raised.
        y : ignored
            Accepted so the selector can be used inside a pipeline.

        Returns
        -------
        self
        """
        assert hasattr(X, "iloc"), "Custom error! Input must be dataframe"
        column_names = np.asarray(X.columns)
        self.feature_names_in_ = column_names
        self.n_features_in_ = X.shape[1]

        # A feature selection algorithm that uses column metadata:
        # keep the columns at even positions (0, 2, 4, ...).
        self.selected_features_ = column_names[::2]
        return self

    def get_feature_names_out(self, input_features=None):
        """Return the names of the selected columns (``input_features`` is unused)."""
        return self.selected_features_

    def transform(self, X, y=None):
        """Return only the columns of ``X`` that were selected during ``fit``."""
        kept_columns = self.selected_features_
        return X[kept_columns]

In [16]:
# Chain the column preprocessing, the column-name based selector and a
# logistic regression classifier.
pipe = make_pipeline(
    preprocessor,
    MyPandasFeatureSelector(),
    LogisticRegression(),
)
# Request the pandas output mode for the pipeline: MyPandasFeatureSelector.fit
# asserts that its input has an `.iloc` attribute.
pipe.set_output(transform="pandas_or_namedsparse")

In [17]:
cross_val_score(pipe, X, y)

array([0.51526718, 0.77099237, 0.59160305, 0.72900763, 0.67049808])

## Using categorical data with OrdinalEncoder and HistGradientBoosting

Use the pandas `category` dtype to select the categorical features in `HistGradientBoostingClassifier`:

In [18]:
# Preprocessing for the tree-based model: "age" and "fare" are passed through
# unchanged, while "embarked", "sex" and "pclass" are ordinal-encoded.
tree_preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", ["age", "fare"]),
        ("cat", OrdinalEncoder(), ["embarked", "sex", "pclass"]),
    ]
)
# NOTE(review): categorical_features="pandas_category" is an option of the
# prototype build; presumably it makes the classifier treat columns with the
# pandas `category` dtype as categorical -- confirm against the prototype.
hist = make_pipeline(tree_preprocessor, HistGradientBoostingClassifier(categorical_features="pandas_category"))
hist.set_output(transform="pandas_or_namedsparse")

In [19]:
hist.fit(X, y)

Feature names are in the final step:

In [20]:
hist[-1].feature_names_in_

array(['age', 'fare', 'embarked', 'sex', 'pclass'], dtype=object)

Transforming with a slice of the pipeline returns a DataFrame with categorical columns:

In [21]:
hist[:-1].transform(X).dtypes

age          float64
fare         float64
embarked    category
sex         category
pclass      category
dtype: object

## Text with sparse datasets

Performance on sparse data is unchanged because the column names are carried by the custom `NamedCSRMatrix` subclass rather than by a pandas DataFrame.

In [22]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the dataset to two newsgroups, giving a binary text-classification task.
categories = [
    'comp.sys.mac.hardware',
    'comp.os.ms-windows.misc',
]
# Parts of each post passed to `remove=` below.
# NOTE(review): presumably these are stripped from the text so the model cannot
# rely on metadata -- confirm against the fetch_20newsgroups docs.
remove = ('headers', 'footers', 'quotes')

# Load the training split for the selected categories.
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                remove=remove)

# Raw documents and their integer class labels.
text_train, y_train = data_train.data, data_train.target

In [23]:
# Text pipeline: token counts -> TF-IDF weighting -> logistic regression.
# This rebinds `pipe`, which previously held the Titanic pipeline.
pipe = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    LogisticRegression(solver="liblinear")
)

In [24]:
pipe.set_output(transform="pandas_or_namedsparse")

The custom `NamedCSRMatrix` is used to pass the column names between the pipeline steps:

In [25]:
pipe.fit(text_train, y_train)

Feature names are in the final step

In [26]:
pipe[-1].feature_names_in_

array(['00', '000', '0002', ..., 'zzrk', 'zzy_3w', 'zzzoh'], dtype=object)