# SLEP 014 - Array Out

In [1]:
import numpy as np
import pandas as pd
import xarray as xr

import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_classification
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocessing

### preprocessing - Pandas input

`set_config(array_out=...)` only controls the output of `transform`. This implementation is able to get the feature names of the input if it is a supported container type (xarray or pandas).

In [2]:
# Default configuration: the transformer output below is a plain NumPy array.
# The seed is fixed so the synthetic dataset, and every result derived from
# it further down, is the same on each Restart & Run All.
X, y = make_classification(random_state=0)
# Build the column labels as a list instead of handing pandas a one-shot
# generator.
X_df = pd.DataFrame(X, columns=[f'col_{i}' for i in range(X.shape[1])])

scaler = StandardScaler()
scaler.fit_transform(X_df)

array([[-0.39867235,  1.18473668,  1.20494208, ...,  0.06229269,
        -0.70141154,  0.16954979],
       [ 0.92878237, -0.26047046,  0.63406026, ..., -0.63195192,
         0.39732725, -1.07566067],
       [-0.14461934, -1.14735331, -1.33341382, ..., -0.93883853,
        -0.00916972,  0.41170781],
       ...,
       [ 1.38566055, -1.29496577,  0.43767395, ...,  1.59036131,
        -0.0101716 , -1.50425165],
       [-0.97401946, -0.57539366,  1.16213316, ...,  0.9452667 ,
        -0.82534698,  0.76665964],
       [ 1.74973346,  0.13722354,  0.28104896, ..., -1.33894385,
         0.76203891, -1.8457575 ]])

In [3]:
# pandas in - pandas out
# With array_out='pandas' the transformed data comes back as a DataFrame
# that keeps the col_* labels of X_df (see the output below).
sklearn.set_config(array_out='pandas')
scaler.fit_transform(X_df)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
0,-0.398672,1.184737,1.204942,1.617942,0.456715,-1.617035,0.463816,0.649817,0.528221,0.068504,-0.149695,0.083117,0.571341,0.500291,0.245118,-0.745248,-0.094948,0.062293,-0.701412,0.169550
1,0.928782,-0.260470,0.634060,-1.685819,-0.871921,1.089503,-0.501481,1.094239,1.348791,-1.317823,-0.715659,-0.092660,-1.094154,-0.848422,-2.035259,-0.577405,-0.770059,-0.631952,0.397327,-1.075661
2,-0.144619,-1.147353,-1.333414,0.546481,0.386168,0.480302,0.747235,0.544793,-0.230769,0.798637,0.194281,-0.736609,-0.615303,0.018332,0.693910,0.955847,2.068566,-0.938839,-0.009170,0.411708
3,-0.082910,0.427630,-2.232391,-1.861177,2.888396,0.609907,-0.157065,-1.996808,-0.248138,-0.323789,-0.405915,-0.890960,-0.196478,-0.124759,-0.032577,-0.657444,-0.368234,1.201237,0.922912,0.526389
4,-1.169628,0.718057,0.761641,-0.180379,-1.069799,0.995147,-0.527740,-0.939047,-1.018803,0.111292,0.161688,-1.052788,-0.501163,1.212192,-0.015703,0.452361,-1.378811,-0.457553,-1.049005,1.046005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.369800,-0.941111,0.874274,-0.415392,-0.319538,0.066545,1.204313,0.733084,1.004304,0.630047,-0.739140,0.039036,-1.716519,-0.280505,-0.431531,2.088119,-0.307211,-0.473274,-1.411629,-0.551275
96,1.823732,-1.696503,0.140537,-0.705628,-0.830240,-2.118823,0.971211,-1.389206,0.799538,-0.679603,-2.132970,0.268352,1.116251,-1.767716,0.628620,0.133021,-0.398354,-1.138097,-0.365929,-1.893674
97,1.385661,-1.294966,0.437674,-1.187107,-0.944736,0.677519,-0.209969,-0.821286,0.955307,1.689500,-1.071744,1.667669,-1.834020,-1.312608,-0.963890,1.267678,0.615854,1.590361,-0.010172,-1.504252
98,-0.974019,-0.575394,1.162133,0.188288,0.106384,1.122962,-0.161002,0.395979,1.216517,2.023348,-0.250688,-0.534834,-0.829221,1.058107,0.517300,1.099640,1.034596,0.945267,-0.825347,0.766660


In [4]:
# pandas in - xarray out
sklearn.set_config(array_out='xarray')

# The scaler is fitted on a pandas DataFrame, while the output container
# requested through the config is xarray.
scaler.fit_transform(X_df)

### preprocessing - xarray input

In [5]:
# Wrap the DataFrame in an xarray DataArray to use as transformer input.
X_xr = xr.DataArray(X_df)

In [6]:
# xarray in - pandas out
# The DataArray input is returned as a DataFrame whose columns carry the
# same col_* labels (see the output below).
sklearn.set_config(array_out='pandas')
scaler.fit_transform(X_xr)

Unnamed: 0_level_0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
dim_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-0.398672,1.184737,1.204942,1.617942,0.456715,-1.617035,0.463816,0.649817,0.528221,0.068504,-0.149695,0.083117,0.571341,0.500291,0.245118,-0.745248,-0.094948,0.062293,-0.701412,0.169550
1,0.928782,-0.260470,0.634060,-1.685819,-0.871921,1.089503,-0.501481,1.094239,1.348791,-1.317823,-0.715659,-0.092660,-1.094154,-0.848422,-2.035259,-0.577405,-0.770059,-0.631952,0.397327,-1.075661
2,-0.144619,-1.147353,-1.333414,0.546481,0.386168,0.480302,0.747235,0.544793,-0.230769,0.798637,0.194281,-0.736609,-0.615303,0.018332,0.693910,0.955847,2.068566,-0.938839,-0.009170,0.411708
3,-0.082910,0.427630,-2.232391,-1.861177,2.888396,0.609907,-0.157065,-1.996808,-0.248138,-0.323789,-0.405915,-0.890960,-0.196478,-0.124759,-0.032577,-0.657444,-0.368234,1.201237,0.922912,0.526389
4,-1.169628,0.718057,0.761641,-0.180379,-1.069799,0.995147,-0.527740,-0.939047,-1.018803,0.111292,0.161688,-1.052788,-0.501163,1.212192,-0.015703,0.452361,-1.378811,-0.457553,-1.049005,1.046005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.369800,-0.941111,0.874274,-0.415392,-0.319538,0.066545,1.204313,0.733084,1.004304,0.630047,-0.739140,0.039036,-1.716519,-0.280505,-0.431531,2.088119,-0.307211,-0.473274,-1.411629,-0.551275
96,1.823732,-1.696503,0.140537,-0.705628,-0.830240,-2.118823,0.971211,-1.389206,0.799538,-0.679603,-2.132970,0.268352,1.116251,-1.767716,0.628620,0.133021,-0.398354,-1.138097,-0.365929,-1.893674
97,1.385661,-1.294966,0.437674,-1.187107,-0.944736,0.677519,-0.209969,-0.821286,0.955307,1.689500,-1.071744,1.667669,-1.834020,-1.312608,-0.963890,1.267678,0.615854,1.590361,-0.010172,-1.504252
98,-0.974019,-0.575394,1.162133,0.188288,0.106384,1.122962,-0.161002,0.395979,1.216517,2.023348,-0.250688,-0.534834,-0.829221,1.058107,0.517300,1.099640,1.034596,0.945267,-0.825347,0.766660


In [7]:
# xarray in - xarray out
# Same scaler and same DataArray input, now with xarray requested as the
# output container.
sklearn.set_config(array_out='xarray')
scaler.fit_transform(X_xr)

## Feature Selection

In [8]:
# Feature selector with default settings; it is fitted in the following cells.
selector = SelectPercentile()

In [9]:
# pandas in - pandas out
# The resulting DataFrame contains only the selected columns, labelled
# with their original input names (see the output below).
sklearn.set_config(array_out='pandas')
selector.fit_transform(X_df, y)

Unnamed: 0,col_2,col_13
0,1.540582,0.496614
1,0.839386,-0.747034
2,-1.577198,0.052200
3,-2.681382,-0.079745
4,0.996090,1.153058
...,...,...
95,1.134433,-0.223358
96,0.233208,-1.594715
97,0.598172,-1.175060
98,1.488001,1.010976


In [10]:
# pandas in - xarray out
sklearn.set_config(array_out='xarray')

# The repr is pretty long here, so only the type of the result is shown.
type(selector.fit_transform(X_df, y))

xarray.core.dataarray.DataArray

## PCA

Note the names of the columns

In [11]:
# Keep two components so the output has exactly two columns.
pca = PCA(n_components=2)

In [12]:
# pandas in - pandas out
# The output columns get generated names (pca0, pca1) rather than any of
# the input column names (see the output below).
sklearn.set_config(array_out='pandas')
pca.fit_transform(X_df, y)

Unnamed: 0,pca0,pca1
0,-0.807541,-0.579474
1,1.852399,-1.341401
2,-0.054654,1.438483
3,-0.307193,2.448628
4,-2.101223,-0.096826
...,...,...
95,0.878862,-1.512174
96,3.288825,-0.433149
97,2.561718,-2.609962
98,-1.765448,-1.191540


## Imputer

Imputer with `add_indicator=True` will add new columns

In [13]:
# Small 3x3 example with one missing value in the first column and one in
# the last column.
X1 = np.array([
    [np.nan, 1, 3],
    [4, 0, np.nan],
    [8, 1, 0],
])
# Label the columns col_0 .. col_2, one label per column of X1.
X1_df = pd.DataFrame(
    X1,
    columns=[f'col_{idx}' for idx in range(X1.shape[1])],
)

In [14]:
# pandas in - pandas out
# add_indicator=True
# Request pandas output explicitly so this cell does not depend on the
# configuration left behind by an earlier cell.
sklearn.set_config(array_out='pandas')
# With the indicator enabled the result gains extra mask_col_* columns next
# to the imputed ones (see the output below).
imp_mean = KNNImputer(n_neighbors=2, add_indicator=True)
imp_mean.fit_transform(X1_df)

Unnamed: 0,col_0,col_1,col_2,mask_col_0,mask_col_2
0,6.0,1.0,3.0,1.0,0.0
1,4.0,0.0,1.5,0.0,1.0
2,8.0,1.0,0.0,0.0,0.0


In [15]:
# pandas in - pandas out
# add_indicator=False
# Without the indicator the output has exactly the input columns
# (see the output below).
imp_mean = KNNImputer(n_neighbors=2, add_indicator=False)
imp_mean.fit_transform(X1_df)

Unnamed: 0,col_0,col_1,col_2
0,6.0,1.0,3.0
1,4.0,0.0,1.5
2,8.0,1.0,0.0


## Vectorizer

In [16]:
# Ask for pandas output before vectorising the text.
sklearn.set_config(array_out='pandas')

vectorizer = TfidfVectorizer()
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# NOTE: X is rebound here; from this point on it holds the TF-IDF result,
# not the matrix produced by make_classification.
X = vectorizer.fit_transform(corpus)

X

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


In [17]:
# Every column of the TF-IDF frame uses a pandas sparse dtype (see the output below).
X.dtypes

and         Sparse[float64, 0.0]
document    Sparse[float64, 0.0]
first       Sparse[float64, 0.0]
is          Sparse[float64, 0.0]
one         Sparse[float64, 0.0]
second      Sparse[float64, 0.0]
the         Sparse[float64, 0.0]
third       Sparse[float64, 0.0]
this        Sparse[float64, 0.0]
dtype: object

In [18]:
# Same vectorizer and corpus, now with xarray requested as the output container.
sklearn.set_config(array_out='xarray')
vectorizer.fit_transform(corpus)

## Column Transformer

In [19]:
# Load the OpenML "titanic" dataset (version 1) as pandas objects.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Columns routed to each branch of the preprocessor.
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

# Numeric branch: median imputation, then standard scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the constant 'missing', then one-hot
# encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    n_jobs=2,
)

# Full prediction pipeline: the preprocessor followed by the classifier.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [20]:
# Fit the whole pipeline with pandas requested as the output container.
# The fitted estimator is assigned to `_` so its repr is not displayed.
sklearn.set_config(array_out='pandas')
_ = clf.fit(X, y)

In [21]:
# get names
# clf[:-1] is the pipeline without its last step (the classifier), so this
# runs only the preprocessor and shows the column names it produces.
output = clf[:-1].transform(X)
output

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_missing,sex_female,sex_male,pclass_1.0,pclass_2.0,pclass_3.0
0,-0.039005,3.442584,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-2.215952,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,-2.131977,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.038512,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-0.349075,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1304,-1.163009,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1305,-0.116523,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1306,-0.232799,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1307,-0.194040,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [22]:
# dense: every output column is a regular float64 column (see the output below)
output.dtypes

age                 float64
fare                float64
embarked_C          float64
embarked_Q          float64
embarked_S          float64
embarked_missing    float64
sex_female          float64
sex_male            float64
pclass_1.0          float64
pclass_2.0          float64
pclass_3.0          float64
dtype: object

In [23]:
# get names with only transforming the first row
# Transforming a single row is enough to obtain the full set of output
# column names (see the output below). Note that `output` is rebound here.
output = clf[:-1].transform(X.iloc[:1])
output.columns

Index(['age', 'fare', 'embarked_C', 'embarked_Q', 'embarked_S',
       'embarked_missing', 'sex_female', 'sex_male', 'pclass_1.0',
       'pclass_2.0', 'pclass_3.0'],
      dtype='object')

In [24]:
# force output transform to be sparse
# Sets sparse_threshold=0.5 on the 'preprocessor' step and refits the
# pipeline so the new setting takes effect.
clf.set_params(preprocessor__sparse_threshold=0.5)
_ = clf.fit(X, y)

In [25]:
# Check the fitted preprocessor's sparse_output_ attribute (True in the output below).
clf['preprocessor'].sparse_output_

True

In [26]:
# Re-run the preprocessing steps (pipeline minus the classifier) after the refit.
output = clf[:-1].transform(X)
output

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_missing,sex_female,sex_male,pclass_1.0,pclass_2.0,pclass_3.0
0,-0.039005,3.442584,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-2.215952,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,-2.131977,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.038512,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-0.349075,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1304,-1.163009,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1305,-0.116523,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1306,-0.232799,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1307,-0.194040,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [27]:
# sparse output: every column now uses a pandas sparse dtype (see the output below)
output.dtypes

age                 Sparse[float64, 0.0]
fare                Sparse[float64, 0.0]
embarked_C          Sparse[float64, 0.0]
embarked_Q          Sparse[float64, 0.0]
embarked_S          Sparse[float64, 0.0]
embarked_missing    Sparse[float64, 0.0]
sex_female          Sparse[float64, 0.0]
sex_male            Sparse[float64, 0.0]
pclass_1.0          Sparse[float64, 0.0]
pclass_2.0          Sparse[float64, 0.0]
pclass_3.0          Sparse[float64, 0.0]
dtype: object

### Using xarray

You need to install the `sparse` package (`pip install sparse`) to enable sparse support.

In [28]:
# Refit the same pipeline with xarray requested as the output container.
sklearn.set_config(array_out='xarray')
_ = clf.fit(X, y)

In [29]:
# Run the preprocessing steps (pipeline minus the classifier) under the
# xarray output setting and display the result.
output = clf[:-1].transform(X)
output