# SLEP 014 - Array Out

In [1]:
import numpy as np
import pandas as pd
import xarray as xr

import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_classification
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocessing

### preprocessing - Pandas input

`set_config(array_out=...)` only controls the output of `transform`. This implementation is able to pick up the feature names of the input when the input is a supported container (xarray or pandas).

In [2]:
# Default configuration: transform returns a plain NumPy array.
# A fixed seed keeps the synthetic data identical across Restart & Run All.
X, y = make_classification(random_state=0)
# Label the columns col_0 .. col_{n-1}; a list (not a one-shot generator)
# is used so the labels are a concrete, reusable sequence.
X_df = pd.DataFrame(X, columns=[f'col_{i}' for i in range(X.shape[1])])

scaler = StandardScaler()
scaler.fit_transform(X_df)

array([[ 1.356447  ,  0.81611184,  0.78867229, ..., -1.32250758,
         1.60110248, -1.14469095],
       [ 0.15101098, -0.69564911, -0.7135259 , ...,  0.59768628,
        -0.39200721,  0.83907429],
       [-0.67648145,  0.04162263,  0.87103981, ..., -0.44517499,
         0.17817167,  0.79102917],
       ...,
       [ 1.29774852, -0.90813987,  0.2843654 , ..., -0.13550371,
        -0.21146928, -0.44301369],
       [ 1.85161806,  2.62210009,  0.6353359 , ..., -0.06132012,
        -1.1847646 , -1.44258382],
       [ 0.91654212, -0.08760267, -0.05942394, ..., -0.48928776,
         1.13734533,  0.07960359]])

In [3]:
# pandas in - pandas out: switch the global output container to pandas so
# that transform returns a DataFrame carrying the input's column names.
sklearn.set_config(array_out='pandas')
scaler.fit_transform(X_df)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
0,1.356447,0.816112,0.788672,0.570354,0.684400,-0.470715,-0.039886,0.970753,-1.054728,-0.430772,1.984878,1.046568,0.045881,-1.237080,-0.203623,0.851948,-0.455446,-1.322508,1.601102,-1.144691
1,0.151011,-0.695649,-0.713526,0.791866,1.384597,-1.366565,-1.252703,1.580561,1.741660,0.564771,0.550676,1.909699,-0.192716,0.256784,-0.654087,-0.876959,-1.546416,0.597686,-0.392007,0.839074
2,-0.676481,0.041623,0.871040,0.259214,-0.014030,0.011369,-0.738231,0.266978,0.370829,1.560032,-1.722668,0.133896,0.335425,-0.247864,-0.931789,0.429766,-0.150230,-0.445175,0.178172,0.791029
3,0.426148,1.279425,0.299828,-0.765316,-1.972809,-2.366709,-0.537182,-1.867669,0.063663,0.387561,0.257060,-2.512851,0.792381,-1.594538,0.916416,-0.542662,0.156995,0.592176,-0.713517,-0.472415
4,1.031910,-0.620521,0.130774,-1.691176,1.093072,-2.038797,-1.366303,-1.205104,-0.351485,-0.797000,0.682569,0.179325,1.542882,0.345876,-2.451171,0.621671,0.173384,-0.755557,-0.481320,-0.966010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.843265,1.464696,0.750894,-0.693124,-0.733977,-0.713373,0.489840,-1.127329,0.088484,-1.794042,-0.221711,-1.169090,-0.345916,1.502949,-0.354544,-0.026824,2.060491,1.172367,-2.376351,-0.099026
96,0.490457,-0.682399,-0.068754,0.821546,-1.609283,-0.099668,0.258974,0.007565,0.653234,-0.809367,1.012795,-1.220697,1.678377,-0.114610,0.408030,-0.084089,-0.337733,-0.076514,-0.256521,0.882835
97,1.297749,-0.908140,0.284365,0.524564,-0.526102,-0.317198,0.688611,0.273552,-0.593388,-1.082077,-0.336417,-0.252271,-1.605595,0.635821,-0.436481,-0.715243,-0.016927,-0.135504,-0.211469,-0.443014
98,1.851618,2.622100,0.635336,-3.179808,0.553705,-0.297366,-0.610987,-3.070552,0.636130,0.338906,0.276452,-1.241359,-0.993249,-0.028071,0.237403,0.274399,0.993243,-0.061320,-1.184765,-1.442584


In [4]:
# pandas in - xarray out: switch the global output container to xarray.
sklearn.set_config(array_out='xarray')

# The scaler is (re)fitted on the pandas DataFrame, yet the transformed
# result is requested as an xarray object.
scaler.fit_transform(X_df)

### preprocessing - xarray input

In [5]:
# Wrap the same DataFrame in an xarray DataArray to use as labelled input.
X_xr = xr.DataArray(X_df)

In [6]:
# xarray in - pandas out: the column labels held by the DataArray come back
# as the column names of the resulting DataFrame.
sklearn.set_config(array_out='pandas')
scaler.fit_transform(X_xr)

Unnamed: 0_level_0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
dim_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,1.356447,0.816112,0.788672,0.570354,0.684400,-0.470715,-0.039886,0.970753,-1.054728,-0.430772,1.984878,1.046568,0.045881,-1.237080,-0.203623,0.851948,-0.455446,-1.322508,1.601102,-1.144691
1,0.151011,-0.695649,-0.713526,0.791866,1.384597,-1.366565,-1.252703,1.580561,1.741660,0.564771,0.550676,1.909699,-0.192716,0.256784,-0.654087,-0.876959,-1.546416,0.597686,-0.392007,0.839074
2,-0.676481,0.041623,0.871040,0.259214,-0.014030,0.011369,-0.738231,0.266978,0.370829,1.560032,-1.722668,0.133896,0.335425,-0.247864,-0.931789,0.429766,-0.150230,-0.445175,0.178172,0.791029
3,0.426148,1.279425,0.299828,-0.765316,-1.972809,-2.366709,-0.537182,-1.867669,0.063663,0.387561,0.257060,-2.512851,0.792381,-1.594538,0.916416,-0.542662,0.156995,0.592176,-0.713517,-0.472415
4,1.031910,-0.620521,0.130774,-1.691176,1.093072,-2.038797,-1.366303,-1.205104,-0.351485,-0.797000,0.682569,0.179325,1.542882,0.345876,-2.451171,0.621671,0.173384,-0.755557,-0.481320,-0.966010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.843265,1.464696,0.750894,-0.693124,-0.733977,-0.713373,0.489840,-1.127329,0.088484,-1.794042,-0.221711,-1.169090,-0.345916,1.502949,-0.354544,-0.026824,2.060491,1.172367,-2.376351,-0.099026
96,0.490457,-0.682399,-0.068754,0.821546,-1.609283,-0.099668,0.258974,0.007565,0.653234,-0.809367,1.012795,-1.220697,1.678377,-0.114610,0.408030,-0.084089,-0.337733,-0.076514,-0.256521,0.882835
97,1.297749,-0.908140,0.284365,0.524564,-0.526102,-0.317198,0.688611,0.273552,-0.593388,-1.082077,-0.336417,-0.252271,-1.605595,0.635821,-0.436481,-0.715243,-0.016927,-0.135504,-0.211469,-0.443014
98,1.851618,2.622100,0.635336,-3.179808,0.553705,-0.297366,-0.610987,-3.070552,0.636130,0.338906,0.276452,-1.241359,-0.993249,-0.028071,0.237403,0.274399,0.993243,-0.061320,-1.184765,-1.442584


In [7]:
# xarray in - xarray out: same input as the previous cell, but with the
# output container switched back to xarray.
sklearn.set_config(array_out='xarray')
scaler.fit_transform(X_xr)

## Feature Selection

In [8]:
# Univariate feature selector with default settings; it is fitted in the
# cells below, which need the target y as well as the features.
selector = SelectPercentile()

In [9]:
# pandas in - pandas out: the selected columns keep their original names,
# so it is visible which input features survived the selection.
sklearn.set_config(array_out='pandas')
selector.fit_transform(X_df, y)

Unnamed: 0,col_4,col_11
0,0.447978,1.413429
1,0.953547,2.569173
2,-0.056316,0.191349
3,-1.470631,-3.352681
4,0.743055,0.252179
...,...,...
95,-0.576146,-1.553367
96,-1.208151,-1.622470
97,-0.426052,-0.325734
98,0.353611,-1.650136


In [10]:
# pandas in - xarray out
sklearn.set_config(array_out='xarray')

# The DataArray repr is pretty long, so only the type of the result is shown.
type(selector.fit_transform(X_df, y))

xarray.core.dataarray.DataArray

## PCA

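Note the names of the output columns: PCA creates new features, so the columns are named `pca0`, `pca1`, ... rather than after the input columns.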

In [11]:
# Project the data onto two principal components.
pca = PCA(n_components=2)

In [12]:
# pandas in - pandas out: the output columns are generated names
# (pca0, pca1), not names taken from the input DataFrame.
sklearn.set_config(array_out='pandas')
pca.fit_transform(X_df, y)

Unnamed: 0,pca0,pca1
0,-1.441070,-1.249519
1,-3.272548,-1.223280
2,-0.807302,0.248715
3,4.724726,0.931272
4,2.184702,-2.241249
...,...,...
95,2.018311,1.281780
96,0.404812,1.361957
97,-0.307628,0.539200
98,5.968299,-2.327750


## Imputer

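An imputer with `add_indicator=True` appends new columns to the output: one `mask_<column>` indicator for each input column that contained missing values.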

In [13]:
# Toy 3x3 matrix: one missing entry in the first column, one in the last
# column, and a complete middle column.
X1 = np.array(
    [
        [np.nan, 1, 3],
        [4, 0, np.nan],
        [8, 1, 0],
    ]
)
# Same values as a DataFrame, one generated label per column of X1.
X1_df = pd.DataFrame(
    X1,
    columns=[f'col_{i}' for i in range(X1.shape[1])],
)

In [14]:
# pandas in - pandas out
# add_indicator=True: besides the imputed values, the output gains one
# indicator column per input column that contained missing values.
# Pandas output is selected explicitly so this cell does not depend on the
# configuration left behind by whichever cell ran last.
sklearn.set_config(array_out='pandas')
imp_mean = KNNImputer(n_neighbors=2, add_indicator=True)
imp_mean.fit_transform(X1_df)

Unnamed: 0,col_0,col_1,col_2,mask_col_0,mask_col_2
0,6.0,1.0,3.0,1.0,0.0
1,4.0,0.0,1.5,0.0,1.0
2,8.0,1.0,0.0,0.0,0.0


In [15]:
# pandas in - pandas out
# add_indicator=False: only the imputed input columns are returned.
# Pandas output is selected explicitly so this cell does not depend on the
# configuration left behind by whichever cell ran last.
sklearn.set_config(array_out='pandas')
imp_mean = KNNImputer(n_neighbors=2, add_indicator=False)
imp_mean.fit_transform(X1_df)

Unnamed: 0,col_0,col_1,col_2
0,6.0,1.0,3.0
1,4.0,0.0,1.5
2,8.0,1.0,0.0


## Vectorizer

In [16]:
# Request pandas output before vectorizing the text.
sklearn.set_config(array_out='pandas')
# Four short sentences: a tiny corpus whose vocabulary is easy to read off
# the resulting column names.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
# NOTE: X is rebound here; it no longer refers to the make_classification
# features used in the earlier sections.
X = vectorizer.fit_transform(corpus)

X

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


In [17]:
# Inspect the per-column dtypes of the TF-IDF frame (one column per term).
X.dtypes

and         Sparse[float64, 0.0]
document    Sparse[float64, 0.0]
first       Sparse[float64, 0.0]
is          Sparse[float64, 0.0]
one         Sparse[float64, 0.0]
second      Sparse[float64, 0.0]
the         Sparse[float64, 0.0]
third       Sparse[float64, 0.0]
this        Sparse[float64, 0.0]
dtype: object

In [18]:
# Same vectorizer and corpus, now with xarray requested as output container.
sklearn.set_config(array_out='xarray')
vectorizer.fit_transform(corpus)

## Column Transformer

In [19]:
# Titanic data from OpenML, returned as pandas objects (features, target).
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Numeric branch: fill gaps with the column median, then standardize.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace gaps with the literal 'missing', then one-hot
# encode, ignoring categories that were not seen during fit.
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its branch, using two parallel jobs.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    n_jobs=2,
)

# Full prediction pipeline: the preprocessing step followed by the classifier.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [20]:
# Fit the whole pipeline with pandas as the output container.
sklearn.set_config(array_out='pandas')
# Assigning to `_` keeps the fitted estimator's repr out of the cell output.
_ = clf.fit(X, y)

In [21]:
# Get the output feature names: clf[:-1] is the pipeline without its final
# classifier step, so transform returns the preprocessed features.
output = clf[:-1].transform(X)
output

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_missing,sex_female,sex_male,pclass_1.0,pclass_2.0,pclass_3.0
0,-0.039005,3.442584,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-2.215952,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,-2.131977,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.038512,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-0.349075,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1304,-1.163009,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1305,-0.116523,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1306,-0.232799,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1307,-0.194040,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [22]:
# With no sparse_threshold set on the ColumnTransformer, the transformed
# frame is dense: inspect the per-column dtypes to confirm.
output.dtypes

age                 float64
fare                float64
embarked_C          float64
embarked_Q          float64
embarked_S          float64
embarked_missing    float64
sex_female          float64
sex_male            float64
pclass_1.0          float64
pclass_2.0          float64
pclass_3.0          float64
dtype: object

In [23]:
# Transforming only the first row is enough to obtain the output column
# names. Note that this rebinds `output` to the one-row result.
output = clf[:-1].transform(X.iloc[:1])
output.columns

Index(['age', 'fare', 'embarked_C', 'embarked_Q', 'embarked_S',
       'embarked_missing', 'sex_female', 'sex_male', 'pclass_1.0',
       'pclass_2.0', 'pclass_3.0'],
      dtype='object')

In [24]:
# Make the ColumnTransformer produce sparse output: set its
# sparse_threshold to 0.5 through the pipeline's nested-parameter syntax.
clf.set_params(preprocessor__sparse_threshold=0.5)
# Refit so the new setting is applied; `_ =` hides the estimator repr.
_ = clf.fit(X, y)

In [25]:
# Fitted attribute of the ColumnTransformer step: check whether its output
# is now sparse.
clf['preprocessor'].sparse_output_

True

In [26]:
# Run the refitted preprocessing steps (everything except the classifier)
# on the full data again.
output = clf[:-1].transform(X)
output

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_missing,sex_female,sex_male,pclass_1.0,pclass_2.0,pclass_3.0
0,-0.039005,3.442584,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-2.215952,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,-2.131977,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.038512,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-0.349075,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1304,-1.163009,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1305,-0.116523,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1306,-0.232799,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1307,-0.194040,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [27]:
# Sparse output: inspect the per-column dtypes of the refitted
# preprocessor's result.
output.dtypes

age                 Sparse[float64, 0.0]
fare                Sparse[float64, 0.0]
embarked_C          Sparse[float64, 0.0]
embarked_Q          Sparse[float64, 0.0]
embarked_S          Sparse[float64, 0.0]
embarked_missing    Sparse[float64, 0.0]
sex_female          Sparse[float64, 0.0]
sex_male            Sparse[float64, 0.0]
pclass_1.0          Sparse[float64, 0.0]
pclass_2.0          Sparse[float64, 0.0]
pclass_3.0          Sparse[float64, 0.0]
dtype: object

### Using xarray

The `sparse` package must be installed (`pip install sparse`) to enable sparse support with xarray output.

In [28]:
# Switch the output container to xarray and refit the whole pipeline.
sklearn.set_config(array_out='xarray')
# Assigning to `_` keeps the fitted estimator's repr out of the cell output.
_ = clf.fit(X, y)

In [29]:
# Preprocessed features from every step except the final classifier, with
# xarray selected as the output container.
output = clf[:-1].transform(X)
output