# SLEP 014 - Array Out

In [1]:
import numpy as np
import pandas as pd
import xarray as xr

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_classification
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocessing

### preprocessing - Pandas input

`set_config(array_out)` only controls the output of `transform`. This implementation is able to recover the feature names of the input when it is a supported container type (xarray or pandas).

In [2]:
# Default configuration: transform returns a plain NumPy array.
# NOTE(review): make_classification is called without random_state, so the
# generated data may differ between runs -- consider passing a fixed seed.
X, y = make_classification()
X_df = pd.DataFrame(X, columns=[f'col_{idx}' for idx in range(X.shape[1])])

scaler = StandardScaler()
scaler.fit_transform(X_df)

array([[-1.142615  ,  0.35660045,  0.39119639, ..., -1.33671682,
        -0.35406308, -0.32762651],
       [ 0.21798197,  0.97196834, -0.8301334 , ...,  0.78892356,
        -0.98043683,  1.05222696],
       [ 1.56250589,  0.33009103,  0.26027951, ...,  0.28638684,
        -0.32856842, -0.19881353],
       ...,
       [ 2.30234167,  0.85937643, -0.60675093, ..., -0.81013353,
        -0.86583186,  0.79984199],
       [-1.19920634,  1.41091527,  1.04043811, ..., -0.7312079 ,
        -1.40499192, -0.77585696],
       [ 1.23675109, -1.20629852,  0.28539902, ..., -2.15966551,
         1.21076593, -0.54185949]])

In [3]:
# pandas in - pandas out
# With array_out='pandas' the transform output is a DataFrame that keeps
# the input column names.
set_config(array_out='pandas')
scaler.fit_transform(X_df)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
0,-1.142615,0.356600,0.391196,-1.947505,-0.680238,1.055237,-1.491447,0.653788,-1.751835,-1.209898,1.940920,0.242162,-0.566086,-1.402945,1.275400,0.133084,-1.144190,-1.336717,-0.354063,-0.327627
1,0.217982,0.971968,-0.830133,-0.378227,0.511708,-0.671199,-0.387730,1.322890,-0.044642,-0.335238,-1.722564,0.884816,-0.014638,0.391754,0.326041,0.773024,-0.072893,0.788924,-0.980437,1.052227
2,1.562506,0.330091,0.260280,-0.353139,1.207702,-0.788852,-0.642333,-0.209463,-0.056063,-0.108715,0.046162,-1.323372,0.628182,0.558387,-0.491091,0.135136,1.195388,0.286387,-0.328568,-0.198814
3,-0.421992,-0.728556,0.777572,-0.530912,0.186895,-0.302449,-1.159690,0.495416,-0.318027,2.187445,0.154679,-1.476541,-0.258160,0.156681,-0.705359,-1.406033,-0.651003,-0.176113,0.736164,-0.948046
4,-0.043267,-2.479235,0.776964,-0.847431,-0.451866,-0.023820,-0.192152,0.254927,-0.368668,-0.567186,1.106471,-0.635082,-0.067582,0.828497,2.020537,0.464375,-0.931836,-1.325375,2.489961,-1.308954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.133392,-1.189197,0.459526,1.493721,-0.625790,-0.819737,-0.677315,-2.039715,-0.075243,0.130327,-0.168514,-0.150165,-0.843906,-0.674203,-0.208983,2.562077,0.121997,1.558318,1.195046,-0.716938
96,-0.124893,-0.701258,0.395094,0.556675,-1.671541,0.520416,-0.319953,0.630670,0.303560,0.151361,2.523001,-0.648290,0.364053,1.535619,-0.554414,-0.377514,0.071349,1.107385,0.705715,-0.550083
97,2.302342,0.859376,-0.606751,0.997357,-0.147840,-0.084760,2.100667,-0.329645,0.988523,0.165532,-1.071725,0.946993,-0.853757,0.044250,-0.184916,-1.676320,-0.314634,-0.810134,-0.865832,0.799842
98,-1.199206,1.410915,1.040438,-0.619065,0.305786,1.822469,0.443582,-0.657797,-0.170729,0.114329,0.098598,0.087438,0.122496,-0.784002,-0.564732,-1.391911,2.024229,-0.731208,-1.404992,-0.775857


In [4]:
# pandas in - xarray out
# Switch the globally configured output container to xarray.
set_config(array_out='xarray')

# The scaler is fitted on a pandas DataFrame and the output is requested as xarray
scaler.fit_transform(X_df)

### preprocessing - xarray input

In [5]:
# Wrap the DataFrame in an xarray DataArray to use as xarray input
X_xr = xr.DataArray(X_df)

In [6]:
# xarray in - pandas out
# The column names of the DataArray are carried over to the resulting DataFrame.
set_config(array_out='pandas')
scaler.fit_transform(X_xr)

Unnamed: 0_level_0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
dim_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-1.142615,0.356600,0.391196,-1.947505,-0.680238,1.055237,-1.491447,0.653788,-1.751835,-1.209898,1.940920,0.242162,-0.566086,-1.402945,1.275400,0.133084,-1.144190,-1.336717,-0.354063,-0.327627
1,0.217982,0.971968,-0.830133,-0.378227,0.511708,-0.671199,-0.387730,1.322890,-0.044642,-0.335238,-1.722564,0.884816,-0.014638,0.391754,0.326041,0.773024,-0.072893,0.788924,-0.980437,1.052227
2,1.562506,0.330091,0.260280,-0.353139,1.207702,-0.788852,-0.642333,-0.209463,-0.056063,-0.108715,0.046162,-1.323372,0.628182,0.558387,-0.491091,0.135136,1.195388,0.286387,-0.328568,-0.198814
3,-0.421992,-0.728556,0.777572,-0.530912,0.186895,-0.302449,-1.159690,0.495416,-0.318027,2.187445,0.154679,-1.476541,-0.258160,0.156681,-0.705359,-1.406033,-0.651003,-0.176113,0.736164,-0.948046
4,-0.043267,-2.479235,0.776964,-0.847431,-0.451866,-0.023820,-0.192152,0.254927,-0.368668,-0.567186,1.106471,-0.635082,-0.067582,0.828497,2.020537,0.464375,-0.931836,-1.325375,2.489961,-1.308954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.133392,-1.189197,0.459526,1.493721,-0.625790,-0.819737,-0.677315,-2.039715,-0.075243,0.130327,-0.168514,-0.150165,-0.843906,-0.674203,-0.208983,2.562077,0.121997,1.558318,1.195046,-0.716938
96,-0.124893,-0.701258,0.395094,0.556675,-1.671541,0.520416,-0.319953,0.630670,0.303560,0.151361,2.523001,-0.648290,0.364053,1.535619,-0.554414,-0.377514,0.071349,1.107385,0.705715,-0.550083
97,2.302342,0.859376,-0.606751,0.997357,-0.147840,-0.084760,2.100667,-0.329645,0.988523,0.165532,-1.071725,0.946993,-0.853757,0.044250,-0.184916,-1.676320,-0.314634,-0.810134,-0.865832,0.799842
98,-1.199206,1.410915,1.040438,-0.619065,0.305786,1.822469,0.443582,-0.657797,-0.170729,0.114329,0.098598,0.087438,0.122496,-0.784002,-0.564732,-1.391911,2.024229,-0.731208,-1.404992,-0.775857


In [7]:
# xarray in - xarray out
# Same scaler, with xarray used for both the input and the configured output.
set_config(array_out='xarray')
scaler.fit_transform(X_xr)

## Feature Selection

In [8]:
# Feature selector constructed with its default arguments
selector = SelectPercentile()

In [9]:
# pandas in - pandas out
# Only the selected columns are returned, under their original names.
set_config(array_out='pandas')
selector.fit_transform(X_df, y)

Unnamed: 0,col_1,col_18
0,0.342681,-0.344500
1,1.074069,-1.100627
2,0.311173,-0.313724
3,-0.947068,0.971568
4,-3.027816,3.088664
...,...,...
95,-1.494557,1.525507
96,-0.914624,0.934811
97,0.940249,-0.962281
98,1.595773,-1.613128


In [10]:
# pandas in - xarray out
set_config(array_out='xarray')

# The DataArray repr is pretty long, so only the type of the result is shown
type(selector.fit_transform(X_df, y))

xarray.core.dataarray.DataArray

## PCA

Note the names of the columns

In [11]:
# PCA keeping two components
pca = PCA(n_components=2)

In [12]:
# pandas in - pandas out
# The output columns are named after the transformer (pca0, pca1), not the input.
set_config(array_out='pandas')
pca.fit_transform(X_df, y)

Unnamed: 0,pca0,pca1
0,-0.401215,0.538285
1,-0.934573,1.824744
2,-0.321926,-0.445245
3,0.718762,-1.693192
4,2.544506,-2.948795
...,...,...
95,1.116815,-2.772835
96,0.695889,-1.057872
97,-0.861526,1.872648
98,-2.626137,0.530346


## Imputer

Imputer with `add_indicator=True` will add new columns

In [13]:
# Small 3x3 example containing two missing entries (NaN) for the imputer demos
X1 = np.array([
    [np.nan, 1, 3],
    [4, 0, np.nan],
    [8, 1, 0],
])
X1_df = pd.DataFrame(
    X1,
    columns=[f'col_{idx}' for idx in range(X1.shape[1])],
)

In [14]:
# pandas in - pandas out
# add_indicator=True: the output gains mask_* columns for the input columns
# that contained missing values.
imp_mean = KNNImputer(n_neighbors=2, add_indicator=True)
imp_mean.fit_transform(X1_df)

Unnamed: 0,col_0,col_1,col_2,mask_col_0,mask_col_2
0,6.0,1.0,3.0,1.0,0.0
1,4.0,0.0,1.5,0.0,1.0
2,8.0,1.0,0.0,0.0,0.0


In [15]:
# pandas in - pandas out
# add_indicator=False: no mask columns are added, only the imputed input columns
imp_mean = KNNImputer(n_neighbors=2, add_indicator=False)
imp_mean.fit_transform(X1_df)

Unnamed: 0,col_0,col_1,col_2
0,6.0,1.0,3.0
1,4.0,0.0,1.5
2,8.0,1.0,0.0


## Vectorizer

In [16]:
# Text vectorizer with pandas output: one column per vocabulary term
set_config(array_out='pandas')
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
# NOTE(review): this rebinds X, replacing the feature matrix created by
# make_classification earlier in the notebook.
X = vectorizer.fit_transform(corpus)

X

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


In [17]:
# Inspect the column dtypes: the vectorizer output is stored as pandas sparse columns
X.dtypes

and         Sparse[float64, 0.0]
document    Sparse[float64, 0.0]
first       Sparse[float64, 0.0]
is          Sparse[float64, 0.0]
one         Sparse[float64, 0.0]
second      Sparse[float64, 0.0]
the         Sparse[float64, 0.0]
third       Sparse[float64, 0.0]
this        Sparse[float64, 0.0]
dtype: object

In [18]:
# Same vectorizer and corpus, with xarray as the configured output container
set_config(array_out='xarray')
vectorizer.fit_transform(corpus)

## Column Transformer

In [19]:
# Load the titanic dataset as pandas objects (rebinds X and y).
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Numeric columns: median imputation followed by standardisation.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the constant 'missing', then one-hot encode.
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    n_jobs=2,
)

# Full prediction pipeline: preprocessing followed by the classifier.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

In [23]:
# Fit the full pipeline with pandas output enabled.
# Assigning to `_` keeps the fitted estimator's repr out of the cell output.
set_config(array_out='pandas')
_ = clf.fit(X, y)

In [24]:
# get names
# clf[:-1] is the pipeline without its final 'classifier' step; the
# transformed DataFrame carries the generated feature names as columns.
output = clf[:-1].transform(X)
output

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_missing,sex_female,sex_male,pclass_1.0,pclass_2.0,pclass_3.0
0,-0.039005,3.442584,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-2.215952,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,-2.131977,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.038512,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-0.349075,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1304,-1.163009,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1305,-0.116523,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1306,-0.232799,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1307,-0.194040,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [25]:
# dense: every output column has a plain float64 dtype
output.dtypes

age                 float64
fare                float64
embarked_C          float64
embarked_Q          float64
embarked_S          float64
embarked_missing    float64
sex_female          float64
sex_male            float64
pclass_1.0          float64
pclass_2.0          float64
pclass_3.0          float64
dtype: object

In [26]:
# get names with only transforming the first row
# The column names are the same as when the full dataset is transformed.
output = clf[:-1].transform(X.iloc[:1])
output.columns

Index(['age', 'fare', 'embarked_C', 'embarked_Q', 'embarked_S',
       'embarked_missing', 'sex_female', 'sex_male', 'pclass_1.0',
       'pclass_2.0', 'pclass_3.0'],
      dtype='object')

In [27]:
# force output transform to be sparse
# Change sparse_threshold on the 'preprocessor' step, then refit the pipeline.
clf.set_params(preprocessor__sparse_threshold=0.5)
_ = clf.fit(X, y)

In [28]:
# Check the sparse_output_ attribute of the refitted ColumnTransformer
clf['preprocessor'].sparse_output_

True

In [29]:
# Same transform as before, after refitting with sparse_threshold=0.5
output = clf[:-1].transform(X)
output

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_missing,sex_female,sex_male,pclass_1.0,pclass_2.0,pclass_3.0
0,-0.039005,3.442584,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-2.215952,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,-2.131977,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.038512,2.286639,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-0.349075,2.286639,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1304,-1.163009,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1305,-0.116523,-0.364003,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1306,-0.232799,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1307,-0.194040,-0.503774,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [30]:
# sparse output: every column now has a pandas Sparse dtype
output.dtypes

age                 Sparse[float64, 0.0]
fare                Sparse[float64, 0.0]
embarked_C          Sparse[float64, 0.0]
embarked_Q          Sparse[float64, 0.0]
embarked_S          Sparse[float64, 0.0]
embarked_missing    Sparse[float64, 0.0]
sex_female          Sparse[float64, 0.0]
sex_male            Sparse[float64, 0.0]
pclass_1.0          Sparse[float64, 0.0]
pclass_2.0          Sparse[float64, 0.0]
pclass_3.0          Sparse[float64, 0.0]
dtype: object

### Using xarray

The `sparse` package must be installed (`pip install sparse`) to enable sparse support.

In [31]:
# Refit the pipeline with xarray as the configured output container
set_config(array_out='xarray')
_ = clf.fit(X, y)

In [32]:
# Transform with every step except the final classifier and display the result
output = clf[:-1].transform(X)
output