# SLEP 014 - Array Out

In [1]:
import numpy as np
import pandas as pd
import xarray as xr

import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_classification
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocessing

### preprocessing - Pandas input

`set_config(array_out=...)` only controls the output of `transform`. This implementation can also recover the feature names of the input when the input is a supported container type (pandas or xarray).

In [2]:
# Default configuration: transform returns a plain NumPy array even though
# the input is a DataFrame.
# The seed is fixed so the synthetic dataset, and every output derived from
# it further down, is reproducible on Restart & Run All.
X, y = make_classification(random_state=0)
X_df = pd.DataFrame(X, columns=[f'col_{i}' for i in range(X.shape[1])])

scaler = StandardScaler()
scaler.fit_transform(X_df)

array([[ 0.63236858,  0.21438214,  1.7305847 , ..., -1.16869063,
        -0.04020017,  0.24258487],
       [ 0.28683736, -0.20041406, -0.68556735, ..., -0.73653533,
        -1.95162888,  0.22661225],
       [ 0.86142301, -0.27681874, -0.79742916, ..., -1.26227069,
         2.31063319,  1.66578982],
       ...,
       [-0.5289726 ,  1.11220522,  0.4354815 , ...,  0.91209632,
        -2.46102226, -0.0157497 ],
       [-0.57394583, -0.65427898, -0.09586423, ...,  0.21780662,
        -0.47750872,  1.29246982],
       [-0.58103198, -0.04873699, -0.03395456, ...,  0.38229092,
        -0.27101632, -1.90236372]])

In [3]:
# pandas in - pandas out
# With array_out='pandas', transform returns a DataFrame that keeps the
# input column names (col_0, col_1, ...).
sklearn.set_config(array_out='pandas')
scaler.fit_transform(X_df)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
0,0.632369,0.214382,1.730585,-0.632916,0.983327,1.436727,0.036759,0.603812,-0.643070,-0.931671,0.045001,-0.815554,0.662897,0.913613,0.658616,-2.816002,1.014942,-1.168691,-0.040200,0.242585
1,0.286837,-0.200414,-0.685567,-0.287274,0.038054,-1.308685,-2.072445,-0.150272,-0.912111,1.823055,-2.043897,0.241335,-0.522293,0.829353,-0.938965,0.346189,-0.624927,-0.736535,-1.951629,0.226612
2,0.861423,-0.276819,-0.797429,-0.861868,-1.676123,1.134977,0.137678,1.181443,0.946478,1.127219,0.570643,1.864574,0.139041,0.581718,0.307115,0.147065,0.216268,-1.262271,2.310633,1.665790
3,-1.294626,-0.717511,0.325531,1.294161,-0.042162,-0.066734,0.054547,-0.546027,0.052209,1.484199,-0.601244,0.373635,-1.541794,1.626995,-0.418678,-0.352756,-0.437925,0.652737,-1.230644,1.132151
4,0.899362,0.810353,0.494106,-0.899466,-0.295647,-0.874470,-1.517465,0.902679,-0.083239,0.097578,-0.406492,-0.070919,0.262960,-0.188728,0.613548,0.471568,0.382574,-0.921838,-0.081353,-0.287214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.267178,-1.616993,-0.194178,-0.267575,-1.548648,-0.190753,0.202341,0.163017,0.097494,1.811433,0.091365,1.437020,-1.569897,0.750542,0.702727,-0.435093,-1.059524,-0.675126,-1.423211,-0.836922
96,0.402611,-0.937611,-0.559314,-0.402879,-0.031552,-0.363500,1.321404,0.357976,0.570886,0.341317,-0.900997,-0.791391,1.159672,0.406161,-0.880293,-0.793725,-0.655697,-0.656759,-0.896870,-0.249110
97,-0.528973,1.112205,0.435482,0.529371,-0.605549,-0.951807,-0.396096,0.268098,-1.858793,2.278664,0.643485,0.381354,0.622670,-0.632555,1.559842,-0.020960,-0.975797,0.912096,-2.461022,-0.015750
98,-0.573946,-0.654279,-0.095864,0.573675,1.575630,-1.005735,0.936550,-1.189769,0.944943,1.106661,0.358427,-0.161811,0.044758,0.865162,0.378175,-1.681452,0.403669,0.217807,-0.477509,1.292470


In [4]:
# pandas in - xarray out
sklearn.set_config(array_out='xarray')

# The input is still a pandas DataFrame; only the container requested for
# the output of transform is switched to xarray.
scaler.fit_transform(X_df)

### preprocessing - xarray input

In [5]:
# Wrap the DataFrame in an xarray DataArray to serve as xarray input below.
X_xr = xr.DataArray(X_df)

In [6]:
# xarray in - pandas out
# The column names carried by the DataArray input show up as the columns
# of the DataFrame output.
sklearn.set_config(array_out='pandas')
scaler.fit_transform(X_xr)

Unnamed: 0_level_0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19
dim_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,0.632369,0.214382,1.730585,-0.632916,0.983327,1.436727,0.036759,0.603812,-0.643070,-0.931671,0.045001,-0.815554,0.662897,0.913613,0.658616,-2.816002,1.014942,-1.168691,-0.040200,0.242585
1,0.286837,-0.200414,-0.685567,-0.287274,0.038054,-1.308685,-2.072445,-0.150272,-0.912111,1.823055,-2.043897,0.241335,-0.522293,0.829353,-0.938965,0.346189,-0.624927,-0.736535,-1.951629,0.226612
2,0.861423,-0.276819,-0.797429,-0.861868,-1.676123,1.134977,0.137678,1.181443,0.946478,1.127219,0.570643,1.864574,0.139041,0.581718,0.307115,0.147065,0.216268,-1.262271,2.310633,1.665790
3,-1.294626,-0.717511,0.325531,1.294161,-0.042162,-0.066734,0.054547,-0.546027,0.052209,1.484199,-0.601244,0.373635,-1.541794,1.626995,-0.418678,-0.352756,-0.437925,0.652737,-1.230644,1.132151
4,0.899362,0.810353,0.494106,-0.899466,-0.295647,-0.874470,-1.517465,0.902679,-0.083239,0.097578,-0.406492,-0.070919,0.262960,-0.188728,0.613548,0.471568,0.382574,-0.921838,-0.081353,-0.287214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.267178,-1.616993,-0.194178,-0.267575,-1.548648,-0.190753,0.202341,0.163017,0.097494,1.811433,0.091365,1.437020,-1.569897,0.750542,0.702727,-0.435093,-1.059524,-0.675126,-1.423211,-0.836922
96,0.402611,-0.937611,-0.559314,-0.402879,-0.031552,-0.363500,1.321404,0.357976,0.570886,0.341317,-0.900997,-0.791391,1.159672,0.406161,-0.880293,-0.793725,-0.655697,-0.656759,-0.896870,-0.249110
97,-0.528973,1.112205,0.435482,0.529371,-0.605549,-0.951807,-0.396096,0.268098,-1.858793,2.278664,0.643485,0.381354,0.622670,-0.632555,1.559842,-0.020960,-0.975797,0.912096,-2.461022,-0.015750
98,-0.573946,-0.654279,-0.095864,0.573675,1.575630,-1.005735,0.936550,-1.189769,0.944943,1.106661,0.358427,-0.161811,0.044758,0.865162,0.378175,-1.681452,0.403669,0.217807,-0.477509,1.292470


In [7]:
# xarray in - xarray out
# Input is the DataArray and the requested output container is xarray too.
sklearn.set_config(array_out='xarray')
scaler.fit_transform(X_xr)

## Feature Selection

In [8]:
# Feature selector built with its default arguments; reused by the cells below.
selector = SelectPercentile()

In [9]:
# pandas in - pandas out
# Only the selected columns are returned, under their original input names.
sklearn.set_config(array_out='pandas')
selector.fit_transform(X_df, y)

Unnamed: 0,col_0,col_13
0,0.909002,1.002060
1,0.471667,0.897403
2,1.198913,0.589820
3,-1.529973,1.888138
4,1.246932,-0.367135
...,...,...
95,0.446785,0.799513
96,0.618200,0.371765
97,-0.560894,-0.918402
98,-0.617816,0.941881


In [10]:
# pandas in - xarray out
sklearn.set_config(array_out='xarray')

# The DataArray repr is long, so display only the type of the result.
type(selector.fit_transform(X_df, y))

xarray.core.dataarray.DataArray

## PCA

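Note the names of the output columns: PCA produces new features, so the columns are given generated names (`pca0`, `pca1`, ...) rather than the input column names.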

In [11]:
# Keep two components so the generated output column names are easy to read.
pca = PCA(n_components=2)

In [12]:
# pandas in - pandas out
# The output columns carry generated names (pca0, pca1), not the input names.
sklearn.set_config(array_out='pandas')
pca.fit_transform(X_df, y)

Unnamed: 0,pca0,pca1
0,-1.155867,0.013525
1,-0.522198,-1.144584
2,-1.911563,1.706858
3,2.711625,-0.190182
4,-2.024084,-0.608718
...,...,...
95,-0.658886,-0.370436
96,-0.515730,0.924810
97,0.903791,-1.625577
98,1.682238,1.282564


## Imputer

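An imputer with `add_indicator=True` appends extra missing-indicator columns to the output (named `mask_<column>` below), one for each input column that contained missing values.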

In [13]:
# Small 3x3 example: one missing value in the first column (row 0) and one
# in the last column (row 1); the middle column is complete.
X1 = np.array([[np.nan, 1, 3],
               [4, 0, np.nan],
               [8, 1, 0]])
# Derive the number of column names from the array itself so the names stay
# in sync if the example data is widened.
X1_df = pd.DataFrame(X1, columns=[f'col_{i}' for i in range(X1.shape[1])])

In [14]:
# pandas in - pandas out
# add_indicator=True
# Set the output container explicitly so this cell does not depend on the
# configuration left behind by whichever cell ran before it.
sklearn.set_config(array_out='pandas')
imp_mean = KNNImputer(n_neighbors=2, add_indicator=True)
imp_mean.fit_transform(X1_df)

Unnamed: 0,col_0,col_1,col_2,mask_col_0,mask_col_2
0,6.0,1.0,3.0,1.0,0.0
1,4.0,0.0,1.5,0.0,1.0
2,8.0,1.0,0.0,0.0,0.0


In [15]:
# pandas in - pandas out
# add_indicator=False
# Set the output container explicitly so this cell does not depend on the
# configuration left behind by whichever cell ran before it.
sklearn.set_config(array_out='pandas')
imp_mean = KNNImputer(n_neighbors=2, add_indicator=False)
imp_mean.fit_transform(X1_df)

Unnamed: 0,col_0,col_1,col_2
0,6.0,1.0,3.0
1,4.0,0.0,1.5
2,8.0,1.0,0.0


## Vectorizer

In [16]:
# Text vectorizer with pandas output: the columns are the vocabulary terms.
sklearn.set_config(array_out='pandas')
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
# NOTE: this rebinds X, which previously held the make_classification features.
X = vectorizer.fit_transform(corpus)

X

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


In [17]:
# Every column of the vectorizer output uses a pandas sparse dtype.
X.dtypes

and         Sparse[float64, 0.0]
document    Sparse[float64, 0.0]
first       Sparse[float64, 0.0]
is          Sparse[float64, 0.0]
one         Sparse[float64, 0.0]
second      Sparse[float64, 0.0]
the         Sparse[float64, 0.0]
third       Sparse[float64, 0.0]
this        Sparse[float64, 0.0]
dtype: object

In [18]:
# Same vectorizer and corpus, now with the xarray output container requested.
sklearn.set_config(array_out='xarray')
vectorizer.fit_transform(corpus)

## Column Transformer

In [19]:
# Titanic dataset from OpenML; as_frame=True requests pandas containers.
# NOTE: this rebinds X and y used by the earlier sections.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Numeric branch: median imputation followed by standardization.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing entries with the constant 'missing',
# then one-hot encode with handle_unknown='ignore'.
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch (n_jobs=2).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    n_jobs=2,
)

# Full prediction pipeline: the preprocessor followed by the classifier.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [20]:
# Fit the full pipeline with pandas output enabled; the fitted estimator is
# assigned to `_` so its repr is not displayed.
# NOTE(review): in the recorded run this cell raised
# AttributeError ('_ManyDataAdapter' ... 'needs_feature_names_in'), and the
# cells after it were not executed.
sklearn.set_config(array_out='pandas')
_ = clf.fit(X, y)

AttributeError: '_ManyDataAdapter' object has no attribute 'needs_feature_names_in'

In [None]:
# get names
# clf[:-1] is the pipeline without its last step (the classifier), so this
# runs only the preprocessor and shows its output with column names.
output = clf[:-1].transform(X)
output

In [None]:
# dense
# The column dtypes are expected to be dense here — not confirmed, since
# this cell has no recorded output.
output.dtypes

In [None]:
# Get the output column names by transforming only the first row
# (X.iloc[:1]) instead of the whole dataset.
# NOTE: this rebinds `output` to the single-row result.
output = clf[:-1].transform(X.iloc[:1])
output.columns

In [None]:
# Force the preprocessor output to be sparse by setting the
# ColumnTransformer's sparse_threshold to 0.5 through the pipeline.
# set_params modifies clf in place, so the pipeline is refit afterwards.
clf.set_params(preprocessor__sparse_threshold=0.5)
_ = clf.fit(X, y)

In [None]:
# Inspect the preprocessor's fitted sparse_output_ attribute; presumably
# True after the threshold change — TODO confirm, no recorded output.
clf['preprocessor'].sparse_output_

In [None]:
# Run the refit preprocessor again (pipeline minus the classifier) and
# display the result.
output = clf[:-1].transform(X)
output

In [None]:
# sparse output
# The column dtypes are expected to be sparse here — not confirmed, since
# this cell has no recorded output.
output.dtypes

### Using xarray

You need to install the `sparse` package (`pip install sparse`) to enable sparse support with xarray output.

In [None]:
# Refit the same pipeline with the xarray output container requested; the
# fitted estimator is assigned to `_` so its repr is not displayed.
sklearn.set_config(array_out='xarray')
_ = clf.fit(X, y)

In [None]:
# Preprocessor output (pipeline minus the classifier) under the xarray setting.
output = clf[:-1].transform(X)
output