IbisML and GridSearchCV

I may have found a bug in how IbisML integrates with scikit-learn's `GridSearchCV`, though this may be out of scope for the project.

```python
import ibis_ml as iml
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

tfm = iml.Recipe(
    iml.Cast(iml.numeric(), "float32")
)
pipe = make_pipeline(tfm, GradientBoostingRegressor())

GridSearchCV(pipe, param_grid={}, cv=5).fit(X_train, y_train).cv_results_
```

This gave me this error:

```python
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File <timed exec>:7

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/base.py:1473](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/base.py#line=1472), in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py:968](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py#line=967), in BaseSearchCV.fit(self, X, y, **params)
    962     results = self._format_results(
    963         all_candidate_params, n_splits, all_out, all_more_results
    964     )
    966     return results
--> 968 self._run_search(evaluate_candidates)
    970 # multimetric is determined here because in the case of a callable
    971 # self.scoring the return type is only known after calling
    972 first_test_score = all_out[0]["test_scores"]

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py:1543](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py#line=1542), in GridSearchCV._run_search(self, evaluate_candidates)
   1541 def _run_search(self, evaluate_candidates):
   1542     """Search all candidates in param_grid"""
-> 1543     evaluate_candidates(ParameterGrid(self.param_grid))

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py:914](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py#line=913), in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
    906 if self.verbose > 0:
    907     print(
    908         "Fitting {0} folds for each of {1} candidates,"
    909         " totalling {2} fits".format(
    910             n_splits, n_candidates, n_candidates * n_splits
    911         )
    912     )
--> 914 out = parallel(
    915     delayed(_fit_and_score)(
    916         clone(base_estimator),
    917         X,
    918         y,
    919         train=train,
    920         test=test,
    921         parameters=parameters,
    922         split_progress=(split_idx, n_splits),
    923         candidate_progress=(cand_idx, n_candidates),
    924         **fit_and_score_kwargs,
    925     )
    926     for (cand_idx, parameters), (split_idx, (train, test)) in product(
    927         enumerate(candidate_params),
    928         enumerate(cv.split(X, y, **routed_params.splitter.split)),
    929     )
    930 )
    932 if len(out) < 1:
    933     raise ValueError(
    934         "No fits were performed. "
    935         "Was the CV iterator empty? "
    936         "Were there no candidates?"
    937     )

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py:67](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py#line=66), in Parallel.__call__(self, iterable)
     62 config = get_config()
     63 iterable_with_config = (
     64     (_with_config(delayed_func, config), args, kwargs)
     65     for delayed_func, args, kwargs in iterable
     66 )
---> 67 return super().__call__(iterable_with_config)

File [~/Development/probabl/venv/lib/python3.11/site-packages/joblib/parallel.py:1863](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/joblib/parallel.py#line=1862), in Parallel.__call__(self, iterable)
   1861     output = self._get_sequential_output(iterable)
   1862     next(output)
-> 1863     return output if self.return_generator else list(output)
   1865 # Let's create an ID that uniquely identifies the current call. If the
   1866 # call is interrupted early and that the same instance is immediately
   1867 # re-used, this id will be used to prevent workers that were
   1868 # concurrently finalizing a task from the previous call to run the
   1869 # callback.
   1870 with self._lock:

File [~/Development/probabl/venv/lib/python3.11/site-packages/joblib/parallel.py:1792](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/joblib/parallel.py#line=1791), in Parallel._get_sequential_output(self, iterable)
   1790 self.n_dispatched_batches += 1
   1791 self.n_dispatched_tasks += 1
-> 1792 res = func(*args, **kwargs)
   1793 self.n_completed_tasks += 1
   1794 self.print_progress()

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py:129](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py#line=128), in _FuncWrapper.__call__(self, *args, **kwargs)
    127     config = {}
    128 with config_context(**config):
--> 129     return self.function(*args, **kwargs)

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:880](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py#line=879), in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, score_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    876     estimator = estimator.set_params(**clone(parameters, safe=False))
    878 start_time = time.time()
--> 880 X_train, y_train = _safe_split(estimator, X, y, train)
    881 X_test, y_test = _safe_split(estimator, X, y, test, train)
    883 result = {}

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/metaestimators.py:161](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/metaestimators.py#line=160), in _safe_split(estimator, X, y, indices, train_indices)
    158     X_subset = _safe_indexing(X, indices)
    160 if y is not None:
--> 161     y_subset = _safe_indexing(y, indices)
    162 else:
    163     y_subset = None

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py:269](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py#line=268), in _safe_indexing(X, indices, axis)
    267     return _array_indexing(X, indices, indices_dtype, axis=axis)
    268 else:
--> 269     return _list_indexing(X, indices, indices_dtype)

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py:60](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py#line=59), in _list_indexing(X, key, key_dtype)
     58     return list(compress(X, key))
     59 # key is a integer array-like of key
---> 60 return [X[idx] for idx in key]

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py:60](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py#line=59), in <listcomp>(.0)
     58     return list(compress(X, key))
     59 # key is a integer array-like of key
---> 60 return [X[idx] for idx in key]

File [~/Development/probabl/venv/lib/python3.11/site-packages/ibis/expr/types/generic.py:1374](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/ibis/expr/types/generic.py#line=1373), in Column.__getitem__(self, _)
   1373 def __getitem__(self, _):
-> 1374     raise TypeError(
   1375         f"{self.__class__.__name__!r} is not subscriptable: "
   1376         "see https://ibis-project.org/tutorial/ibis-for-pandas-users/#ibis-for-pandas-users for details."
   1377     )

TypeError: 'IntegerColumn' is not subscriptable: see https://ibis-project.org/tutorial/ibis-for-pandas-users/#ibis-for-pandas-users for details.
```

It seems that the issue originates from the Ibis side of things, hence the ping. If this is out of scope for this project, though, I will gladly hear it.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

IbisML and GridSearchCV #136

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

IbisML and GridSearchCV #136

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions