Skip to content

IbisML and GridSearchCV #136

Open
Open
@koaning

Description

@koaning

I may have found a bug in how IbisML integrates with scikit-learn's `GridSearchCV`, but it could be that this is out of scope for the project.

import ibis_ml as iml
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

tfm = iml.Recipe(
    iml.Cast(iml.numeric(), "float32")
)
pipe = make_pipeline(tfm, GradientBoostingRegressor())

GridSearchCV(pipe, param_grid={}, cv=5).fit(X_train, y_train).cv_results_

This gave me this error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File <timed exec>:7

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/base.py:1473](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/base.py#line=1472), in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py:968](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py#line=967), in BaseSearchCV.fit(self, X, y, **params)
    962     results = self._format_results(
    963         all_candidate_params, n_splits, all_out, all_more_results
    964     )
    966     return results
--> 968 self._run_search(evaluate_candidates)
    970 # multimetric is determined here because in the case of a callable
    971 # self.scoring the return type is only known after calling
    972 first_test_score = all_out[0]["test_scores"]

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py:1543](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py#line=1542), in GridSearchCV._run_search(self, evaluate_candidates)
   1541 def _run_search(self, evaluate_candidates):
   1542     """Search all candidates in param_grid"""
-> 1543     evaluate_candidates(ParameterGrid(self.param_grid))

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py:914](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_search.py#line=913), in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
    906 if self.verbose > 0:
    907     print(
    908         "Fitting {0} folds for each of {1} candidates,"
    909         " totalling {2} fits".format(
    910             n_splits, n_candidates, n_candidates * n_splits
    911         )
    912     )
--> 914 out = parallel(
    915     delayed(_fit_and_score)(
    916         clone(base_estimator),
    917         X,
    918         y,
    919         train=train,
    920         test=test,
    921         parameters=parameters,
    922         split_progress=(split_idx, n_splits),
    923         candidate_progress=(cand_idx, n_candidates),
    924         **fit_and_score_kwargs,
    925     )
    926     for (cand_idx, parameters), (split_idx, (train, test)) in product(
    927         enumerate(candidate_params),
    928         enumerate(cv.split(X, y, **routed_params.splitter.split)),
    929     )
    930 )
    932 if len(out) < 1:
    933     raise ValueError(
    934         "No fits were performed. "
    935         "Was the CV iterator empty? "
    936         "Were there no candidates?"
    937     )

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py:67](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py#line=66), in Parallel.__call__(self, iterable)
     62 config = get_config()
     63 iterable_with_config = (
     64     (_with_config(delayed_func, config), args, kwargs)
     65     for delayed_func, args, kwargs in iterable
     66 )
---> 67 return super().__call__(iterable_with_config)

File [~/Development/probabl/venv/lib/python3.11/site-packages/joblib/parallel.py:1863](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/joblib/parallel.py#line=1862), in Parallel.__call__(self, iterable)
   1861     output = self._get_sequential_output(iterable)
   1862     next(output)
-> 1863     return output if self.return_generator else list(output)
   1865 # Let's create an ID that uniquely identifies the current call. If the
   1866 # call is interrupted early and that the same instance is immediately
   1867 # re-used, this id will be used to prevent workers that were
   1868 # concurrently finalizing a task from the previous call to run the
   1869 # callback.
   1870 with self._lock:

File [~/Development/probabl/venv/lib/python3.11/site-packages/joblib/parallel.py:1792](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/joblib/parallel.py#line=1791), in Parallel._get_sequential_output(self, iterable)
   1790 self.n_dispatched_batches += 1
   1791 self.n_dispatched_tasks += 1
-> 1792 res = func(*args, **kwargs)
   1793 self.n_completed_tasks += 1
   1794 self.print_progress()

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py:129](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/parallel.py#line=128), in _FuncWrapper.__call__(self, *args, **kwargs)
    127     config = {}
    128 with config_context(**config):
--> 129     return self.function(*args, **kwargs)

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:880](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py#line=879), in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, score_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    876     estimator = estimator.set_params(**clone(parameters, safe=False))
    878 start_time = time.time()
--> 880 X_train, y_train = _safe_split(estimator, X, y, train)
    881 X_test, y_test = _safe_split(estimator, X, y, test, train)
    883 result = {}

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/metaestimators.py:161](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/metaestimators.py#line=160), in _safe_split(estimator, X, y, indices, train_indices)
    158     X_subset = _safe_indexing(X, indices)
    160 if y is not None:
--> 161     y_subset = _safe_indexing(y, indices)
    162 else:
    163     y_subset = None

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py:269](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py#line=268), in _safe_indexing(X, indices, axis)
    267     return _array_indexing(X, indices, indices_dtype, axis=axis)
    268 else:
--> 269     return _list_indexing(X, indices, indices_dtype)

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py:60](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py#line=59), in _list_indexing(X, key, key_dtype)
     58     return list(compress(X, key))
     59 # key is a integer array-like of key
---> 60 return [X[idx] for idx in key]

File [~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py:60](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/sklearn/utils/_indexing.py#line=59), in <listcomp>(.0)
     58     return list(compress(X, key))
     59 # key is a integer array-like of key
---> 60 return [X[idx] for idx in key]

File [~/Development/probabl/venv/lib/python3.11/site-packages/ibis/expr/types/generic.py:1374](http://localhost:8888/lab/tree/~/Development/probabl/venv/lib/python3.11/site-packages/ibis/expr/types/generic.py#line=1373), in Column.__getitem__(self, _)
   1373 def __getitem__(self, _):
-> 1374     raise TypeError(
   1375         f"{self.__class__.__name__!r} is not subscriptable: "
   1376         "see https://ibis-project.org/tutorial/ibis-for-pandas-users/#ibis-for-pandas-users for details."
   1377     )

TypeError: 'IntegerColumn' is not subscriptable: see https://ibis-project.org/tutorial/ibis-for-pandas-users/#ibis-for-pandas-users for details.

It seems that the issue originates from the Ibis side of things, hence the ping. If this is out of scope for this project, though, I will gladly hear it.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    Status

    backlog

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions