Merge 25de6f3 into cd7f6ef

vecxoz · Jun 23, 2019 · 416388d · 416388d
2 parents cd7f6ef + 25de6f3
commit 416388d
Show file tree

Hide file tree

Showing 7 changed files with 318 additions and 165 deletions.
diff --git a/tests/test_func_api_classification_binary.py b/tests/test_func_api_classification_binary.py
diff --git a/tests/test_func_api_classification_multiclass.py b/tests/test_func_api_classification_multiclass.py
diff --git a/tests/test_func_api_regression.py b/tests/test_func_api_regression.py
@@ -24,7 +24,7 @@
 from scipy.sparse import coo_matrix
 from sklearn.model_selection import cross_val_predict
 from sklearn.model_selection import cross_val_score
-from sklearn.model_selection import train_test_split
+# from sklearn.model_selection import train_test_split
 from sklearn.model_selection import KFold
 from sklearn.datasets import load_boston
 from sklearn.metrics import mean_absolute_error
@@ -39,7 +39,27 @@
 
 boston = load_boston()
 X, y = boston.data, boston.target
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
+# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
+
+
+# Make train/test split by hand to avoid strange errors probably related to testing suite:
+# https://github.com/scikit-learn/scikit-learn/issues/1684
+# https://github.com/scikit-learn/scikit-learn/issues/1704
+# Note: Python 2.7, 3.4 - OK, but 3.5, 3.6 - error
+
+np.random.seed(0)
+ind = np.arange(500)
+np.random.shuffle(ind)
+
+ind_train = ind[:400]
+ind_test = ind[400:]
+
+X_train = X[ind_train]
+X_test = X[ind_test]
+
+y_train = y[ind_train]
+y_test = y[ind_test]
+
 
 #-------------------------------------------------------------------------------
 #-------------------------------------------------------------------------------

diff --git a/tests/test_sklearn_api_classification_binary.py b/tests/test_sklearn_api_classification_binary.py
diff --git a/tests/test_sklearn_api_classification_multiclass.py b/tests/test_sklearn_api_classification_multiclass.py
diff --git a/tests/test_sklearn_api_regression.py b/tests/test_sklearn_api_regression.py
@@ -25,7 +25,7 @@
 from sklearn.base import RegressorMixin
 from sklearn.model_selection import cross_val_predict
 from sklearn.model_selection import cross_val_score
-from sklearn.model_selection import train_test_split
+# from sklearn.model_selection import train_test_split
 from sklearn.model_selection import KFold
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import RandomizedSearchCV
@@ -50,7 +50,27 @@
 
 boston = load_boston()
 X, y = boston.data, boston.target
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
+# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
+
+
+# Make train/test split by hand to avoid strange errors probably related to testing suite:
+# https://github.com/scikit-learn/scikit-learn/issues/1684
+# https://github.com/scikit-learn/scikit-learn/issues/1704
+# Note: Python 2.7, 3.4 - OK, but 3.5, 3.6 - error
+
+np.random.seed(0)
+ind = np.arange(500)
+np.random.shuffle(ind)
+
+ind_train = ind[:400]
+ind_test = ind[400:]
+
+X_train = X[ind_train]
+X_test = X[ind_test]
+
+y_train = y[ind_train]
+y_test = y[ind_test]
+
 
 # -----------------------------------------------------------------------------
 # Scikit-learn INcompatible estimator 

diff --git a/vecstack/coresk.py b/vecstack/coresk.py
@@ -851,6 +851,35 @@ def _estimator_action(self, estimator, X_train, y_train, X_test,
     # -------------------------------------------------------------------------
     # -------------------------------------------------------------------------
 
+    def _random_choice(self, n, size, bound=2**30):
+        """Memory efficient (but slower) np.random.choice, replace=False.
+
+        Parameters:
+        ===========
+        n : int
+            Upper value for range to choose from: [0, min(n, bound)).
+        size : int
+            Number of distinct values to choose.
+        bound : int
+            Cap on n kept for compatibility with older numpy versions.
+
+        Returns int32 1d numpy array of shape (size, ), in draw order.
+        Raises ValueError if size > min(n, bound): loop could not finish.
+        """
+        high = min(bound, n)
+        if size > high:
+            raise ValueError('size must not exceed min(n, bound)')
+        ids, seen = [], set()  # list keeps draw order, set O(1) lookups
+        while len(ids) < size:
+            rnd = np.random.randint(high)
+            if rnd not in seen:
+                seen.add(rnd)
+                ids.append(rnd)
+        return np.array(ids, dtype=np.int32)
+
+    # -------------------------------------------------------------------------
+    # -------------------------------------------------------------------------
+
     def _get_footprint(self, X, n_items=1000):
         """Selects ``n_items`` random elements from 2d numpy array or
         sparse matrix (or all elements if their number is less or equal
@@ -861,7 +890,11 @@ def _get_footprint(self, X, n_items=1000):
             r, c = X.shape
             n = r * c
             # np.random.seed(0) # for development
-            ids = np.random.choice(n, min(n_items, n), replace=False)
+
+            # OOM with large arrays (see #29)
+            # ids = np.random.choice(n, min(n_items, n), replace=False)
+
+            ids = self._random_choice(n, min(n_items, n))
 
             for i in ids:
                 row = i // c