From 1bc58f412ef3a2c9c8353d65ec39dd9fdc6f1eb9 Mon Sep 17 00:00:00 2001 From: vecxoz Date: Wed, 30 Oct 2019 11:06:19 +0200 Subject: [PATCH] Fill gaps in doc (X_test can be None). Add some tests --- tests/test_func_api_regression.py | 82 ++++++++++++++++++++++++++++++- vecstack/core.py | 14 ++++-- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/tests/test_func_api_regression.py b/tests/test_func_api_regression.py index 05bf9f3..a086401 100644 --- a/tests/test_func_api_regression.py +++ b/tests/test_func_api_regression.py @@ -828,9 +828,13 @@ def test_exceptions(self): X_train, y_train, X_test, verbose=25) # Internal function model_action - assert_raises(ValueError, model_action, LinearRegression(), - X_train, y_train, X_test, sample_weight=None, + assert_raises(ValueError, model_action, LinearRegression(), + X_train, y_train, X_test, sample_weight=None, action='abc', transform=None) + + # X_test is None when mode != 'oof' + assert_raises(ValueError, stacking, [LinearRegression()], + X_train, y_train, None, mode='oof_pred_bag') #--------------------------------------------------------------------------- # Testing parameter warnings @@ -940,6 +944,80 @@ def test_small_input(self): assert_array_equal(S_train_1, S_train_3) assert_array_equal(S_test_1, S_test_3) + #--------------------------------------------------------------------------- + # Mode 'oof', X_test=None + #--------------------------------------------------------------------------- + + def test_oof_mode_with_none(self): + + model = LinearRegression() + S_train_1 = cross_val_predict(model, X_train, y = y_train, cv = n_folds, + n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1) + S_test_1 = None + + models = [LinearRegression()] + S_train_2, S_test_2 = stacking(models, X_train, y_train, None, + regression = True, n_folds = n_folds, shuffle = False, save_dir=temp_dir, + mode = 'oof', random_state = 0, verbose = 0) + + # Load OOF from file + # Normally if cleaning is performed there is 
only one .npy file at given moment + # But if we have no cleaning there may be more than one file so we take the latest + file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file + S = np.load(file_name, allow_pickle=True) + S_train_3 = S[0] + S_test_3 = S[1] + + assert_array_equal(S_train_1, S_train_2) + assert_array_equal(S_test_1, S_test_2) + + assert_array_equal(S_train_1, S_train_3) + assert_array_equal(S_test_1, S_test_3) + + #--------------------------------------------------------------------------- + # All default values (mode='oof_pred_bag') + #--------------------------------------------------------------------------- + + def test_all_defaults(self): + + # Override global n_folds=5, because default value in stacking function is 4 + n_folds=4 + + S_test_temp = np.zeros((X_test.shape[0], n_folds)) + kf = KFold(n_splits = n_folds, shuffle = False, random_state = 0) + for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)): + # Split data and target + X_tr = X_train[tr_index] + y_tr = y_train[tr_index] + X_te = X_train[te_index] + y_te = y_train[te_index] + model = LinearRegression() + _ = model.fit(X_tr, y_tr) + S_test_temp[:, fold_counter] = model.predict(X_test) + S_test_1 = np.mean(S_test_temp, axis = 1).reshape(-1, 1) + + model = LinearRegression() + S_train_1 = cross_val_predict(model, X_train, y = y_train, cv = n_folds, + n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1) + + models = [LinearRegression()] + S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test, save_dir=temp_dir) + + # Load OOF from file + # Normally if cleaning is performed there is only one .npy file at given moment + # But if we have no cleaning there may be more than one file so we take the latest + file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1] # take the latest file + S = np.load(file_name, allow_pickle=True) + S_train_3 = S[0] + S_test_3 = S[1] + + assert_array_equal(S_train_1, 
S_train_2) + assert_array_equal(S_test_1, S_test_2) + + assert_array_equal(S_train_1, S_train_3) + assert_array_equal(S_test_1, S_test_3) + + #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- diff --git a/vecstack/core.py b/vecstack/core.py index 5deaf67..8c64f2c 100644 --- a/vecstack/core.py +++ b/vecstack/core.py @@ -151,13 +151,14 @@ def stacking(models, X_train, y_train, X_test, y_train : numpy 1d array Target values - X_test : numpy array or sparse matrix of N-dim shape, e.g. 2-dim [n_test_samples, n_features] + X_test : numpy array or sparse matrix of N-dim shape, e.g. 2-dim [n_test_samples, n_features], or None Test data + Note: X_test can be set to None when mode='oof' - sample_weight : numpy array of shape [n_train_samples] + sample_weight : numpy array of shape [n_train_samples], default None Individual weights for each sample (passed to fit method of the model). - Note: sample_weight has length of full training set X_train and it would be - split automatically for each fold. + Note: sample_weight must have the same length as full training set X_train. + It will be split automatically for each fold. regression : boolean, default True If True - perform stacking for regression task, @@ -188,7 +189,7 @@ def stacking(models, X_train, y_train, X_test, mode: str, default 'oof_pred_bag' (alias 'A') Note: for detailes see terminology below - 'oof' - return only oof + 'oof' - return only oof. 
X_test can be set to None 'oof_pred' (alias 'B') - return oof and pred 'oof_pred_bag' (alias 'A') - return oof and bagged pred 'pred' - return pred only @@ -406,6 +407,9 @@ def your_metric(y_true, y_pred): # If empty list if 0 == len(models): raise ValueError('List of models is empty') + # X_test can be None only if mode='oof' + if X_test is None and mode != 'oof': + raise ValueError("X_test can be None only if mode='oof'") # Check arrays # y_train and sample_weight must be 1d ndarrays (i.e. row, not column) X_train, y_train = check_X_y(X_train,