# Investigating the Impact of Cluster Sampling on the Size of the Training, Testing, and Validation Sets

When training the MVN NGBoost model on the drifter data, we must take extra care to ensure that information about the testing and validation sets are not inadvertedly introduced into the training data. The ~400,000 drifter observations that form our data set come from ~2000 unique drifters. We expect that observations taken by the same drifter are likely to be highly correlated so we ensure that all of the observations made by any single drifter are in precisely one of the training, testing or validation sets. Ensuring observations from each drifter are not split between sets will ensure that the training data does not contain any extra information via correlation.

O'Malley et al. (2023) deal with this issue using cluster sampling. That is, spliting the data into clusters defined by their corresponding drifter ID, then randomly sampling the clusters into the training, testing, and validation sets containing 81%, 10% and 9% of the drifter IDs respectively. However, there is significant variation between the number of observations found in each of the drifter ID clusters meaning that the propertion of the overall data found in each of the sets may be significantly different than the nominal 81-10-9 split. At its most extreme, this discrepancy could result in testing and training sets that are of comporable sizes. 

In this notebook, we will investigate whether the sizes of the training, testing, and validation sets that result from 81-10-9 cluster sampling differ signficantly from the nominal 81-10-9 values.

In [1]:
# import modules and load data
import pandas as pd
import numpy as np
path_to_data = '../data/filtered_nao_drifters_with_sst_gradient.h5'
data = pd.read_hdf(path_to_data)
# add day of the year as an index (to be added to the data later)
data['day_of_year'] = data['time'].apply(lambda t : t.timetuple().tm_yday)

In [82]:
from scipy.stats import ttest_1samp

def TOST(train_test_val_proportions,train_test_val_flag,delta,popmean):
    train_proportions = train_test_val_proportions[:,train_test_val_flag]
    _, p1 = ttest_1samp(train_proportions-popmean, -delta,
                        axis=None, nan_policy='propagate', alternative='less')

    _, p2 = ttest_1samp(train_proportions-popmean, delta,
                        axis=None, nan_policy='propagate', alternative='greater')

    set_type = ['training', 'testing', 'validation']
    alpha = 0.05
    print("\n ------------- Test 1 -------------")
    if p1 < alpha:
        print(f"\nReject H0: The mean proportion of the data assigned to {set_type[train_test_val_flag]}, mu - {popmean} <= -{delta}")
    else:
        print(f"\nFail to reject H0: The mean proportion of the data assigned to {set_type[train_test_val_flag]}, mu - {popmean} > -{delta}")
    print(f"\n p-value for Test 1: {p1:.3f}")

    print("\n ------------- Test 2 -------------")
    if p2 < alpha:
        print(f"\nReject H0: The mean proportion of the data assigned to {set_type[train_test_val_flag]}, mu - {popmean} >= {delta}")
    else:
        print(f"\nFail to reject H0: The mean proportion of the data assigned to {set_type[train_test_val_flag]}, mu - {popmean} < {delta}")
    print(f"\n p-value for Test 2: {p2:.3f}")    



In [38]:
# prediction intervals (re do this later)
from scipy.stats import t

def prediction_interval(data, future_sample_size=1, alpha=0.05):
    n = len(data) 
    mean = np.mean(data)
    std = np.std(data, ddof=1) 
    
    t_crit = t.ppf(1 - alpha / 2, df=n - 1)

    se_prediction = np.sqrt(std**2 + (std**2 / future_sample_size))  # for individual or sample mean
    
    # Prediction interval
    margin_of_error = t_crit * se_prediction
    lower = round((mean - margin_of_error)*100,3)
    upper = round((mean + margin_of_error)*100,3)
    
    return lower, upper


In [2]:
# split the drifter IDs into training, testing and validation 

from sklearn.model_selection import train_test_split

def train_test_validation_split(X, Y,*,
                                test_frac = 0.10, validation_frac = 0.09, 
                                random_state = None, shuffle = True, stratify = None):
    
    X_aux, X_test, Y_aux, Y_test = train_test_split(X, Y, 
                                                        test_size=test_frac, random_state = random_state, shuffle = shuffle, stratify = stratify)
    if validation_frac == 0:
        return X_aux, X_test, Y_aux, Y_test
    else:
        X_train, X_val, Y_train, Y_val = train_test_split(X_aux, Y_aux,
                                                        test_size=validation_frac/(1 - test_frac), random_state = random_state, shuffle = shuffle, stratify = stratify)
        return X_train, X_test, X_val, Y_train, Y_test, Y_val

In [47]:
number_of_samples = len(data.index)
N = 100000 # number of repeats for hypothesis tests

count_by_id = data.groupby('id').size()
X, Y = np.array(count_by_id.index), np.array(count_by_id)

----------------

From Mike's code

In [41]:
'''MV_Prediction/experiments/dispatcher.py lines 31-39'''
def random_id_subset(ids, pc=0.1):
    unique_id = np.unique(ids)
    N_unique = len(unique_id)
    np.random.shuffle(unique_id)
    in_test = int(N_unique * pc)
    test_ids = unique_id[:in_test]
    test_mask = np.in1d(ids, test_ids)
    train_mask = np.invert(test_mask)
    return train_mask, test_mask

In [80]:
''' from MV_Prediction/experiments/dispatcher.py'''
ids = X.copy()
N_runs = 100000                                               # L97 (adaptated)
shuffle_seed = 500                                              # L80
np.random.seed(shuffle_seed)                                    # L98
splits = [random_id_subset(ids) for _ in range(N_runs)]         # L99 (adapted)
total_data_size = count_by_id.sum()

OM_train_test_val_proportions = []

for (train_mask, test_mask) in splits:                                # L101 (adapted)
    test_ids = ids[test_mask]                                         # L103 (adapted)
    train_ids = ids[train_mask] # auxillary set                       # L106 (adapted)

    train_mask, valid_mask = random_id_subset(train_ids, pc=0.1)      # L48 (adapted)
    new_train_ids = train_ids[train_mask]
    valid_ids = train_ids[valid_mask]
    
    # get the proportion of each set
    train_size = (count_by_id[new_train_ids]).sum()
    test_size = (count_by_id[test_ids]).sum()
    validation_size = (count_by_id[valid_ids]).sum()

    OM_train_test_val_proportions.append([(count_by_id[new_train_ids]).sum(), (count_by_id[test_ids]).sum(),(count_by_id[valid_ids]).sum()])


OM_train_test_val_proportions = np.array(OM_train_test_val_proportions)/total_data_size

[[0.82136512 0.09261713 0.08601775]
 [0.81650599 0.09698113 0.08651288]
 [0.81260073 0.09789373 0.08950554]
 ...
 [0.79931215 0.10324314 0.09744471]
 [0.82890381 0.09372634 0.07736986]
 [0.79873692 0.10423341 0.09702967]]


In [94]:
delta = 0.00005

print(f"Resulting in the amount of data in each training, testing, and validation set differing from nominal values by at most {int(np.floor(delta*number_of_samples))} observations")

Resulting in the amount of data in each training, testing, and validation set differing from nominal values by at most 20 observations


In [95]:
# TOST for training data

popmean = 0.81
train_test_val_flag = 0 # training set

TOST(OM_train_test_val_proportions,train_test_val_flag,delta,popmean)


 ------------- Test 1 -------------

Fail to reject H0: The mean proportion of the data assigned to training, mu - 0.81 > -5e-05

 p-value for Test 1: 1.000

 ------------- Test 2 -------------

Reject H0: The mean proportion of the data assigned to training, mu - 0.81 >= 5e-05

 p-value for Test 2: 0.000


In [96]:
# TOST for testing data

popmean = 0.10
train_test_val_flag = 1 # testing set

TOST(OM_train_test_val_proportions,train_test_val_flag,delta,popmean)


 ------------- Test 1 -------------

Reject H0: The mean proportion of the data assigned to testing, mu - 0.1 <= -5e-05

 p-value for Test 1: 0.000

 ------------- Test 2 -------------

Fail to reject H0: The mean proportion of the data assigned to testing, mu - 0.1 < 5e-05

 p-value for Test 2: 1.000


In [97]:
# TOST for validation data

popmean = 0.09
train_test_val_flag = 2 # validation set

TOST(OM_train_test_val_proportions,train_test_val_flag,delta,popmean)


 ------------- Test 1 -------------

Reject H0: The mean proportion of the data assigned to validation, mu - 0.09 <= -5e-05

 p-value for Test 1: 0.000

 ------------- Test 2 -------------

Fail to reject H0: The mean proportion of the data assigned to validation, mu - 0.09 < 5e-05

 p-value for Test 2: 1.000


In [98]:
set_type = ['train', 'test', 'validation']

for ii,name in enumerate(set_type):
    print(f"\nPrediction interval ({name} proportions): {prediction_interval(OM_train_test_val_proportions[:,ii], future_sample_size=1, alpha=0.05)}")



Prediction interval (train proportions): (79.042, 83.069)

Prediction interval (test proportions): (8.439, 11.517)

Prediction interval (validation proportions): (7.5, 10.434)


--------------------

In [99]:
# randomly split drifter IDs into 81-10-9 and calculate the proportion of data in each of the sets

train_test_val_proportions = []

for ii in range(N):
    _,_,_,Y_train,Y_test,Y_val = train_test_validation_split(X, Y,
                                                             test_frac = 0.10, validation_frac = 0.09)
    train_test_val_proportions.append([sum(Y_train),sum(Y_test),sum(Y_val)])

train_test_val_proportions = np.array(train_test_val_proportions)/number_of_samples

## Testing

With the proportions of data in training, testing, and cross validations set calculated above for `N=100000` repetitions, we will test the following hypotheses:


### Two One-Sided Student's $t$-Tests (TOST)

Since we are working within an application, if the training, testing, validation split differs very slightly from 81-10-9, the impact of this will be negligible in practice so we will allow for the mean proportion to differ from 0.81 up to $\delta = 5\times 10^{-5}$. Since the sample means of the training, testing, validation data proportion approximately follow normal distributions, respectively (CLT) and each combination of training, testing, and validation sets is independent, we can use the two-sided Student's $t$-test.

Test 1

$H_0^{(1)}$: The mean proportion of the data assigned to training, $\mu - 0.81 \leq \delta$.

$H_1^{(1)}$: The mean proportion of the data assigned to training, $\mu - 0.81 > \delta$.

Test 2

$H_0^{(2)}$: The mean proportion of the data assigned to training, $\mu - 0.81 \geq \delta$.

$H_1^{(2)}$: The mean proportion of the data assigned to training, $\mu - 0.81 < \delta$.

Significance Level: 5%

and similarly for the proportion of the data assigned to the testing and validation sets.


In [100]:
# TOST for training data

popmean = 0.81
train_test_val_flag = 0 # training set

TOST(train_test_val_proportions,train_test_val_flag,delta,popmean)


 ------------- Test 1 -------------

Reject H0: The mean proportion of the data assigned to training, mu - 0.81 <= -5e-05

 p-value for Test 1: 0.000

 ------------- Test 2 -------------

Fail to reject H0: The mean proportion of the data assigned to training, mu - 0.81 < 5e-05

 p-value for Test 2: 1.000


In [101]:
# TOST for testing data

popmean = 0.10
train_test_val_flag = 1 # testing set

TOST(train_test_val_proportions,train_test_val_flag,delta,popmean)


 ------------- Test 1 -------------

Fail to reject H0: The mean proportion of the data assigned to testing, mu - 0.1 > -5e-05

 p-value for Test 1: 1.000

 ------------- Test 2 -------------

Reject H0: The mean proportion of the data assigned to testing, mu - 0.1 >= 5e-05

 p-value for Test 2: 0.000


In [102]:
# TOST for validation data

popmean = 0.09
train_test_val_flag = 2 # validation set

TOST(train_test_val_proportions,train_test_val_flag,delta,popmean)


 ------------- Test 1 -------------

Fail to reject H0: The mean proportion of the data assigned to validation, mu - 0.09 > -5e-05

 p-value for Test 1: 1.000

 ------------- Test 2 -------------

Reject H0: The mean proportion of the data assigned to validation, mu - 0.09 >= 5e-05

 p-value for Test 2: 0.007


In [40]:
set_type = ['train', 'test', 'validation']

for ii,name in enumerate(set_type):
    print(f"\nPrediction interval ({name} proportions): {prediction_interval(train_test_val_proportions[:,ii], future_sample_size=1, alpha=0.05)}")



Prediction interval (train proportions): (78.961, 82.979)

Prediction interval (test proportions): (8.48, 11.56)

Prediction interval (validation proportions): (7.537, 10.482)


## Discussion

1. Whether the proportion of the overall data contained within the training, testing, and validation sets are practically equal to the nominal nominal 81-10-9 proportions.

The TOST analysis above shows that cluster sampling the drifter data by ID according to a training, testing, validation data split of 81-10-9, on average, leads to a training, testing, and validation datasets that form within 

* $(80.05 \leq \_ < 81.05)\%$, 
* $(9.05 \leq \_ < 10.05)\%$, 
* $(8.05 < \_ < 9.05)\%$ 

of the total drifter dataset, respectively at the 5% significance level.

2. The nominal proportion of the overall dataset for all three data set types are found in their respective 95% prediction intervals. However, individual variability between the number of observations in each drifter ID cluster results in wide prediction intervals, thus for each train-test-validation split, there may be notable variability from the nominal values.

The 95% prediction intervals (the intervals in which the proportion of the overall data each set will contain 95% of the time) are:

|Data subset| Nominal Proportion| 95% Prediction Interval|
|---|---|---|
|Training |81\%| $(79.0, 83.0)\%$|
|Testing| $10\%$ | $(8.5, 11.6)\%$|
|Validation| $9\%$| $(7.5, 10.5)\%$|

There is not very much uncertainty in the mean (TOST). Individual variability is where the potential problem lies (Prediction Intervals).

It's up to my discretion as to whether this is a problem

In [135]:
# finding a seed that gives similar values to the nominal ones

def nominal_cluster_sampling(data,*,
                             test_frac = 0.10, validation_frac = 0.09, 
                             tol = 5e-5):
    jj = 0

    number_of_samples = len(data.index)
    count_by_id = data.groupby('id').size()
    X, Y = np.array(count_by_id.index), np.array(count_by_id)
    train_frac = 1-test_frac-validation_frac

    actual_test_frac = 0.0
    actual_train_frac = 0.0
    actual_validation_frac = 0.0
    
    while (np.abs(
        np.array([actual_test_frac - test_frac
                           , actual_train_frac -train_frac, 
                           actual_validation_frac - validation_frac])) > tol).any():
        
        seed = np.random.seed(jj)
        
        _,_,_,Y_train,Y_test,Y_val = train_test_validation_split(X, Y,
                                                                    test_frac = test_frac, validation_frac = validation_frac, random_state=seed)
        actual_train_frac, actual_test_frac, actual_validation_frac = np.array([sum(Y_train),sum(Y_test),sum(Y_val)])/number_of_samples

        jj += 1
    
    print(np.abs(
        np.array([actual_test_frac - test_frac
                           , actual_train_frac -train_frac, 
                           actual_validation_frac - validation_frac])))
    return(jj-1, Y_train, Y_test, Y_val)

In [136]:
seed, Y_train, Y_test, Y_train = nominal_cluster_sampling(data)

[9.22312188e-06 3.76691715e-05 2.84460496e-05]


In [140]:
print(seed)

63012


In [137]:
_,_,_,Y_train,Y_test,Y_val = train_test_validation_split(X, Y,
                                                                    test_frac = 0.1, validation_frac = 0.09, random_state=np.random.seed(seed))
actual_train_frac, actual_test_frac, actual_validation_frac = np.array([sum(Y_train),sum(Y_test),sum(Y_val)])/number_of_samples

In [139]:
print(np.array([sum(Y_train),sum(Y_test),sum(Y_val)])/number_of_samples)

[0.81003767 0.09999078 0.08997155]
