# Prepare github data

Run the cell below (after uncommenting it) only when you are running this notebook in Google Colab.

In [1]:
# Colab setup, left commented out for local runs. Uncomment to clone the
# library and the experiment repository, install the library, and move the
# `datasets` and `dumps` folders into the working directory.
# !git clone https://github.com/andriygav/SampleSizeLib
# !pip install SampleSizeLib/src
# !git clone https://github.com/ttgadaev/SampleSizeEstimation.git
# !mv SampleSizeEstimation/datasets datasets
# !mv SampleSizeEstimation/dumps dumps

# Import packages

In [2]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt

In [3]:
from samplesizelib.linear.statistical import LagrangeEstimator
from samplesizelib.linear.statistical import LikelihoodRatioEstimator
from samplesizelib.linear.statistical import WaldEstimator

from samplesizelib.linear.heuristic import CrossValidationEstimator
from samplesizelib.linear.heuristic import BootstrapEstimator
from samplesizelib.linear.heuristic import LogisticRegressionEstimator

from samplesizelib.linear.bayesian import APVCEstimator
from samplesizelib.linear.bayesian import ACCEstimator
from samplesizelib.linear.bayesian import ALCEstimator
from samplesizelib.linear.bayesian import MaxUtilityEstimator
from samplesizelib.linear.bayesian import KLEstimator

from samplesizelib.shared.utils import Dataset

from samplesizelib.linear.models import RegressionModel
from samplesizelib.linear.models import LogisticModel

In [4]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Ignore every scikit-learn ConvergenceWarning raised from here on.
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

# Experiment config for all models

In [5]:
# Maps an estimator name (the same string used as a key of `config` and as the
# dump file name) to the estimator class imported from samplesizelib.
NAME_TO_MODEL = {
    # Statistical estimators
    'LagrangeEstimator': LagrangeEstimator,
    'LikelihoodRatioEstimator': LikelihoodRatioEstimator,
    'WaldEstimator': WaldEstimator,
    # Heuristic estimators
    'CrossValidationEstimator': CrossValidationEstimator,
    'BootstrapEstimator': BootstrapEstimator,
    'LogisticRegressionEstimator': LogisticRegressionEstimator,
    # Bayesian estimators
    'APVCEstimator': APVCEstimator,
    'ACCEstimator': ACCEstimator,
    'ALCEstimator': ALCEstimator,
    # The 'ALCEstimator' key used to be listed twice (the second entry silently
    # overwrote the first), while the imported KLEstimator was never
    # registered; the duplicate slot now holds KLEstimator.
    'KLEstimator': KLEstimator,
    'MaxUtilityEstimator': MaxUtilityEstimator,
}

# Constructor keyword arguments per estimator, keyed by the estimator name.
# Each experiment cell builds its estimator as
# NAME_TO_MODEL[key](statmodel, **config[key]).
# There is no entry for LogisticRegressionEstimator or MaxUtilityEstimator.
config = dict(
    LagrangeEstimator=dict(epsilon=0.2, alpha=0.05, beta=0.2),
    LikelihoodRatioEstimator=dict(epsilon=0.2, alpha=0.05, beta=0.2),
    WaldEstimator=dict(epsilon=0.2, alpha=0.05, beta=0.2),
    BootstrapEstimator=dict(
        averaging=100,
        epsilon=0.5,
        multiprocess=True,
        progressbar=False,
    ),
    CrossValidationEstimator=dict(
        averaging=100,
        test_size=0.5,
        epsilon=0.05,
        multiprocess=True,
        progressbar=False,
    ),
    APVCEstimator=dict(
        averaging=100,
        epsilon=0.5,
        multiprocess=True,
        progressbar=False,
    ),
    ACCEstimator=dict(
        averaging=100,
        length=0.25,
        alpha=0.05,
        multiprocess=True,
        progressbar=False,
    ),
    ALCEstimator=dict(
        averaging=100,
        length=0.5,
        alpha=0.05,
        multiprocess=True,
        progressbar=False,
    ),
)

# Available sample sizes m at which m* is estimated:
# 35, 70, then 100 to 240 in steps of 20.
sample_sizes = [35, 70, *range(100, 241, 20)]
# Number of random subsamples drawn (and estimator runs) per sample size.
averaging = 50

# Boston Housing

In this computational experiment we use the Boston Housing dataset.

## Load dataset

In [6]:
# Read the Boston Housing table: the 'answer' column becomes y, every other
# column becomes a feature column of X.
dataset = pd.read_csv('datasets/boston.csv')

# Standardise the target vector and the feature matrix with sklearn's `scale`.
y = scale(dataset.loc[:, dataset.columns == 'answer'].to_numpy().ravel())
X = scale(dataset.loc[:, dataset.columns != 'answer'].to_numpy())

# Append a constant column of ones after scaling
# (presumably the intercept term of the linear model; confirm).
X = np.column_stack((X, np.ones(len(X))))

data = Dataset(X, y)

## Make predictions for different available sample sizes of the dataset

### Prepare model

In [7]:
# Model class (the class itself, not an instance) passed as the first argument
# to every estimator constructor in the cells below.
statmodel = RegressionModel

### LagrangeEstimator

In [8]:
key = 'LagrangeEstimator'
model = NAME_TO_MODEL[key](statmodel, **config[key])

# For every available sample size m, run the estimator on `averaging` random
# subsamples of size m and record the mean and standard deviation of m*.
result = {}
list_of_m_size_mean, list_of_m_size_std = [], []
for m in tqdm(sample_sizes):
    list_of_res = [
        model(*data.sample(m=m, duplications=False))['m*']
        for _ in range(averaging)
    ]
    list_of_m_size_mean.append(np.mean(list_of_res))
    list_of_m_size_std.append(np.std(list_of_res))

result.update(
    list_of_m_size_mean=np.array(list_of_m_size_mean),
    list_of_m_size_std=np.array(list_of_m_size_std),
)

100%|██████████| 10/10 [00:42<00:00,  4.23s/it]


In [14]:
# Persist the statistics computed for `key` so they can be reloaded later.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'wb') as dump_file:
    pickle.dump(result, dump_file)

In [15]:
# Reload the stored statistics for `key`. pickle can execute arbitrary code
# while loading, so only read dump files that come from a trusted source.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'rb') as dump_file:
    result = pickle.load(dump_file)

### LikelihoodRatioEstimator

In [8]:
key = 'LikelihoodRatioEstimator'
model = NAME_TO_MODEL[key](statmodel, **config[key])

# For every available sample size m, run the estimator on `averaging` random
# subsamples of size m and record the mean and standard deviation of m*.
result = {}
list_of_m_size_mean, list_of_m_size_std = [], []
for m in tqdm(sample_sizes):
    list_of_res = [
        model(*data.sample(m=m, duplications=False))['m*']
        for _ in range(averaging)
    ]
    list_of_m_size_mean.append(np.mean(list_of_res))
    list_of_m_size_std.append(np.std(list_of_res))

result.update(
    list_of_m_size_mean=np.array(list_of_m_size_mean),
    list_of_m_size_std=np.array(list_of_m_size_std),
)

100%|██████████| 10/10 [00:42<00:00,  4.23s/it]


In [10]:
# Persist the statistics computed for `key` so they can be reloaded later.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'wb') as dump_file:
    pickle.dump(result, dump_file)

In [11]:
# Reload the stored statistics for `key`. pickle can execute arbitrary code
# while loading, so only read dump files that come from a trusted source.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'rb') as dump_file:
    result = pickle.load(dump_file)

### WaldEstimator

In [8]:
key = 'WaldEstimator'
model = NAME_TO_MODEL[key](statmodel, **config[key])

# For every available sample size m, run the estimator on `averaging` random
# subsamples of size m and record the mean and standard deviation of m*.
result = {}
list_of_m_size_mean, list_of_m_size_std = [], []
for m in tqdm(sample_sizes):
    list_of_res = [
        model(*data.sample(m=m, duplications=False))['m*']
        for _ in range(averaging)
    ]
    list_of_m_size_mean.append(np.mean(list_of_res))
    list_of_m_size_std.append(np.std(list_of_res))

result.update(
    list_of_m_size_mean=np.array(list_of_m_size_mean),
    list_of_m_size_std=np.array(list_of_m_size_std),
)

100%|██████████| 10/10 [00:42<00:00,  4.23s/it]


In [10]:
# Persist the statistics computed for `key` so they can be reloaded later.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'wb') as dump_file:
    pickle.dump(result, dump_file)

In [11]:
# Reload the stored statistics for `key`. pickle can execute arbitrary code
# while loading, so only read dump files that come from a trusted source.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'rb') as dump_file:
    result = pickle.load(dump_file)

### BootstrapEstimator

In [8]:
key = 'BootstrapEstimator'
model = NAME_TO_MODEL[key](statmodel, **config[key])

# For every available sample size m, run the estimator on `averaging` random
# subsamples of size m and record the mean and standard deviation of m*.
result = {}
list_of_m_size_mean, list_of_m_size_std = [], []
for m in tqdm(sample_sizes):
    list_of_res = [
        model(*data.sample(m=m, duplications=False))['m*']
        for _ in range(averaging)
    ]
    list_of_m_size_mean.append(np.mean(list_of_res))
    list_of_m_size_std.append(np.std(list_of_res))

result.update(
    list_of_m_size_mean=np.array(list_of_m_size_mean),
    list_of_m_size_std=np.array(list_of_m_size_std),
)

100%|██████████| 10/10 [00:42<00:00,  4.23s/it]


In [10]:
# Persist the statistics computed for `key` so they can be reloaded later.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'wb') as dump_file:
    pickle.dump(result, dump_file)

In [11]:
# Reload the stored statistics for `key`. pickle can execute arbitrary code
# while loading, so only read dump files that come from a trusted source.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'rb') as dump_file:
    result = pickle.load(dump_file)

### CrossValidationEstimator

In [8]:
key = 'CrossValidationEstimator'
model = NAME_TO_MODEL[key](statmodel, **config[key])

# For every available sample size m, run the estimator on `averaging` random
# subsamples of size m and record the mean and standard deviation of m*.
result = {}
list_of_m_size_mean, list_of_m_size_std = [], []
for m in tqdm(sample_sizes):
    list_of_res = [
        model(*data.sample(m=m, duplications=False))['m*']
        for _ in range(averaging)
    ]
    list_of_m_size_mean.append(np.mean(list_of_res))
    list_of_m_size_std.append(np.std(list_of_res))

result.update(
    list_of_m_size_mean=np.array(list_of_m_size_mean),
    list_of_m_size_std=np.array(list_of_m_size_std),
)

100%|██████████| 10/10 [00:42<00:00,  4.23s/it]


In [10]:
# Persist the statistics computed for `key` so they can be reloaded later.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'wb') as dump_file:
    pickle.dump(result, dump_file)

In [11]:
# Reload the stored statistics for `key`. pickle can execute arbitrary code
# while loading, so only read dump files that come from a trusted source.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'rb') as dump_file:
    result = pickle.load(dump_file)

### APVCEstimator

In [8]:
key = 'APVCEstimator'
model = NAME_TO_MODEL[key](statmodel, **config[key])

# For every available sample size m, run the estimator on `averaging` random
# subsamples of size m and record the mean and standard deviation of m*.
result = {}
list_of_m_size_mean, list_of_m_size_std = [], []
for m in tqdm(sample_sizes):
    list_of_res = [
        model(*data.sample(m=m, duplications=False))['m*']
        for _ in range(averaging)
    ]
    list_of_m_size_mean.append(np.mean(list_of_res))
    list_of_m_size_std.append(np.std(list_of_res))

result.update(
    list_of_m_size_mean=np.array(list_of_m_size_mean),
    list_of_m_size_std=np.array(list_of_m_size_std),
)

100%|██████████| 10/10 [00:42<00:00,  4.23s/it]


In [10]:
# Persist the statistics computed for `key` so they can be reloaded later.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'wb') as dump_file:
    pickle.dump(result, dump_file)

In [11]:
# Reload the stored statistics for `key`. pickle can execute arbitrary code
# while loading, so only read dump files that come from a trusted source.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'rb') as dump_file:
    result = pickle.load(dump_file)

### ACCEstimator

In [8]:
key = 'ACCEstimator'
model = NAME_TO_MODEL[key](statmodel, **config[key])

# For every available sample size m, run the estimator on `averaging` random
# subsamples of size m and record the mean and standard deviation of m*.
result = {}
list_of_m_size_mean, list_of_m_size_std = [], []
for m in tqdm(sample_sizes):
    list_of_res = [
        model(*data.sample(m=m, duplications=False))['m*']
        for _ in range(averaging)
    ]
    list_of_m_size_mean.append(np.mean(list_of_res))
    list_of_m_size_std.append(np.std(list_of_res))

result.update(
    list_of_m_size_mean=np.array(list_of_m_size_mean),
    list_of_m_size_std=np.array(list_of_m_size_std),
)

100%|██████████| 10/10 [00:42<00:00,  4.23s/it]


In [10]:
# Persist the statistics computed for `key` so they can be reloaded later.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'wb') as dump_file:
    pickle.dump(result, dump_file)

In [11]:
# Reload the stored statistics for `key`. pickle can execute arbitrary code
# while loading, so only read dump files that come from a trusted source.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'rb') as dump_file:
    result = pickle.load(dump_file)

### ALCEstimator

In [8]:
key = 'ALCEstimator'
model = NAME_TO_MODEL[key](statmodel, **config[key])

# For every available sample size m, run the estimator on `averaging` random
# subsamples of size m and record the mean and standard deviation of m*.
result = {}
list_of_m_size_mean, list_of_m_size_std = [], []
for m in tqdm(sample_sizes):
    list_of_res = [
        model(*data.sample(m=m, duplications=False))['m*']
        for _ in range(averaging)
    ]
    list_of_m_size_mean.append(np.mean(list_of_res))
    list_of_m_size_std.append(np.std(list_of_res))

result.update(
    list_of_m_size_mean=np.array(list_of_m_size_mean),
    list_of_m_size_std=np.array(list_of_m_size_std),
)

100%|██████████| 10/10 [00:42<00:00,  4.23s/it]


In [10]:
# Persist the statistics computed for `key` so they can be reloaded later.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'wb') as dump_file:
    pickle.dump(result, dump_file)

In [11]:
# Reload the stored statistics for `key`. pickle can execute arbitrary code
# while loading, so only read dump files that come from a trusted source.
with open(f'dumps/DependingOnAvailableSampleSizeExperiment/{key}.pkl', 'rb') as dump_file:
    result = pickle.load(dump_file)