# Prepare github data
Run the cell below (after uncommenting it) only when you are running this notebook in Google Colab.

In [1]:
# !git clone https://github.com/andriygav/SampleSizeLib
# !pip install SampleSizeLib/src
# !git clone https://github.com/ttgadaev/SampleSizeEstimation.git
# !mv SampleSizeEstimation/datasets datasets
# !mv SampleSizeEstimation/dumps dumps

# Import packages

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt

In [3]:
from samplesizelib.linear.statistical import LagrangeEstimator
from samplesizelib.linear.statistical import LikelihoodRatioEstimator
from samplesizelib.linear.statistical import WaldEstimator

from samplesizelib.linear.heuristic import CrossValidationEstimator
from samplesizelib.linear.heuristic import BootstrapEstimator
from samplesizelib.linear.heuristic import LogisticRegressionEstimator

from samplesizelib.linear.bayesian import APVCEstimator
from samplesizelib.linear.bayesian import ACCEstimator
from samplesizelib.linear.bayesian import ALCEstimator
from samplesizelib.linear.bayesian import MaxUtilityEstimator
from samplesizelib.linear.bayesian import KLEstimator

from samplesizelib.linear.models import RegressionModel
from samplesizelib.linear.models import LogisticModel

In [4]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Ignore scikit-learn's ConvergenceWarning from here on (presumably to keep
# the outputs of the estimator cells readable).
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

# Experiment config for all models

In [5]:
# Registry mapping an estimator name (the key used in `config`) to its class.
# Every estimator imported from samplesizelib is listed exactly once; the
# previous version listed 'ALCEstimator' twice and omitted KLEstimator.
NAME_TO_MODEL = {
    # statistical estimators
    'LagrangeEstimator': LagrangeEstimator,
    'LikelihoodRatioEstimator': LikelihoodRatioEstimator,
    'WaldEstimator': WaldEstimator,
    # heuristic estimators
    'CrossValidationEstimator': CrossValidationEstimator,
    'BootstrapEstimator': BootstrapEstimator,
    'LogisticRegressionEstimator': LogisticRegressionEstimator,
    # bayesian estimators
    'APVCEstimator': APVCEstimator,
    'ACCEstimator': ACCEstimator,
    'ALCEstimator': ALCEstimator,
    'MaxUtilityEstimator': MaxUtilityEstimator,
    'KLEstimator': KLEstimator,
}

# Keyword arguments handed to each estimator's constructor, keyed by the
# estimator name used in NAME_TO_MODEL. The insertion order is the order in
# which the estimators are later instantiated.
config = dict(
    LagrangeEstimator=dict(epsilon=0.2, alpha=0.05, beta=0.2),
    LikelihoodRatioEstimator=dict(epsilon=0.2, alpha=0.05, beta=0.2),
    WaldEstimator=dict(epsilon=0.2, alpha=0.05, beta=0.2),
    BootstrapEstimator=dict(
        averaging=10,
        epsilon=0.5,
        multiprocess=True,
        progressbar=True,
    ),
    CrossValidationEstimator=dict(
        averaging=100,
        test_size=0.5,
        epsilon=0.05,
        multiprocess=True,
        progressbar=True,
    ),
    APVCEstimator=dict(
        averaging=100,
        epsilon=0.5,
        multiprocess=True,
        progressbar=True,
    ),
    ACCEstimator=dict(
        averaging=100,
        length=0.25,
        alpha=0.05,
        multiprocess=True,
        progressbar=True,
    ),
    ALCEstimator=dict(
        averaging=100,
        length=0.5,
        alpha=0.05,
        multiprocess=True,
        progressbar=True,
    ),
    MaxUtilityEstimator=dict(
        averaging=10,
        c=0.005,
        multiprocess=True,
        progressbar=True,
    ),
)

In [6]:
def draw_table(data, title=("PRECISION", "RECALL", "F-SCORE"), width=None):
    """
    Print `data` to stdout as a fixed-width, right-aligned text table.

    data is a dict with format
        {row_name_1: (title[0], title[1], ...),
         row_name_2: (title[0], title[1], ...),
         ...}
    Each value is a sequence of numbers, one per entry of `title`; every
    number is rounded to 2 decimal places before it is printed.

    title is the sequence of column headers.

    width is the sequence of column widths: width[0] belongs to the row-name
    column and width[i + 1] to title[i], so it must hold len(title) + 1
    entries. When omitted, every column is 20 characters wide.

    Row names longer than width[0] are shortened to '...' followed by the
    tail of the name, so they still fit into the first column.
    """
    if width is None:
        # Size the default to the actual number of columns. The former fixed
        # default of four widths failed for any `title` that did not have
        # exactly three entries.
        width = [20] * (1 + len(title))

    border = '#' * (sum(width) + len(width) + 1)
    print(border)

    row_format = '|' + '|'.join("{:>" + str(w) + "}" for w in width) + '|'
    # The dashed separator is identical for every row, so build it once.
    separator = row_format.format(
        "-" * width[0], *["-" * width[i + 1] for i, _ in enumerate(title)]
    )

    print(separator)
    print(row_format.format("", *title))
    print(separator)
    for key in data:
        if len(key) > width[0]:
            # Keep the last width[0] - 3 characters and mark the cut with '...'.
            row_name = '...' + key[len(key) - width[0] + 3:]
        else:
            row_name = key
        print(row_format.format(row_name, *[round(x, 2) for x in data[key]]))
        print(separator)

    print(border)

# Boston Housing

## Load dataset

In [7]:
# Read the Boston table and split it into the 'answer' column (target)
# and all remaining columns (features).
dataset = pd.read_csv('datasets/boston.csv')
is_answer = dataset.columns == 'answer'

y = dataset.loc[:, is_answer].to_numpy().reshape(-1)
X = dataset.loc[:, ~is_answer].to_numpy()

# Apply sklearn's `scale` to target and features, then append a column of
# ones to the feature matrix.
y = scale(y)
X = np.hstack((scale(X), np.ones([len(X), 1])))

## Prepare model

In [8]:
statmodel = RegressionModel

# Instantiate every configured estimator around the chosen statistical model,
# forwarding its `config` entry as keyword arguments.
models = {
    name: NAME_TO_MODEL[name](statmodel, **params)
    for name, params in config.items()
}

## Using models for prediction

In [9]:
# result = dict()

# for i, key in enumerate(models):
#     result[key] = models[key](X, y)
#     print('{}: {}'.format(key, result[key]['m*']))

## Using dumps

In [10]:
# with open('dumps/ManyDatasetsExperiment/boston.pkl', 'wb') as f:
#     pickle.dump(result, f)

In [11]:
# Read the stored `result` dict from the dump file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# dump files that come from a source you trust.
with open('dumps/ManyDatasetsExperiment/boston.pkl', 'rb') as f:
    result = pickle.load(f)

## Result for Boston dataset

In [12]:
# One table row per key of `result`, holding that entry's 'm*' value.
table_data = {name: [entry['m*']] for name, entry in result.items()}

draw_table(table_data, title=['m*'], width=[30, 20])

#####################################################
|------------------------------|--------------------|
|                              |                  m*|
|------------------------------|--------------------|
|             LagrangeEstimator|                  18|
|------------------------------|--------------------|
|      LikelihoodRatioEstimator|                  17|
|------------------------------|--------------------|
|                 WaldEstimator|                  66|
|------------------------------|--------------------|
|            BootstrapEstimator|                 178|
|------------------------------|--------------------|
|      CrossValidationEstimator|                 113|
|------------------------------|--------------------|
|                 APVCEstimator|                  98|
|------------------------------|--------------------|
|                  ACCEstimator|                 228|
|------------------------------|--------------------|
|                  ALCEstima

# Diabetes

## Load dataset

In [13]:
# Read the Diabetes table and split it into the 'answer' column (target)
# and all remaining columns (features).
dataset = pd.read_csv('datasets/diabetes.csv')
is_answer = dataset.columns == 'answer'

y = dataset.loc[:, is_answer].to_numpy().reshape(-1)
X = dataset.loc[:, ~is_answer].to_numpy()

# Apply sklearn's `scale` to target and features, then append a column of
# ones to the feature matrix.
y = scale(y)
X = np.hstack((scale(X), np.ones([len(X), 1])))

## Prepare model

In [14]:
statmodel = RegressionModel

# Instantiate every configured estimator around the chosen statistical model,
# forwarding its `config` entry as keyword arguments.
models = {
    name: NAME_TO_MODEL[name](statmodel, **params)
    for name, params in config.items()
}

## Using models for prediction

In [15]:
# result = dict()

# for i, key in enumerate(models):
#     result[key] = models[key](X, y)
#     print('{}: {}'.format(key, result[key]['m*']))

## Using dumps

In [16]:
# with open('dumps/ManyDatasetsExperiment/diabetes.pkl', 'wb') as f:
#     pickle.dump(result, f)

In [17]:
# Read the stored `result` dict from the dump file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# dump files that come from a source you trust.
with open('dumps/ManyDatasetsExperiment/diabetes.pkl', 'rb') as f:
    result = pickle.load(f)

## Result for Diabetes dataset

In [18]:
# One table row per key of `result`, holding that entry's 'm*' value.
table_data = {name: [entry['m*']] for name, entry in result.items()}

draw_table(table_data, title=['m*'], width=[30, 20])

#####################################################
|------------------------------|--------------------|
|                              |                  m*|
|------------------------------|--------------------|
|             LagrangeEstimator|                  25|
|------------------------------|--------------------|
|      LikelihoodRatioEstimator|                  25|
|------------------------------|--------------------|
|                 WaldEstimator|                  51|
|------------------------------|--------------------|
|            BootstrapEstimator|                 441|
|------------------------------|--------------------|
|      CrossValidationEstimator|                 117|
|------------------------------|--------------------|
|                 APVCEstimator|                 167|
|------------------------------|--------------------|
|                  ACCEstimator|                 441|
|------------------------------|--------------------|
|                  ALCEstima

# Forest Fires

## Load dataset

In [19]:
# Read the Forest Fires table and split it into the 'answer' column (target)
# and all remaining columns (features).
dataset = pd.read_csv('datasets/forestfires.csv')
is_answer = dataset.columns == 'answer'

y = dataset.loc[:, is_answer].to_numpy().reshape(-1)
X = dataset.loc[:, ~is_answer].to_numpy()

# Apply sklearn's `scale` to target and features, then append a column of
# ones to the feature matrix.
y = scale(y)
X = np.hstack((scale(X), np.ones([len(X), 1])))

## Prepare model

In [20]:
statmodel = RegressionModel

# Instantiate every configured estimator around the chosen statistical model,
# forwarding its `config` entry as keyword arguments.
models = {
    name: NAME_TO_MODEL[name](statmodel, **params)
    for name, params in config.items()
}

## Using models for prediction

In [21]:
# result = dict()

# for i, key in enumerate(models):
#     result[key] = models[key](X, y)
#     print('{}: {}'.format(key, result[key]['m*']))

## Using dumps

In [22]:
# with open('dumps/ManyDatasetsExperiment/forestfires.pkl', 'wb') as f:
#     pickle.dump(result, f)

In [23]:
# Read the stored `result` dict from the dump file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# dump files that come from a source you trust.
with open('dumps/ManyDatasetsExperiment/forestfires.pkl', 'rb') as f:
    result = pickle.load(f)

## Result for Forest Fires dataset

In [24]:
# One table row per key of `result`, holding that entry's 'm*' value.
table_data = {name: [entry['m*']] for name, entry in result.items()}

draw_table(table_data, title=['m*'], width=[30, 20])

#####################################################
|------------------------------|--------------------|
|                              |                  m*|
|------------------------------|--------------------|
|             LagrangeEstimator|                  44|
|------------------------------|--------------------|
|      LikelihoodRatioEstimator|                  43|
|------------------------------|--------------------|
|                 WaldEstimator|                  46|
|------------------------------|--------------------|
|            BootstrapEstimator|                 171|
|------------------------------|--------------------|
|      CrossValidationEstimator|                  86|
|------------------------------|--------------------|
|                 APVCEstimator|                 351|
|------------------------------|--------------------|
|                  ACCEstimator|                 346|
|------------------------------|--------------------|
|                  ALCEstima

# Servo

## Load dataset

In [25]:
# Read the Servo table and split it into the 'answer' column (target)
# and all remaining columns (features).
dataset = pd.read_csv('datasets/servo.csv')
is_answer = dataset.columns == 'answer'

y = dataset.loc[:, is_answer].to_numpy().reshape(-1)
X = dataset.loc[:, ~is_answer].to_numpy()

# Apply sklearn's `scale` to target and features, then append a column of
# ones to the feature matrix.
y = scale(y)
X = np.hstack((scale(X), np.ones([len(X), 1])))

## Prepare model

In [26]:
statmodel = RegressionModel

# Instantiate every configured estimator around the chosen statistical model,
# forwarding its `config` entry as keyword arguments.
models = {
    name: NAME_TO_MODEL[name](statmodel, **params)
    for name, params in config.items()
}

## Using models for prediction

In [27]:
# result = dict()

# for i, key in enumerate(models):
#     result[key] = models[key](X, y)
#     print('{}: {}'.format(key, result[key]['m*']))

## Using dumps

In [28]:
# with open('dumps/ManyDatasetsExperiment/servo.pkl', 'wb') as f:
#     pickle.dump(result, f)

In [29]:
# Read the stored `result` dict from the dump file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# dump files that come from a source you trust.
with open('dumps/ManyDatasetsExperiment/servo.pkl', 'rb') as f:
    result = pickle.load(f)

## Result for Servo dataset

In [30]:
# One table row per key of `result`, holding that entry's 'm*' value.
table_data = {name: [entry['m*']] for name, entry in result.items()}

draw_table(table_data, title=['m*'], width=[30, 20])

#####################################################
|------------------------------|--------------------|
|                              |                  m*|
|------------------------------|--------------------|
|             LagrangeEstimator|                  38|
|------------------------------|--------------------|
|      LikelihoodRatioEstimator|                  18|
|------------------------------|--------------------|
|                 WaldEstimator|                  76|
|------------------------------|--------------------|
|            BootstrapEstimator|                 120|
|------------------------------|--------------------|
|      CrossValidationEstimator|                  60|
|------------------------------|--------------------|
|                 APVCEstimator|                  20|
|------------------------------|--------------------|
|                  ACCEstimator|                  65|
|------------------------------|--------------------|
|                  ALCEstima

# NBA

## Load dataset

In [31]:
# Read the NBA table and split it into the 'answer' column (target)
# and all remaining columns (features).
dataset = pd.read_csv('datasets/nba.csv')
is_answer = dataset.columns == 'answer'

y = dataset.loc[:, is_answer].to_numpy().reshape(-1)
X = dataset.loc[:, ~is_answer].to_numpy()

# Only the features go through sklearn's `scale`; y is kept as read from the
# file. A column of ones is then appended to the feature matrix.
X = np.hstack((scale(X), np.ones([len(X), 1])))

## Prepare model

In [32]:
statmodel = LogisticModel

# Instantiate every configured estimator around the chosen statistical model,
# forwarding its `config` entry as keyword arguments.
models = {
    name: NAME_TO_MODEL[name](statmodel, **params)
    for name, params in config.items()
}

## Using models for prediction

In [33]:
# result = dict()

# for i, key in enumerate(models):
#     result[key] = models[key](X, y)
#     print('{}: {}'.format(key, result[key]['m*']))

## Using dumps

In [34]:
# with open('dumps/ManyDatasetsExperiment/nba.pkl', 'wb') as f:
#     pickle.dump(result, f)

In [35]:
# Read the stored `result` dict from the dump file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# dump files that come from a source you trust.
with open('dumps/ManyDatasetsExperiment/nba.pkl', 'rb') as f:
    result = pickle.load(f)

## Result for NBA dataset

In [36]:
# One table row per key of `result`, holding that entry's 'm*' value.
table_data = {name: [entry['m*']] for name, entry in result.items()}

draw_table(table_data, title=['m*'], width=[30, 20])

#####################################################
|------------------------------|--------------------|
|                              |                  m*|
|------------------------------|--------------------|
|             LagrangeEstimator|                 218|
|------------------------------|--------------------|
|      LikelihoodRatioEstimator|                 110|
|------------------------------|--------------------|
|                 WaldEstimator|                 200|
|------------------------------|--------------------|
|            BootstrapEstimator|                1328|
|------------------------------|--------------------|
|      CrossValidationEstimator|                 405|
|------------------------------|--------------------|
|                 APVCEstimator|                1328|
|------------------------------|--------------------|
|                  ACCEstimator|                1328|
|------------------------------|--------------------|
|                  ALCEstima