# Prepare github data
Run the cell below only when working in Google Colab: it clones the repositories, installs SampleSizeLib, and moves the datasets into place.

In [1]:
# !git clone https://github.com/andriygav/SampleSizeLib
# !pip install SampleSizeLib/src
# !git clone https://github.com/ttgadaev/SampleSizeEstimation.git
# !mv SampleSizeEstimation/datasets datasets

# Import packages

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt

In [3]:
from samplesizelib.linear.statistical import LagrangeEstimator
from samplesizelib.linear.statistical import LikelihoodRatioEstimator
from samplesizelib.linear.statistical import WaldEstimator

from samplesizelib.linear.heuristic import CrossValidationEstimator
from samplesizelib.linear.heuristic import BootstrapEstimator
from samplesizelib.linear.heuristic import LogisticRegressionEstimator

from samplesizelib.linear.bayesian import APVCEstimator
from samplesizelib.linear.bayesian import ACCEstimator
from samplesizelib.linear.bayesian import ALCEstimator
from samplesizelib.linear.bayesian import MaxUtilityEstimator
from samplesizelib.linear.bayesian import KLEstimator

from samplesizelib.linear.models import RegressionModel
from samplesizelib.linear.models import LogisticModel

In [4]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence scikit-learn ConvergenceWarning messages for the rest of the session;
# every other warning category is left at its default behaviour.
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

# Experiment config for all models

In [5]:
# Lookup table from estimator name to estimator class. The names are the same
# strings used as keys of `config`, so a configuration entry can be turned into
# an estimator instance via NAME_TO_MODEL[name](...).
NAME_TO_MODEL = {
    # statistical estimators
    'LagrangeEstimator': LagrangeEstimator,
    'LikelihoodRatioEstimator': LikelihoodRatioEstimator,
    'WaldEstimator': WaldEstimator,
    # heuristic estimators
    'CrossValidationEstimator': CrossValidationEstimator,
    'BootstrapEstimator': BootstrapEstimator,
    'LogisticRegressionEstimator': LogisticRegressionEstimator,
    # bayesian estimators
    'APVCEstimator': APVCEstimator,
    'ACCEstimator': ACCEstimator,
    'ALCEstimator': ALCEstimator,
    'MaxUtilityEstimator': MaxUtilityEstimator,
    # Previously a second, redundant 'ALCEstimator' entry stood here while the
    # imported KLEstimator was never registered.
    'KLEstimator': KLEstimator,
}

# Constructor keyword arguments for each estimator, keyed by estimator name.
# The insertion order below is the order in which estimators are later built.
config = dict(
    # Estimators configured with epsilon / alpha / beta only.
    LagrangeEstimator=dict(epsilon=0.3, alpha=0.05, beta=0.05),
    LikelihoodRatioEstimator=dict(epsilon=0.3, alpha=0.05, beta=0.05),
    WaldEstimator=dict(epsilon=0.3, alpha=0.05, beta=0.05),
    # Estimators that additionally take averaging / multiprocess / progressbar.
    BootstrapEstimator=dict(
        averaging=10,
        epsilon=0.5,
        multiprocess=True,
        progressbar=True,
    ),
    CrossValidationEstimator=dict(
        averaging=10,
        test_size=0.5,
        epsilon=0.05,
        multiprocess=True,
        progressbar=True,
    ),
    APVCEstimator=dict(
        averaging=10,
        epsilon=0.5,
        multiprocess=True,
        progressbar=True,
    ),
    ACCEstimator=dict(
        averaging=10,
        length=0.25,
        alpha=0.05,
        multiprocess=True,
        progressbar=True,
    ),
    ALCEstimator=dict(
        averaging=10,
        length=0.5,
        alpha=0.05,
        multiprocess=True,
        progressbar=True,
    ),
    MaxUtilityEstimator=dict(
        averaging=10,
        c=0.5,
        multiprocess=True,
        progressbar=True,
    ),
)

In [6]:
def draw_table(data, title=("PRECISION", "RECALL", "F-SCORE"), width=None):
    """
    Print ``data`` to stdout as a fixed-width, right-aligned text table.

    Args:
        data: mapping of the form
            {row_name_1: (value for title[0], value for title[1], ...),
             row_name_2: (value for title[0], value for title[1], ...),
             ...}
            Every value is passed through ``round(x, 2)`` before printing,
            so it has to be numeric.
        title: column headings, one per value in each row.
        width: column widths in characters. ``width[0]`` is the row-name
            column and ``width[i + 1]`` belongs to ``title[i]``. When omitted,
            every column is 20 characters wide and the number of columns
            follows ``title`` (the old fixed default only fitted three titles).

    Returns:
        None. The table is written with ``print``.
    """
    title = list(title)
    if width is None:
        # One column for the row names plus one per heading.
        width = [20] * (1 + len(title))

    border = '#' * (sum(width) + len(width) + 1)
    row_format = '|' + '|'.join("{:>" + str(w) + "}" for w in width) + '|'

    print(border)

    # The dashed rule is identical everywhere it appears, so build it once.
    separator = row_format.format("-" * width[0],
                                  *["-" * width[i + 1] for i in range(len(title))])
    print(separator)
    print(row_format.format("", *title))
    print(separator)
    for key in data:
        # Stringify first so non-string keys can be measured and truncated too.
        row_name = str(key)
        if len(row_name) > width[0]:
            # Keep the tail of an over-long name and mark the cut with '...'.
            row_name = '...' + row_name[len(row_name) - width[0] + 3:]
        print(row_format.format(row_name, *[round(x, 2) for x in data[key]]))
        print(separator)

    print(border)

# Abalone

Load dataset

In [7]:
# Read the Abalone table; the target is the column named 'answer'.
dataset = pd.read_csv('datasets/abalone.csv')
target_mask = dataset.columns == 'answer'

# Flatten the target into a 1-D vector; every other column is a feature.
y = dataset.iloc[:, target_mask].values.ravel()

# Standardise the features, then append a constant column of ones
# (presumably the intercept term for the linear models -- confirm).
X = scale(dataset.iloc[:, ~target_mask].values)
X = np.column_stack((X, np.ones(len(X))))

In [8]:
# Statistical model class handed to every estimator as its first argument.
statmodel = LogisticModel

# Build one estimator per entry in `config`, using that entry's keyword arguments.
models = {
    name: NAME_TO_MODEL[name](statmodel, **params)
    for name, params in config.items()
}

In [9]:
# result = dict()

# for i, key in enumerate(models):
#     result[key] = models[key](X, y)
#     print('{}: {}'.format(key, result[key]['m*']))

In [10]:
# with open('dumps/ManyDatasetsExperiment/abalone.pkl', 'wb') as f:
#     pickle.dump(result, f)

In [11]:
# Restore the stored estimator results instead of recomputing them (the
# commented-out cells above show how the dump was produced).
# NOTE: pickle.load can execute arbitrary code -- only load dumps you trust.
with open('dumps/ManyDatasetsExperiment/abalone.pkl', 'rb') as dump_file:
    result = pickle.load(dump_file)

In [12]:
# Keep only the 'm*' entry of each estimator's result, wrapped in a one-item
# list because draw_table expects a sequence of values per row.
table_data = {name: [outcome['m*']] for name, outcome in result.items()}

draw_table(table_data, title=['m*'], width=[30, 20])

#####################################################
|------------------------------|--------------------|
|                              |                  m*|
|------------------------------|--------------------|
|             LagrangeEstimator|                1315|
|------------------------------|--------------------|
|      LikelihoodRatioEstimator|                 329|
|------------------------------|--------------------|
|                 WaldEstimator|                1092|
|------------------------------|--------------------|
|            BootstrapEstimator|                2869|
|------------------------------|--------------------|
|      CrossValidationEstimator|                  63|
|------------------------------|--------------------|
|                 APVCEstimator|                2573|
|------------------------------|--------------------|
|                  ACCEstimator|                2869|
|------------------------------|--------------------|
|                  ALCEstima