In [19]:
import pystan
import arviz
import random
import pandas as pd
import math

In [30]:
# simulates a single respondent answering the cheating survey
def do_exprinment(is_cheater):
    """Produce one survey answer using the two-coin protocol.

    A first coin is flipped. If it comes up True the respondent's real
    status (``is_cheater``) is returned unchanged; otherwise the answer is
    the outcome of a second coin flip.

    Args:
        is_cheater: the respondent's true status.

    Returns:
        ``is_cheater`` when the first flip is True, else the result of a
        second ``flip_coin()`` call.
    """
    answers_truthfully = flip_coin()
    if not answers_truthfully:
        # Second flip decides the answer, independent of the true status.
        return flip_coin()
    return is_cheater

# simulates a single coin flip
def flip_coin():
    """Return the outcome of one simulated coin flip.

    Returns:
        True when a fresh ``random.random()`` draw is strictly greater than
        0.5, False otherwise.
    """
    return random.random() > 0.5

# simulates num_of_people survey answers, of which num_of_cheaters come from cheaters
def compute_result_to_list(num_of_cheaters, num_of_people):
    """Simulate the survey answers of a population with a fixed number of cheaters.

    ``num_of_cheaters`` distinct indices are drawn with ``random.sample`` from
    ``range(num_of_people)`` and treated as cheaters. Every person then
    answers through ``do_exprinment`` with their true status.

    Args:
        num_of_cheaters: how many respondents are cheaters.
        num_of_people: total number of respondents.

    Returns:
        A list with one entry per person, in index order: 1 when
        ``do_exprinment`` returned a truthy answer, 0 otherwise.

    Raises:
        ValueError: propagated from ``random.sample`` when ``num_of_cheaters``
            is negative or larger than ``num_of_people``.
    """
    all_population = range(num_of_people)
    # random.sample returns a list; converting it to a set makes each
    # membership test below O(1) instead of a linear scan per person.
    cheaters = set(random.sample(all_population, k=num_of_cheaters))
    return [1 if do_exprinment(person in cheaters) else 0
            for person in all_population]

# extracts the mean of theta from a fit's summary table
def theta_mean(fit):
    """Return the ``mean`` summary statistic of the ``theta`` row of a fit.

    Args:
        fit: object whose ``summary()`` returns a mapping with the keys
            ``'summary'`` (2-D table of values), ``'summary_colnames'`` and
            ``'summary_rownames'``.

    Returns:
        The value stored at row ``'theta'``, column ``'mean'``.
    """
    stats = fit.summary()
    table = pd.DataFrame(
        data=stats['summary'],
        index=stats['summary_rownames'],
        columns=stats['summary_colnames'],
    )
    mean_column = table['mean']
    return mean_column['theta']

"""" 
this function samples from <stan_model> for a variety of cheater counts out of <num_of_population>
and prints the accuracy of the inference
"""
def do_benchmark(step_size, num_of_population, stan_model):
    """Print how closely ``stan_model`` recovers the simulated cheater ratio.

    For every cheater count ``step_size, 2 * step_size, ...`` that does not
    exceed ``num_of_population``, a survey is simulated with
    ``compute_result_to_list``, the model is sampled on it, and one line is
    printed with the posterior mean of theta, the true cheater ratio, the
    population size and the absolute error.

    Args:
        step_size: increment of the number of cheaters between runs.
        num_of_population: number of simulated respondents in every run.
        stan_model: object exposing ``sampling(data=..., iter=..., chains=...,
            control=...)`` whose result is accepted by ``theta_mean``.

    Returns:
        None. Results are printed.

    Raises:
        ZeroDivisionError: if ``step_size`` is 0.
    """
    # Integer floor division: exact for ints, no float round-trip.
    num_of_iterations = num_of_population // step_size
    for iteration in range(1, num_of_iterations + 1):
        num_of_cheaters = iteration * step_size
        y = compute_result_to_list(num_of_cheaters, num_of_population)
        survey_data = {
            'N': len(y),
            'y': y
        }
        m_fit = stan_model.sampling(data=survey_data, iter=1000, chains=4,
                                    control={"adapt_delta": 0.9})
        calculated_mean = theta_mean(m_fit)
        cheaters_ratio = num_of_cheaters / num_of_population
        error = abs(calculated_mean - cheaters_ratio)
        # !s keeps the str() rendering of the values; .0% prints the error as
        # a whole-number percentage.
        print(
            "the calculated mean is {mean!s} the cheaters ratio is {ratio!s}"
            " size of population is {population!s}"
            " the error is {error:.0%}( {error!s} )".format(
                mean=calculated_mean,
                ratio=cheaters_ratio,
                population=num_of_population,
                error=error,
            )
        )
        
    

In [3]:
# Stan program for the "generative" formulation of the coin-flip survey.
# Besides theta it declares one continuous helper parameter per answer
# (coin_results[i], bounded to [-1, 1], given a normal(0, 1) prior) and
# branches on its sign: y[i] ~ bernoulli(theta) when coin_results[i] >= 0,
# otherwise y[i] ~ bernoulli(0.5).
# NOTE(review): branching on a sampled parameter means the log density changes
# abruptly at coin_results[i] == 0; presumably this is the naive counterpart
# of mixture_model, which sums the two cases instead of sampling the coin.
# Check sampler diagnostics before trusting results from this model.
generative_model = """

data {
    int<lower=0> N; // number of people answered the survey
    int<lower=0, upper = 1> y[N]; // boolean array of answers
}

parameters {
    real<lower=0, upper=1> theta; // the latent variable we want to infer
    real<lower=-1, upper=1> coin_results[N]; // helper coin results buffer
}


model {
    theta ~ beta(0.5, 0.5); // beta prior
    for (i in 1:N){
        coin_results[i] ~ normal(0 , 1);
        if (coin_results[i] >= 0){
            y[i] ~ bernoulli(theta);
        }
        else{
            y[i] ~ bernoulli(0.5);
        }
    }
    
}
"""

In [4]:
# Stan program for the mixture formulation of the coin-flip survey.
# theta is the only parameter: every answer adds
# log_mix(0.5, bernoulli_lpmf(y[i] | theta), bernoulli_lpmf(y[i] | 0.5))
# to the target, i.e. an equal-weight mix of "answered with theta" and
# "answered with a fair coin", so no per-answer coin parameter is needed.
mixture_model = """

data {
    int<lower=0> N; // number of people answered the survey
    int<lower=0, upper = 1> y[N]; // boolean array of answers
}

parameters {
    real<lower=0, upper=1> theta; // the latent variable we want to infer
}


model {
    theta ~ beta(0.5, 0.5); // beta prior
    for (i in 1:N){
        target +=
            log_mix(0.5, bernoulli_lpmf(y[i] | theta), bernoulli_lpmf(y[i] | 0.5));
    }
    
}
"""

In [36]:
# Compile the generative Stan program into a model object that can be sampled from.
g_sm = pystan.StanModel(model_code=generative_model)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_10e97b055c992532f4f5189026ef2af5 NOW.


In [5]:
# Compile the mixture Stan program into a model object that can be sampled from.
m_sm = pystan.StanModel(model_code=mixture_model)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_1faba59f95facd8bf45fceea85f1353e NOW.


In [31]:
# Benchmark the mixture model on 100 respondents, raising the cheater count in steps of 15.
do_benchmark(15, 100 , m_sm)

the calculated mean is 0.06427257276492004 the cheaters ratio is 0.15 size of population is 100 the error is 9%( 0.08572742723507995 )
the calculated mean is 0.24854318499939307 the cheaters ratio is 0.3 size of population is 100 the error is 5%( 0.05145681500060692 )
the calculated mean is 0.6195350489144328 the cheaters ratio is 0.45 size of population is 100 the error is 17%( 0.16953504891443277 )
the calculated mean is 0.5803708215983651 the cheaters ratio is 0.6 size of population is 100 the error is 2%( 0.019629178401634917 )
the calculated mean is 0.7034016103391985 the cheaters ratio is 0.75 size of population is 100 the error is 5%( 0.04659838966080154 )
the calculated mean is 0.8463246319869586 the cheaters ratio is 0.9 size of population is 100 the error is 5%( 0.0536753680130414 )


In [33]:
# Benchmark the mixture model on 1000 respondents, raising the cheater count in steps of 150.
do_benchmark(150 , 1000 , m_sm)

the calculated mean is 0.1707341798698987 the cheaters ratio is 0.15 size of population is 1000 the error is 2%( 0.0207341798698987 )
the calculated mean is 0.29589039114410626 the cheaters ratio is 0.3 size of population is 1000 the error is 0%( 0.004109608855893732 )
the calculated mean is 0.44690984178326026 the cheaters ratio is 0.45 size of population is 1000 the error is 0%( 0.0030901582167397468 )
the calculated mean is 0.5721991132623454 the cheaters ratio is 0.6 size of population is 1000 the error is 3%( 0.027800886737654573 )
the calculated mean is 0.7494127323459021 the cheaters ratio is 0.75 size of population is 1000 the error is 0%( 0.0005872676540978894 )
the calculated mean is 0.9110558985767093 the cheaters ratio is 0.9 size of population is 1000 the error is 1%( 0.0110558985767093 )


In [32]:
# Benchmark the mixture model on 5000 respondents, raising the cheater count in steps of 500.
do_benchmark(500 , 5000 , m_sm)

the calculated mean is 0.1082499711717486 the cheaters ratio is 0.1 size of population is 5000 the error is 1%( 0.008249971171748596 )
the calculated mean is 0.20363263219832692 the cheaters ratio is 0.2 size of population is 5000 the error is 0%( 0.003632632198326907 )
the calculated mean is 0.3207568066464911 the cheaters ratio is 0.3 size of population is 5000 the error is 2%( 0.0207568066464911 )
the calculated mean is 0.40503132654480384 the cheaters ratio is 0.4 size of population is 5000 the error is 1%( 0.005031326544803816 )
the calculated mean is 0.48248406047449116 the cheaters ratio is 0.5 size of population is 5000 the error is 2%( 0.01751593952550884 )
the calculated mean is 0.6142683778307314 the cheaters ratio is 0.6 size of population is 5000 the error is 1%( 0.014268377830731427 )
the calculated mean is 0.7009099151302469 the cheaters ratio is 0.7 size of population is 5000 the error is 0%( 0.0009099151302469632 )
the calculated mean is 0.8130424756263488 the cheaters



the calculated mean is 0.9894600857309536 the cheaters ratio is 1.0 size of population is 5000 the error is 1%( 0.010539914269046435 )


In [34]:
# Sample from the generative model (adapt_delta 0.9, max_treedepth 11).
# NOTE(review): exp_1 is not defined in any cell visible here, and the recorded
# output of this cell is a NameError for g_sm. Its execution count (34) is lower
# than that of the cell that compiles g_sm (36), so it was last run before g_sm
# existed. TODO: define exp_1 (presumably a {'N': ..., 'y': ...} dict like the
# one built in do_benchmark — confirm) and re-run the notebook top to bottom.
g_fit = g_sm.sampling(data=exp_1, iter=1000, chains=4, control = {"adapt_delta" : 0.9 , 'max_treedepth': 11})

NameError: name 'g_sm' is not defined