In [None]:
import numpy as np
import pandas

from graphing.sampling_graphs import create_sample_distribution_graph_groupings, create_grouping_influence_graph, create_sample_distribution_graph
from learning.grouped_learning import Berta
from learning.grouped_linear_regression import Gustav
from model.datasets import Datasets, SynDataset
from model.synth_data_factory import SyntheticDataBuilder
from sampling.from_tests_sampler import FromTestsSamplingStrategy
from sampling.group_sampling.hamming_group_sampling_strategy import HammingGroupSamplingStrategy
from sampling.random_sampling.better_random_sampling_strategy import BetterRandomSamplingStrategy
from sampling.random_sampling.distance_sampling_strategy import DistanceSamplingStrategy
from sampling.sampler import Sampler
from testing.grouped_tester import GroupedTester
from testing.grouped_tester_flipping import GroupedTesterSigns
from testing.tester import Tester
from util.util import get_samples_with_results, get_regression_metricts, make_df_representation
from sampling.group_sampling.mutex_aware_group_sampling_strategy import IndependentFeatureGroupSamplingStrategy
from util.util import get_samples_with_results_full
import logging

# Raise the threshold of the 'numba' logger to WARNING, so its DEBUG/INFO records
# are dropped even when the root logger is later switched to DEBUG.
numba_logger = logging.getLogger('numba')
numba_logger.setLevel(logging.WARNING)
# Remove pandas' limit on the number of columns shown when a frame is displayed.
pandas.set_option("display.max_columns", None)
# Load IPython's autoreload extension. Mode 2 reloads imported modules before each
# cell execution, so edits to the project packages are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Data Generation

In [None]:
# Configure the synthetic data builder: 10 parameters, 1 influential parameter,
# empty mutex lists, and zero constraints / interactions.
data_builder = SyntheticDataBuilder(
    parameters=10, influential_parameters=1,
    mutex_groups=[], mutex_required=[],
    constrains=0, interactions=0,
)
data_builder.print_overview()

# Shift every influential feature id down by one (presumably converting 1-based
# ids into 0-based positions -- TODO confirm against SyntheticDataBuilder).
influential_params = [feature_id - 1 for feature_id in data_builder.influential_features]

# `vm` and `test_strategy` are the two inputs every later cell works with.
vm = data_builder.get_vm()
test_strategy = data_builder.get_testing_strategy()
# Alternative source: replace the two assignments above with a stored dataset, e.g.
# vm, test_strategy = Datasets().get_dataset('javagc')


In [None]:
# Sampler backed by the distance sampling strategy on `vm`.
# NOTE(review): `sampler` is not referenced again in the visible cells.
sampler = Sampler(DistanceSamplingStrategy(vm))
# Switch the root logger to DEBUG (the 'numba' logger keeps its own WARNING level).
logging.getLogger().setLevel(logging.DEBUG)
# Disabled alternative: build the test sampler from the test strategy instead.
#sampler_test = Sampler(FromTestsSamplingStrategy(vm, test_strategy))
# Separate sampler instance used only for the test data below.
sampler_test = Sampler(DistanceSamplingStrategy(vm))
tester = Tester(test_strategy)
# Collect the test data; 100 is presumably the number of samples requested
# (TODO confirm in get_samples_with_results_full). x_test / y_test are consumed
# by the learner cells further down.
x_test, y_test, samples_test = get_samples_with_results_full(sampler_test, tester, 100)
create_sample_distribution_graph(samples_test, vm)

In [None]:
# Group size shared by the sampling strategy here and by the distance check,
# the influence graph and both learner cells below.
group_size = 5
# Disabled optional argument, kept for reference:
#   load_mutex="./result/tmp/syn-1000-pre.json"
strategy = IndependentFeatureGroupSamplingStrategy(vm, group_size=group_size)

In [None]:

# Sampler / tester pair for grouped measurements, built on `strategy` and
# `test_strategy` from the cells above.
grouped_sampler = Sampler(strategy)
grouped_tester = GroupedTester(test_strategy)
logging.getLogger().setLevel(logging.DEBUG)
# Reset the sampler before drawing -- presumably clears internal state so the cell
# can be re-run; TODO confirm in Sampler.reset.
grouped_sampler.reset()

# Collect the training data; 5 is presumably the number of samples (or groups)
# requested -- TODO confirm in get_samples_with_results_full.
x, y, samples = get_samples_with_results_full(grouped_sampler, grouped_tester, 5)
# Plain-text dump of the raw training inputs, results and samples.
print(x,y,samples)
# Disabled: tabular view of the same data.
#display((make_df_representation(x, y)))
create_sample_distribution_graph_groupings(samples, vm, [])

In [None]:
from scipy.spatial.distance import pdist, squareform

# Tabulate the grouped samples and keep only the rows whose index label is a
# multiple of group_size (presumably the first row of each group -- TODO confirm
# the row layout produced by make_df_representation).
df = make_df_representation(x, y).loc[lambda frame: frame.index % group_size == 0]

# Pairwise Hamming distance between the kept rows: pdist returns the condensed
# vector, squareform expands it into a symmetric square matrix for display.
distances = pdist(df.values, metric='hamming')
dist_matrix = squareform(distances)
display(pandas.DataFrame(dist_matrix))

In [None]:
# Render the project's grouping-influence graph for the grouped training data
# (x, y) collected above, using the same group_size as the sampling strategy.
create_grouping_influence_graph(x, y, group_size)

In [None]:
from learning.grouped_learning_with import Soeren

# Fit the Soeren learner on the grouped training data.
# NOTE(review): the variable is called `berta` although it holds a Soeren instance,
# and the next cell rebinds it to a Berta instance. The name is kept unchanged so
# that no other cell is affected.
berta = Soeren(vm, group_size)
logging.getLogger().setLevel(logging.DEBUG)
berta.fit(x, y)

# Bar chart of the fitted coefficients. The title identifies the learner, because
# the Berta cell below produces an otherwise identical-looking figure.
co = pandas.DataFrame(berta.model.coef_)
ax = co.transpose().plot.bar(title="Soeren: fitted coefficients")
# ax.set_xticklabels(vm.get_features())

# Score the predictions against the test data drawn earlier in the notebook.
y_predicted = berta.predict(x_test)
metrics = get_regression_metricts(y_test, y_predicted)

# Side-by-side view of the first 50 expected / predicted values (display disabled).
comp = pandas.DataFrame()
comp['Test'] = y_test[:50]
comp['Predicted'] = y_predicted[:50]
#comp.plot.bar()
#display(comp)

# Label the metrics so this output can be told apart from the Berta cell's output.
print("Soeren:", metrics)

In [None]:
# Fit the Berta learner on the same grouped training data used for Soeren above.
berta = Berta(vm, group_size)
logging.getLogger().setLevel(logging.DEBUG)
berta.fit(x, y)

# Bar chart of the fitted coefficients. The title identifies the learner, because
# the Soeren cell above produces an otherwise identical-looking figure.
co = pandas.DataFrame(berta.model.coef_)
ax = co.transpose().plot.bar(title="Berta: fitted coefficients")
# ax.set_xticklabels(vm.get_features())

# Score the predictions against the test data drawn earlier in the notebook.
y_predicted = berta.predict(x_test)
metrics = get_regression_metricts(y_test, y_predicted)

# Side-by-side view of the first 50 expected / predicted values (display disabled).
comp = pandas.DataFrame()
comp['Test'] = y_test[:50]
comp['Predicted'] = y_predicted[:50]
#comp.plot.bar()
#display(comp)

# Label the metrics so this output can be told apart from the Soeren cell's output.
print("Berta:", metrics)