In [1]:
!pip install subgroups


Collecting subgroups
  Downloading subgroups-0.1.8-py3-none-any.whl.metadata (6.4 kB)
Collecting bitarray>=2.7.6 (from subgroups)
  Downloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Downloading subgroups-0.1.8-py3-none-any.whl (254 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.0/255.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (278 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.3/278.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitarray, subgroups
Successfully installed bitarray-3.0.0 subgroups-0.1.8


In [None]:
import subgroups.tests as st
st.run_all_tests()


In [6]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))
# Knihovna pozaduje nominalni hodnoty
# DatasetAttributeTypeError: Error in attribute 'longitude'. This algorithm only supports nominal attributes (i.e., type 'str').

housing = load_housing_data()
housing['longitude'] = housing['longitude'].astype(str)
housing['latitude'] = housing['latitude'].astype(str)
housing['ocean_proximity'] = housing['ocean_proximity'].astype(str)
housing['households'] = housing['households'].astype(str)
housing['total_rooms'] = housing['total_rooms'].astype(str)
housing['total_bedrooms'] = housing['total_bedrooms'].astype(str)
housing['population'] = housing['population'].astype(str)
housing['median_income'] = housing['median_income'].astype(str)
housing['median_house_value'] = housing['median_house_value'].astype(str)
housing['housing_median_age'] = housing['housing_median_age'].astype(str)

# Fitovani na cely dataset trvalo moc dlouho
housing_reduced = housing.iloc[0:1000]
housing_reduced.info()
# housing_reduced.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   longitude           1000 non-null   object
 1   latitude            1000 non-null   object
 2   housing_median_age  1000 non-null   object
 3   total_rooms         1000 non-null   object
 4   total_bedrooms      1000 non-null   object
 5   population          1000 non-null   object
 6   households          1000 non-null   object
 7   median_income       1000 non-null   object
 8   median_house_value  1000 non-null   object
 9   ocean_proximity     1000 non-null   object
dtypes: object(10)
memory usage: 78.2+ KB


In [7]:
from subgroups.quality_measures import WRAcc
from subgroups.quality_measures import WRAccOptimisticEstimate1
from subgroups.algorithms import VLSD
from subgroups.algorithms import SDMapStar
from subgroups.algorithms import SDMap
from subgroups.algorithms import DSLM
from subgroups.algorithms import GMSL
from subgroups.algorithms import QFinder
from subgroups.algorithms import BSD
from subgroups.utils.file_format_transformations import to_input_format_for_subgroup_list_algorithms

dataset = housing_reduced
target = ('ocean_proximity', 'NEAR BAY')

# Parametery jsou nastaveny stejne jako u ukazek na githubu Subgroups
vlsd = VLSD(quality_measure = WRAcc(), q_minimum_threshold  = -1, optimistic_estimate = WRAccOptimisticEstimate1(), oe_minimum_threshold = -1, sort_criterion_in_s1 = VLSD.SORT_CRITERION_NO_ORDER, sort_criterion_in_other_sizes = VLSD.SORT_CRITERION_NO_ORDER, vertical_lists_implementation = VLSD.VERTICAL_LISTS_WITH_BITSETS, write_results_in_file = True, file_path = "./vlsd_result.txt")
vlsd.fit(dataset, target)

In [8]:
from subgroups.algorithms import DSLM
from subgroups.utils.file_format_transformations import to_input_format_for_subgroup_list_algorithms


subgroups_correctly_read, subgroups_not_correctly_read = to_input_format_for_subgroup_list_algorithms("./vlsd_result.txt", "./vlsd_result_transformed.txt")


# Ted pouzijeme DSLM pro nalezeni top-k subgroup listy
dslm_model = DSLM(input_file_path = "./vlsd_result_transformed.txt",
                  max_sl = 3,
                  sl_max_size = 10,
                  beta = 0.0,
                  maximum_positive_overlap = 0.06,
                  maximum_negative_overlap = 0.06,
                  output_file_path = "dslm_result.txt")
dslm_model.fit(dataset, target)

In [None]:
dataset = housing_reduced
target = ('ocean_proximity', 'NEAR BAY')

# Parametery jsou nastaveny stejne jako u ukazek na githubu Subgroups

#vlsd pozaduje nominal values
vlsd = VLSD(quality_measure = WRAcc(), q_minimum_threshold  = -1, optimistic_estimate = WRAccOptimisticEstimate1(), oe_minimum_threshold = -1, sort_criterion_in_s1 = VLSD.SORT_CRITERION_NO_ORDER, sort_criterion_in_other_sizes = VLSD.SORT_CRITERION_NO_ORDER, vertical_lists_implementation = VLSD.VERTICAL_LISTS_WITH_BITSETS, write_results_in_file = True, file_path = "./vlsd_result.txt")
#sdmap pozaduje nominal values
sdmap = SDMap(quality_measure = WRAcc(), minimum_quality_measure_value = -1, minimum_n = 0, write_results_in_file = True, file_path = "./sdmap_results.txt")
# sdmapstar pozaduje pocet hledanych subgroup
sdmapstar = SDMapStar(WRAcc(), WRAccOptimisticEstimate1(), 0.01, num_subgroups=3, minimum_n = 0, write_results_in_file=True, file_path="./sdmapstar_results.txt")
bsd = BSD(min_support=0, quality_measure=WRAcc(), optimistic_estimate = WRAccOptimisticEstimate1(), num_subgroups=600, max_depth=100, write_results_in_file = True, file_path = "./bsd_results.txt" )

methods = [vlsd, sdmap, sdmapstar, bsd]
for method in methods:
    method.fit(dataset, target)

KeyboardInterrupt: 

**Subgroups found by each method:**

In [None]:
for i in methods:
    print("Subgroups selected by " + str(i) + " : ", i.selected_subgroups)
  # print("Unselected subgroups: ", sdmap.unselected_subgroups)
  # print("Visited nodes: ", sdmap.visited_nodes)

Subgroups selected by <subgroups.algorithms.subgroup_sets.vlsd.VLSD object at 0x7f166e18ee60> :  506899
Subgroups selected by <subgroups.algorithms.subgroup_sets.sdmap.SDMap object at 0x7f166e197990> :  506899
Subgroups selected by <subgroups.algorithms.subgroup_sets.sdmapstar.SDMapStar object at 0x7f1665ac1620> :  11
Subgroups selected by <subgroups.algorithms.subgroup_sets.bsd.BSD object at 0x7f1665ac14e0> :  8


**Subgroup discovery (SD)**

from VLSD paper:

>Moreover, the populations covered
by different subgroups **may overlap**

> One disadvantage of the SD technique is the huge number of subgroups that could be
generated (i.e., pattern explosion), and it is especially relevant when using input datasets
with too many attributes. For this reason, the utilization of an optimistic estimate provides
a solution of this problem when the quality measure threshold established allows not to
explore a large part of the search space.


> It is sometimes possible that two subgroups generated by a specific SD algorithm are
redundant, because they represent and explain the same portion of data from a specific
dataset.





**VLSD (Vertical List Subgroup Discovery)**

paper: https://www.mdpi.com/1999-4893/16/6/274

> Differences between this technique and others, such as clustering, pattern mining, or classification.
Clustering and pattern mining algorithms are unsupervised and do not use an output attribute or class, while SD algorithms are supervised and generate relations (called subgroups) with respect to a **target attribute.**




*// na konci clanku je benchmark VLSD vs SD-Map vs BSD vs CBSD vs CPBSD strana 16-17*


**SD-Map**

paper: https://link.springer.com/chapter/10.1007/11871637_6


>  an exhaustive search method, uses frequent pattern trees (FP-Tree)



**SD-Map***

paper: https://link.springer.com/chapter/10.1007/978-3-642-04125-9_7
//od 49 strany




>SD-Map* algorithm as a novel adaptation of the efficient SD-Map algorithm.
Efficiently adapt exhaustive subgroup discovery for continuous target concepts.
Basic principle of optimistic estimates is to safely prune parts of the search space...



**Q-finder**

paper: https://www.frontiersin.org/journals/artificial-intelligence/articles/10.3389/frai.2020.559927/full



>  It combines an
exhaustive search with a cascade of filters based on metrics assessing key credibility criteria,
including relative risk reduction assessment, adjustment on confounding factors, individual
feature’s contribution to the subgroup’s effect, interaction tests for assessing betweensubgroup treatment effect interactions and tests adjustment (multiple testing).



**BSD, Closed BSD and Closed on the positives BSD**

paper: https://cdn.aaai.org/ocs/1262/1262-7800-1-PB.pdf


> BSD is a subgroup discovery algorithm that introduces the concept of dominance relation between subgroups. This algorithm also uses a list of the
 best subgroups along with an optimistic estimation to prune the search space.





**DSLM (Diverse Subgroup Lists Miner)**

paper: https://doi.org/10.1007/978-3-031-34344-5_6 //strana 45-50


> generates subgroup
lists based on the subgroup discovery paradigm and the minimum description
length principle



**GMSL (Generation of Multiple Subgroup Lists)**

paper: https://doi.org/10.1007/978-3-031-30047-9_21 //strana 262-273


>  an algorithm that takes a set of
pre-computed subgroup candidates as input and returns a collection of diverse
top-k subgroup lists

