In [2]:
from pyarc import CBA, TransactionDB
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import collections
import random
import fim
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
import numpy as np
import time
from pyarc.algorithms import (
    top_rules,
    createCARs,
    generateCARs,
    M1Algorithm,
    M2Algorithm,
    Classifier
)
from rich.progress import (Progress, SpinnerColumn)

<h1>Function definitions</h1>

In [3]:
def k_fold_cross_validate(dataset, min_sup, min_conf, folds=10, random_state=6, zmax=10):
    """Cross-validate an M1 (CBA) classifier on one dataset with shuffled k-fold splits.

    For every fold, class association rules are mined from the training part
    with ``fim.apriori``, turned into CARs, used to build an ``M1Algorithm``
    classifier, and scored on the held-out part.

    Args:
        dataset: object exposing ``dataframe`` (a pandas DataFrame) and
            ``target`` (name of the class column, or ``""`` when none is set).
        min_sup: value forwarded to ``fim.apriori`` as ``supp``.
        min_conf: value forwarded to ``fim.apriori`` as ``conf``.
        folds: number of cross-validation splits.
        random_state: seed for the fold shuffling (defaults to the previously
            hard-coded value 6, so existing results are reproduced).
        zmax: forwarded to ``fim.apriori`` as ``zmax`` (defaults to the
            previously hard-coded value 10).

    Returns:
        tuple ``(accuracy, clf)``: ``accuracy`` is a list with one
        ``clf.test_transactions`` result per fold; ``clf`` is the classifier
        built on the LAST fold only (earlier ones are discarded).
    """
    kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    data = dataset.dataframe

    # Only forward `target` when one was configured (per the original note:
    # needed for .csv files whose target column isn't the last one).
    target_kwargs = {"target": dataset.target} if dataset.target != "" else {}

    accuracy = []
    for train_indices, test_indices in kf.split(data):
        txns_train = TransactionDB.from_DataFrame(data.iloc[train_indices], **target_kwargs)
        txns_test = TransactionDB.from_DataFrame(data.iloc[test_indices], **target_kwargs)

        rules = fim.apriori(
            txns_train.string_representation,
            supp=min_sup,
            conf=min_conf,
            mode="o",
            target="r",
            report="sc",
            appear=txns_train.appeardict,
            zmax=zmax,
        )
        cars = createCARs(rules)
        clf = M1Algorithm(cars, txns_train).build()
        accuracy.append(clf.test_transactions(txns_test))

    return accuracy, clf

In [4]:
class Dataset:
    """A named CSV dataset held as a pandas DataFrame with NaN values already filled.

    Attributes:
        name: label used for this dataset in reports and output file names.
        path: location of the CSV file that was read.
        target: name of the class column, or "" when none was given.
        dataframe: the CSV content after it has been passed through handle_nans.
        min_rule_supp / max_rule_supp: start at 0; process_data overwrites
            them with the smallest / largest rule support it observed.
    """

    def __init__(self, name, path, na_values = "", target = ""):
        # Placeholders until process_data has mined rules for this dataset.
        self.min_rule_supp = 0
        self.max_rule_supp = 0

        self.name = name
        self.path = path
        self.target = target

        # `na_values` lists extra strings that pandas should read as missing.
        raw_frame = pd.read_csv(path, na_values=na_values)
        self.dataframe = handle_nans(raw_frame)

In [5]:
#Replacing missing values with most common value in column
def handle_nans(df):
    """Fill missing values in every column with that column's most frequent value.

    The frame is modified in place (columns are reassigned) and also returned,
    so both ``handle_nans(df)`` and ``df = handle_nans(df)`` work.

    Args:
        df: pandas DataFrame, possibly containing NaN values.

    Returns:
        The same DataFrame object. Columns that consist only of missing values
        are left untouched because they have no most-frequent value.
    """
    for col in df.columns[df.isna().any()]:
        modes = df[col].mode()
        if modes.empty:
            # Entirely-NaN column: nothing sensible to impute with.
            continue
        # Assign back through the frame instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary Series and is a
        # no-op under pandas Copy-on-Write. Indexing with `col` itself (not
        # `str(col)`) keeps non-string column labels working.
        df[col] = df[col].fillna(modes.iloc[0])
    return df

In [6]:
def process_data(datasets, min_sup, min_conf):
    """Mine rules on each complete dataset and tabulate rule-set statistics.

    For every dataset the rules returned by ``fim.apriori`` are converted with
    ``createCARs`` and summarised: row/attribute counts, number of rules,
    min/avg/max rule length, average rule length divided by the attribute
    count, and min/avg/max rule support (support * 100, rounded to 2 places).

    Side effects:
        * prints the configuration and displays the resulting table;
        * stores the min/max support figures on each dataset object as
          ``min_rule_supp`` / ``max_rule_supp``.

    Args:
        datasets: iterable of objects exposing ``name``, ``dataframe`` and ``target``.
        min_sup: value forwarded to ``fim.apriori`` as ``supp``.
        min_conf: value forwarded to ``fim.apriori`` as ``conf``.

    Returns:
        pandas DataFrame with one row per dataset.
    """
    print(f"Processing datasets with following config: Min_supp: {min_sup}%, Min_conf: {min_conf}%")

    column_order = [
        "dataset_name",
        "rows_count",
        "att_count",
        "rules_count",
        "min_rule_len",
        "avg_rule_len",
        "avg_rule_len_attr_ratio",
        "max_rule_len",
        "min_rule_supp",
        "avg_rule_supp",
        "max_rule_supp",
    ]
    summary = {column: [] for column in column_order}

    for dataset in datasets:
        frame = dataset.dataframe

        # Forward `target` only when the dataset declares one.
        if dataset.target == "":
            txns = TransactionDB.from_DataFrame(frame)
        else:
            txns = TransactionDB.from_DataFrame(frame, target=dataset.target)

        rules = fim.apriori(
            txns.string_representation,
            supp=min_sup,
            conf=min_conf,
            mode="o",
            target="r",
            report="sc",
            appear=txns.appeardict,
            zmax=10,
        )
        cars = createCARs(rules)

        # Every column except one counts as an attribute.
        attribute_count = frame.shape[1] - 1
        rule_lengths = [car.rulelen for car in cars]
        rule_supports = [car.support for car in cars]

        # min() comes first so an empty rule set fails the same way as before.
        shortest_rule = min(rule_lengths)
        mean_rule_len = sum(rule_lengths) / len(cars)
        # Average rule length divided by the number of attributes.
        mean_rule_len_ratio = mean_rule_len / attribute_count
        longest_rule = max(rule_lengths)

        lowest_supp = round(min(rule_supports) * 100, 2)
        mean_supp = round((sum(rule_supports) / len(cars)) * 100, 2)
        highest_supp = round(max(rule_supports) * 100, 2)

        summary["dataset_name"].append(dataset.name)
        summary["rows_count"].append(len(frame))
        summary["att_count"].append(attribute_count)
        summary["rules_count"].append(len(rules))
        summary["min_rule_len"].append(shortest_rule)
        summary["avg_rule_len"].append(round(mean_rule_len, 2))
        summary["avg_rule_len_attr_ratio"].append(round(mean_rule_len_ratio, 2))
        summary["max_rule_len"].append(longest_rule)
        summary["min_rule_supp"].append(lowest_supp)
        summary["avg_rule_supp"].append(mean_supp)
        summary["max_rule_supp"].append(highest_supp)

        # Remember the observed support range on the dataset itself.
        dataset.min_rule_supp = lowest_supp
        dataset.max_rule_supp = highest_supp

    datasets_df = pd.DataFrame(summary)[column_order]

    display(datasets_df)
    return datasets_df

In [13]:
def test_data(datasets, min_sup, min_conf):
        print(f"Testing datasets with following config: Min_supp: {min_sup}%, Min_conf: {min_conf}%")
        dictionary = {
            "dataset_name" : [],
            "accuracy [%]": [],
            "std_deviation": [],
            "execution_time": []
        }

        for dataset in datasets:
            start_time = time.time()
            accuracy, clf = k_fold_cross_validate(dataset, min_sup, min_conf)
            arr = np.array(accuracy, dtype='float32')
            end_time = time.time()
            dictionary["dataset_name"].append(dataset.name)
            dictionary["accuracy [%]"].append(round(arr.mean()*100, 2))
            dictionary["std_deviation"].append(round(arr.std()*100, 2))
            dictionary["execution_time"].append(end_time - start_time)

        datasets_df = pd.DataFrame(dictionary)
        datasets_df = datasets_df[["dataset_name", "accuracy [%]", "std_deviation", "execution_time"]]
        return datasets_df

In [8]:
def test_all_params(dataset, conf_values=range(20, 81, 20)):
    """Cross-validate one dataset over a grid of min-support / min-confidence values.

    The support grid is every integer from ``floor(dataset.min_rule_supp)`` up
    to and including ``floor(dataset.max_rule_supp)``. Those attributes start
    at 0 on a fresh ``Dataset`` and are filled in by ``process_data``, so run
    that first; otherwise the grid is just ``[0]``.

    Side effects:
        * shows a rich progress bar while running and prints the total time;
        * writes the full grid to ``Results/<dataset.name>-results.xlsx``
          (the ``Results`` folder is created if missing, so a long sweep is
          not lost at the final write).

    Args:
        dataset: object exposing ``name``, ``min_rule_supp``, ``max_rule_supp``
            (and whatever ``k_fold_cross_validate`` needs).
        conf_values: iterable of min-confidence values to try. Defaults to the
            previously hard-coded 20, 40, 60, 80.

    Returns:
        pandas DataFrame with columns ``min_supp``, ``min_conf``,
        ``clf_rules_count`` (taken from the classifier returned by
        ``k_fold_cross_validate``), ``accuracy`` and ``std``.
    """
    import os  # local import: only needed to ensure the output folder exists

    dictionary = {
        "min_supp": [],
        "min_conf": [],
        "clf_rules_count": [],
        "accuracy": [],
        "std": []
    }
    start_time = time.time()

    conf_grid = list(conf_values)
    supp_grid = range(math.floor(dataset.min_rule_supp), math.floor(dataset.max_rule_supp) + 1)

    progress = Progress(
        SpinnerColumn(),
        *Progress.get_default_columns()
    )

    with progress as pb:
        # Total is derived from the grids instead of a hard-coded multiplier.
        p1 = pb.add_task(
            f'Testing combinations for dataset: {dataset.name}',
            total=len(conf_grid) * len(supp_grid),
        )
        step = 0
        for c in conf_grid:
            for s in supp_grid:
                accuracy, clf = k_fold_cross_validate(dataset, s, c)
                arr = np.array(accuracy, dtype='float32')
                dictionary["min_supp"].append(s)
                dictionary["min_conf"].append(c)
                dictionary["clf_rules_count"].append(clf.inspect()['lhs'].count())
                dictionary["accuracy"].append(round(arr.mean()*100, 2))
                dictionary["std"].append(round(arr.std()*100, 2))
                # Advance only after the combination has actually been evaluated.
                step += 1
                pb.update(task_id=p1, completed=step)
    end_time = time.time()
    lapsed_time = end_time - start_time

    print("Execution time: {:.2f}s".format(lapsed_time))
    datasets_df = pd.DataFrame(dictionary)
    datasets_df = datasets_df[["min_supp", "min_conf", "clf_rules_count", "accuracy", "std"]]

    os.makedirs("Results", exist_ok=True)
    datasets_df.to_excel(f"Results/{dataset.name}-results.xlsx", index=True)
    return datasets_df

In [9]:
def find_best_params(datasets):
    """Run the parameter grid search on every dataset and report the best settings.

    For each dataset ``test_all_params`` is executed and timed, the rows that
    reach the maximum accuracy are displayed and written to
    ``Results/<dataset.name>-best-rows.xlsx``, and a one-line summary is
    collected. The summary of all datasets is displayed and written to
    ``Results/all-datasets-results.xlsx``. Nothing is returned.

    Args:
        datasets: iterable of objects accepted by ``test_all_params`` that also
            expose ``name``.
    """
    import os  # local import: only needed to ensure the output folder exists

    os.makedirs("Results", exist_ok=True)

    dictionary = {
        "dataset_name": [],
        "best_accuracy": [],
        "achieved_for": [],
        "execution_time": []
    }

    for dataset in datasets:
        start_time = time.time()
        tested_df = test_all_params(dataset)
        end_time = time.time()

        best_accuracy = tested_df['accuracy'].max()
        rows_with_best_acc = tested_df.loc[tested_df['accuracy'] == best_accuracy]
        rows_with_best_acc.to_excel(f"Results/{dataset.name}-best-rows.xlsx", index=True)
        display(rows_with_best_acc)

        dictionary["dataset_name"].append(dataset.name)
        dictionary["best_accuracy"].append(best_accuracy)

        # Read the two columns directly: pulling whole rows with .iloc would
        # upcast the integer parameters to floats ("s:1.0, c:20.0;") because
        # the row also holds float columns, and it required mutating the
        # filtered slice in place with reset_index.
        entries = [
            f"s:{supp}, c:{conf};"
            for supp, conf in zip(rows_with_best_acc['min_supp'], rows_with_best_acc['min_conf'])
        ]
        # Store one string per dataset: a list inside a DataFrame cell is
        # rejected by the Excel writer in the to_excel call below.
        dictionary["achieved_for"].append(" ".join(entries))

        dictionary["execution_time"].append(end_time - start_time)

    datasets_df = pd.DataFrame(dictionary)
    datasets_df = datasets_df[["dataset_name", "best_accuracy", "achieved_for", "execution_time"]]
    datasets_df.to_excel("Results/all-datasets-results.xlsx", index=True)
    display(datasets_df)

<h1>Experimental part</h1>

In [10]:
# Benchmark datasets used throughout the notebook. `target` is passed only
# for the files where the class column has to be named explicitly, and
# `na_values` only where the CSV marks missing entries with a custom token.
datasets = [
    Dataset("cars", "datasets/mod_cars.csv"),
    Dataset("tic tac toe", "datasets/mod_tic_tac_toe.csv"),
    Dataset("balance-scale", "datasets/mod_balance_scale.csv", target="Class"),
    Dataset("breast cancer", "datasets/mod_breast_cancer.csv", na_values="?", target="Class"),
    Dataset("nursery", "datasets/mod_nursery.csv"),
]

In [11]:
# Rule-set statistics for every dataset; this call also records each dataset's
# min/max rule support, which the parameter sweep further down relies on.
datasets_resume = process_data(datasets, min_sup=1, min_conf=20)
datasets_resume.to_excel('Results/datasets_resume.xlsx', index=True)

Processing datasets with following config: Min_supp: 1%, Min_conf: 20%


Unnamed: 0,dataset_name,rows_count,att_count,rules_count,min_rule_len,avg_rule_len,avg_rule_len_attr_ratio,max_rule_len,min_rule_supp,avg_rule_supp,max_rule_supp
0,cars,1728,6,1164,1,3.67,0.61,4,1.04,2.96,70.02
1,tic tac toe,958,9,8952,1,4.62,0.51,6,1.04,2.24,65.34
2,balance-scale,625,4,250,1,2.84,0.71,3,1.12,3.85,46.08
3,breast cancer,286,9,11520,1,5.31,0.59,9,1.05,2.42,70.28
4,nursery,12960,8,3507,1,3.89,0.49,5,1.0,2.22,33.33


In [12]:
# Single cross-validated run per dataset, used to estimate execution time.
test_data(datasets, min_sup=1, min_conf=20)

Testing datasets with following config: Min_supp: 1%, Min_conf: 20%


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

<h1>Finding the best parameters per dataset</h1>

In [None]:
# Full support/confidence sweep for every dataset; results are written to Results/.
find_best_params(datasets=datasets)