In [6]:
from joblib import Parallel, delayed
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import KFold

from CBA import CBA
from CBAWL import CBAWL
from DataHandler import DataHandler
from RWCBA import RWCBA
from APR import APR

dataset_id = 19
min_sup = 0.1
min_conf = 0.7
min_lift = 1

# CBAWL


In [10]:
def perform_cross_validation(train_index, test_index, X, y, cbawl, average='macro'):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    X_train_te = cbawl.dataHandler.oneHotEncoding(X_train)
    frequent_itemsets = cbawl.ruleGenerator.getFrequentItemsets(X_train_te)
    strong_rules, weak_rules, default_class = cbawl.model(frequent_itemsets, X_train)
    y_pred = cbawl.predict(strong_rules, weak_rules, default_class, X_test)

    precision = precision_score(y_test, y_pred, average=average, zero_division=1)
    recall = recall_score(y_test, y_pred, average=average, zero_division=1)
    f1 = f1_score(y_test, y_pred, average=average, zero_division=1)
    accuracy = accuracy_score(y_test, y_pred)

    return precision, recall, f1, accuracy


cbawl = CBAWL(dataset_id, min_sup, min_conf, min_lift)
data = cbawl.dataHandler.loadData()
features = cbawl.dataHandler.oneHotEncoding(data)
features_importance = cbawl.dataHandler.getFeaturesImportance(data)
features = cbawl.dataHandler.delLowImportanceFeatures(data, features_importance)
X = features
y = data.iloc[:, -1]

kf = KFold(n_splits=10, shuffle=True, random_state=42)


macro_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation)(train_index, test_index, X, y, cbawl, 'macro')
    for train_index, test_index in kf.split(X)
)


# 將每個指標分開取出並計算平均值
macro_precision = [result[0] for result in macro_results]
macro_recall = [result[1] for result in macro_results]
macro_f1 = [result[2] for result in macro_results]
macro_accuracy = [result[3] for result in macro_results]
print('macro')
print('Precision:', sum(macro_precision)/len(macro_precision))
print('Recall:', sum(macro_recall)/len(macro_recall))
print('F1:', sum(macro_f1)/len(macro_f1))
print('Accuracy:', sum(macro_accuracy)/len(macro_accuracy))

micro_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation)(train_index, test_index, X, y, cbawl, 'micro')
    for train_index, test_index in kf.split(X)
)

micro_precision = [result[0] for result in micro_results]
micro_recall = [result[1] for result in micro_results]
micro_f1 = [result[2] for result in micro_results]
micro_accuracy = [result[3] for result in micro_results]
print('micro')
print('Precision:', sum(micro_precision)/len(micro_precision))
print('Recall:', sum(micro_recall)/len(micro_recall))
print('F1:', sum(micro_f1)/len(micro_f1))
print('Accuracy:', sum(micro_accuracy)/len(micro_accuracy))

weighted_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation)(train_index, test_index, X, y, cbawl, 'weighted')
    for train_index, test_index in kf.split(X)
)

weighted_precision = [result[0] for result in weighted_results]
weighted_recall = [result[1] for result in weighted_results]
weighted_f1 = [result[2] for result in weighted_results]
weighted_accuracy = [result[3] for result in weighted_results]
print('weighted')
print('Precision:', sum(weighted_precision)/len(weighted_precision))
print('Recall:', sum(weighted_recall)/len(weighted_recall))
print('F1:', sum(weighted_f1)/len(weighted_f1))
print('Accuracy:', sum(weighted_accuracy)/len(weighted_accuracy))

macro
Precision: 0.80587439094434
Recall: 0.5448368931560523
F1: 0.5166779721188981
Accuracy: 0.8402473450732625
micro
Precision: 0.8402473450732625
Recall: 0.8402473450732625
F1: 0.8402473450732625
Accuracy: 0.8402473450732625
weighted
Precision: 0.8756574704247999
Recall: 0.8402473450732625
F1: 0.813385800293527
Accuracy: 0.8402473450732625


# CBA


In [9]:
def perform_cross_validation_cba(train_index, test_index, average='macro'):
    train = data.iloc[train_index].reset_index(drop=True)
    test = data.iloc[test_index].reset_index(drop=True)
    cba = CBA(train, min_sup, min_conf)
    ruleitemset = cba.apriori()
    sorted_ruleitemset = sorted(ruleitemset, key=lambda x: (
        x['confidence'], x['support'], len(x['condition'])), reverse=True)
    rules, default_class = cba.prune(sorted_ruleitemset)
    predict_y = cba.predict(test, rules, default_class)
    test_y = test['class']
    precision = precision_score(test_y, predict_y, average=average, zero_division=1)
    recall = recall_score(test_y, predict_y, average=average, zero_division=1)
    f1 = f1_score(test_y, predict_y, average=average, zero_division=1)
    accuracy = accuracy_score(test_y, predict_y)
    return precision, recall, f1, accuracy, rules, default_class


dataHandler = DataHandler(dataset_id)

data = dataHandler.loadData()

X = data.drop('class', axis=1)
y = data['class']

kf = KFold(n_splits=10, shuffle=True, random_state=42)

macro_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_cba)(train_index, test_index, average='macro')
    for train_index, test_index in kf.split(X)
)

rules = [result[4] for result in macro_results]
default_classes = [result[5] for result in macro_results]

print('rules')
for rule in rules[0]:
    print(rule)

# micro_results = Parallel(n_jobs=-1)(
#     delayed(perform_cross_validation_cba)(train_index, test_index, average='micro')
#     for train_index, test_index in kf.split(X)
# )

# weighted_results = Parallel(n_jobs=-1)(
#     delayed(perform_cross_validation_cba)(train_index, test_index, average='weighted')
#     for train_index, test_index in kf.split(X)
# )

# macro_precision = [result[0] for result in macro_results]
# macro_recall = [result[1] for result in macro_results]
# macro_f1 = [result[2] for result in macro_results]
# macro_accuracy = [result[3] for result in macro_results]

# micro_precision = [result[0] for result in micro_results]
# micro_recall = [result[1] for result in micro_results]
# micro_f1 = [result[2] for result in micro_results]
# micro_accuracy = [result[3] for result in micro_results]

# weighted_precision = [result[0] for result in weighted_results]
# weighted_recall = [result[1] for result in weighted_results]
# weighted_f1 = [result[2] for result in weighted_results]
# weighted_accuracy = [result[3] for result in weighted_results]

# print('macro')
# print('Precision:', sum(macro_precision)/len(macro_precision))
# print('Recall:', sum(macro_recall)/len(macro_recall))
# print('F1:', sum(macro_f1)/len(macro_f1))
# print('Accuracy:', sum(macro_accuracy)/len(macro_accuracy))

# print('micro')
# print('Precision:', sum(micro_precision)/len(micro_precision))
# print('Recall:', sum(micro_recall)/len(micro_recall))
# print('F1:', sum(micro_f1)/len(micro_f1))
# print('Accuracy:', sum(micro_accuracy)/len(micro_accuracy))

# print('weighted')
# print('Precision:', sum(weighted_precision)/len(weighted_precision))
# print('Recall:', sum(weighted_recall)/len(weighted_recall))
# print('F1:', sum(weighted_f1)/len(weighted_f1))
# print('Accuracy:', sum(weighted_accuracy)/len(weighted_accuracy))

rules
{'condition': [{'safety': 'low'}], 'class': 'unacc', 'support': 0.33762057877813506, 'confidence': 1.0, 'error': 457, 'default_class': 'unacc'}
{'condition': [{'persons': '2'}], 'class': 'unacc', 'support': 0.3279742765273312, 'confidence': 1.0, 'error': 349, 'default_class': 'acc'}
{'condition': [{'maint': 'vhigh'}], 'class': 'unacc', 'support': 0.2077170418006431, 'confidence': 0.8389610389610389, 'error': 303, 'default_class': 'acc'}


# RWCBA


In [4]:
min_sup += 0.1


def perform_cross_validation_rwcba(train_index, test_index, data, importance, average='macro'):
    train = data.iloc[train_index].reset_index(drop=True)
    test = data.iloc[test_index].reset_index(drop=True)
    rwcba = RWCBA(train, importance)
    ruleitemset = rwcba.apriori(min_sup, min_conf)
    sorted_ruleitemset = sorted(ruleitemset, key=lambda x: (
        x['hm'], x['confidence'], x['support'], len(x['condition'])), reverse=True)
    strong_rules, spare_rules, default_class = rwcba.prune(sorted_ruleitemset)
    predict_y = rwcba.predict(test, strong_rules, spare_rules, default_class)
    test_y = test['class']
    precision = precision_score(test_y, predict_y, average=average, zero_division=1)
    recall = recall_score(test_y, predict_y, average=average, zero_division=1)
    f1 = f1_score(test_y, predict_y, average=average, zero_division=1)
    accuracy = accuracy_score(test_y, predict_y)
    return precision, recall, f1, accuracy


dataHandler = DataHandler(dataset_id)

data = dataHandler.loadData()
importance = dataHandler.getFeaturesImportance(data)

for i in importance:
    importance[i] = 1 + importance[i] * len(importance)

X = data.drop('class', axis=1)
y = data['class']

kf = KFold(n_splits=10, shuffle=True, random_state=42)

macro_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_rwcba)(train_index, test_index, data, importance)
    for train_index, test_index in kf.split(X)
)

micro_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_rwcba)(train_index, test_index, data, importance, 'micro')
    for train_index, test_index in kf.split(X)
)

weighted_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_rwcba)(train_index, test_index, data, importance, 'weighted')
    for train_index, test_index in kf.split(X)
)

macro_precision = [result[0] for result in macro_results]
macro_recall = [result[1] for result in macro_results]
macro_f1 = [result[2] for result in macro_results]
macro_accuracy = [result[3] for result in macro_results]

micro_precision = [result[0] for result in micro_results]
micro_recall = [result[1] for result in micro_results]
micro_f1 = [result[2] for result in micro_results]
micro_accuracy = [result[3] for result in micro_results]

weighted_precision = [result[0] for result in weighted_results]
weighted_recall = [result[1] for result in weighted_results]
weighted_f1 = [result[2] for result in weighted_results]
weighted_accuracy = [result[3] for result in weighted_results]

print('macro')
print('Precision:', sum(macro_precision)/len(macro_precision))
print('Recall:', sum(macro_recall)/len(macro_recall))
print('F1:', sum(macro_f1)/len(macro_f1))
print('Accuracy:', sum(macro_accuracy)/len(macro_accuracy))

print('micro')
print('Precision:', sum(micro_precision)/len(micro_precision))
print('Recall:', sum(micro_recall)/len(micro_recall))
print('F1:', sum(micro_f1)/len(micro_f1))
print('Accuracy:', sum(micro_accuracy)/len(micro_accuracy))

print('weighted')
print('Precision:', sum(weighted_precision)/len(weighted_precision))
print('Recall:', sum(weighted_recall)/len(weighted_recall))
print('F1:', sum(weighted_f1)/len(weighted_f1))
print('Accuracy:', sum(weighted_accuracy)/len(weighted_accuracy))

macro
Precision: 0.822442203130435
Recall: 0.526420008386103
F1: 0.5029725744514971
Accuracy: 0.7279313876088069
micro
Precision: 0.727931387608807
Recall: 0.727931387608807
F1: 0.727931387608807
Accuracy: 0.7279313876088069
weighted
Precision: 0.7604436721154666
Recall: 0.727931387608807
F1: 0.698295133393283
Accuracy: 0.7279313876088069


# APR


In [5]:
def perform_cross_validation_apr(train_index, test_index, data, average='macro'):
    train = data.iloc[train_index].reset_index(drop=True)
    test = data.iloc[test_index].reset_index(drop=True)
    apr = APR(train, min_sup, min_conf)
    ruleitemset = apr.apriori()
    sorted_ruleitemset = sorted(ruleitemset, key=lambda x: (
        x['confidence'], x['support'], len(x['condition'])), reverse=True)
    rules, default_class = apr.prune(sorted_ruleitemset)
    predict_y = apr.predict(test, rules, default_class)
    test_y = test['class']
    precision = precision_score(test_y, predict_y, average=average, zero_division=1)
    recall = recall_score(test_y, predict_y, average=average, zero_division=1)
    f1 = f1_score(test_y, predict_y, average=average, zero_division=1)
    accuracy = accuracy_score(test_y, predict_y)
    return precision, recall, f1, accuracy


dataHandler = DataHandler(dataset_id)
data = dataHandler.loadData()

kf = KFold(n_splits=2, shuffle=True, random_state=42)

macro_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_apr)(train_index, test_index, data)
    for train_index, test_index in kf.split(data)
)

micro_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_apr)(train_index, test_index, data, 'micro')
    for train_index, test_index in kf.split(data)
)

weighted_results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_apr)(train_index, test_index, data, 'weighted')
    for train_index, test_index in kf.split(data)
)

macro_precision = [result[0] for result in macro_results]
macro_recall = [result[1] for result in macro_results]
macro_f1 = [result[2] for result in macro_results]
macro_accuracy = [result[3] for result in macro_results]

micro_precision = [result[0] for result in micro_results]
micro_recall = [result[1] for result in micro_results]
micro_f1 = [result[2] for result in micro_results]
micro_accuracy = [result[3] for result in micro_results]

weighted_precision = [result[0] for result in weighted_results]
weighted_recall = [result[1] for result in weighted_results]
weighted_f1 = [result[2] for result in weighted_results]
weighted_accuracy = [result[3] for result in weighted_results]

print('macro')
print('Precision:', sum(macro_precision)/len(macro_precision))
print('Recall:', sum(macro_recall)/len(macro_recall))
print('F1:', sum(macro_f1)/len(macro_f1))
print('Accuracy:', sum(macro_accuracy)/len(macro_accuracy))

print('micro')
print('Precision:', sum(micro_precision)/len(micro_precision))
print('Recall:', sum(micro_recall)/len(micro_recall))
print('F1:', sum(micro_f1)/len(micro_f1))
print('Accuracy:', sum(micro_accuracy)/len(micro_accuracy))

print('weighted')
print('Precision:', sum(weighted_precision)/len(weighted_precision))
print('Recall:', sum(weighted_recall)/len(weighted_recall))
print('F1:', sum(weighted_f1)/len(weighted_f1))
print('Accuracy:', sum(weighted_accuracy)/len(weighted_accuracy))

macro
Precision: 0.8144012315338195
Recall: 0.3333333333333333
F1: 0.204730229120473
Accuracy: 0.4432036946014582
micro
Precision: 0.4432036946014582
Recall: 0.4432036946014582
F1: 0.4432036946014582
Accuracy: 0.4432036946014582
weighted
Precision: 0.753231152369399
Recall: 0.4432036946014582
F1: 0.27221670184149727
Accuracy: 0.4432036946014582
