In [50]:
from joblib import Parallel, delayed
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import KFold

from CBA import CBA
from CBAWL import CBAWL
from DataHandler import DataHandler
from RWCBA import RWCBA
from APR import APR

# Experiment settings shared by every section of this notebook.
# Identifier handed to the CBAWL and DataHandler constructors to select the dataset.
dataset_id = 105
# Minimum support threshold passed to CBAWL, CBA, APR and RWCBA.apriori.
min_sup = 0.2
# Minimum confidence threshold passed to CBAWL, CBA, APR and RWCBA.apriori.
min_conf = 0.7
# Minimum lift threshold; only the CBAWL constructor receives it.
min_lift = 1

# CBAWL

In [51]:
def perform_cross_validation(train_index, test_index, X, y, cbawl):
    """Fit CBAWL on one training fold and score it on the matching test fold.

    Args:
        train_index: Positional row indices of the training fold.
        test_index: Positional row indices of the test fold.
        X: pandas object the classifier is trained on and predicts from
            (rows are selected with ``.iloc``).
        y: True labels, positionally aligned with ``X``.
        cbawl: Classifier object; it must expose ``dataHandler.oneHotEncoding``,
            ``ruleGenerator.getFrequentItemsets``, ``model`` and ``predict``.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``. Precision, recall and F1
        are macro-averaged.
    """
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # Only the test labels are sliced: the model below is fitted from X_train
    # alone, so a separate training-label slice would never be read.
    y_test = y.iloc[test_index]

    X_train_te = cbawl.dataHandler.oneHotEncoding(X_train)
    frequent_itemsets = cbawl.ruleGenerator.getFrequentItemsets(X_train_te)
    strong_rules, weak_rules, default_class = cbawl.model(frequent_itemsets, X_train)
    y_pred = cbawl.predict(strong_rules, weak_rules, default_class, X_test)

    # NOTE(review): zero_division=1 makes a per-class score whose denominator is
    # zero count as 1, which can raise the macro average - confirm this is intended.
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=1)
    accuracy = accuracy_score(y_test, y_pred)

    return precision, recall, f1, accuracy


# Build the classifier and the feature table that the folds are drawn from.
cbawl = CBAWL(dataset_id, min_sup, min_conf, min_lift)
data = cbawl.dataHandler.loadData()
# NOTE(review): this one-hot result is overwritten two statements below and is
# never read in between - confirm the call is still needed.
features = cbawl.dataHandler.oneHotEncoding(data)
features_importance = cbawl.dataHandler.getFeaturesImportance(data)
# NOTE(review): importance is computed on the full dataset before the folds are
# split, so rows that later serve as test data take part in it - confirm.
features = cbawl.dataHandler.delLowImportanceFeatures(data, features_importance)
X = features
# The label is taken from the last column of the loaded frame.
y = data.iloc[:, -1]

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Evaluate each fold in a separate joblib job.
results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation)(fold_train, fold_test, X, y, cbawl)
    for fold_train, fold_test in kf.split(X)
)

# Split the per-fold tuples into one list per metric, then report the means.
precision, recall, f1, accuracy = map(list, zip(*results))

print('Precision:', sum(precision) / len(precision))
print('Recall:', sum(recall) / len(recall))
print('F1:', sum(f1) / len(f1))
print('Accuracy:', sum(accuracy) / len(accuracy))

Precision: 0.871785326106567
Recall: 0.8690615790658892
F1: 0.8653339275064088
Accuracy: 0.8736257928118393


# CBA

In [52]:
def perform_cross_validation_cba(train_index, test_index, X, y):
    """Train CBA on one training fold and score it on the matching test fold.

    The fold frames are assembled from the ``X`` and ``y`` arguments, so the
    result does not depend on whatever ``data`` frame exists at module level.
    The thresholds ``min_sup`` and ``min_conf`` are read from module scope.

    Args:
        train_index: Positional row indices of the training fold.
        test_index: Positional row indices of the test fold.
        X: DataFrame of predictor columns.
        y: Labels, aligned row-for-row with ``X``.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``. Precision, recall and F1
        are macro-averaged.
    """
    # Re-attach the labels as a trailing 'class' column; the true labels are
    # read back from that column name further down.
    train = (X.iloc[train_index]
             .assign(**{'class': y.iloc[train_index]})
             .reset_index(drop=True))
    test = (X.iloc[test_index]
            .assign(**{'class': y.iloc[test_index]})
            .reset_index(drop=True))

    cba = CBA(train, min_sup, min_conf)
    ruleitemset = cba.apriori()
    # Rank rules from highest to lowest by confidence, then support, then
    # number of items in the condition.
    sorted_ruleitemset = sorted(ruleitemset, key=lambda x: (
        x['confidence'], x['support'], len(x['condition'])), reverse=True)
    rules, default_class = cba.prune(sorted_ruleitemset)
    predict_y = cba.predict(test, rules, default_class)
    test_y = test['class']

    # NOTE(review): zero_division=1 makes a per-class score whose denominator is
    # zero count as 1, which can raise the macro average - confirm this is intended.
    precision = precision_score(test_y, predict_y, average='macro', zero_division=1)
    recall = recall_score(test_y, predict_y, average='macro', zero_division=1)
    f1 = f1_score(test_y, predict_y, average='macro', zero_division=1)
    accuracy = accuracy_score(test_y, predict_y)
    return precision, recall, f1, accuracy


# Load the dataset and separate the 'class' label column from the predictors.
dataHandler = DataHandler(dataset_id)
data = dataHandler.loadData()

y = data['class']
X = data.drop('class', axis=1)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Evaluate each fold in a separate joblib job.
results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_cba)(fold_train, fold_test, X, y)
    for fold_train, fold_test in kf.split(X)
)

# Split the per-fold tuples into one list per metric, then report the means.
precision, recall, f1, accuracy = map(list, zip(*results))

print('Precision:', sum(precision) / len(precision))
print('Recall:', sum(recall) / len(recall))
print('F1:', sum(f1) / len(f1))
print('Accuracy:', sum(accuracy) / len(accuracy))

# RWCBA

In [48]:

def perform_cross_validation_rwcba(train_index, test_index, data, importance):
    """Train RWCBA on one training fold and score it on the matching test fold.

    The thresholds ``min_sup`` and ``min_conf`` are read from module scope.

    Args:
        train_index: Positional row indices of the training fold.
        test_index: Positional row indices of the test fold.
        data: DataFrame holding the predictors and a ``class`` label column.
        importance: Feature-importance object forwarded unchanged to ``RWCBA``.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``. Precision, recall and F1
        are macro-averaged.
    """
    fold_train = data.iloc[train_index].reset_index(drop=True)
    fold_test = data.iloc[test_index].reset_index(drop=True)

    classifier = RWCBA(fold_train, importance)

    def rank(rule):
        # Ordering key: 'hm' first, then confidence, support and condition size.
        return rule['hm'], rule['confidence'], rule['support'], len(rule['condition'])

    ranked_rules = list(classifier.apriori(min_sup, min_conf))
    ranked_rules.sort(key=rank, reverse=True)
    strong_rules, spare_rules, default_class = classifier.prune(ranked_rules)

    y_pred = classifier.predict(fold_test, strong_rules, spare_rules, default_class)
    y_true = fold_test['class']

    # NOTE(review): zero_division=1 makes a per-class score whose denominator is
    # zero count as 1, which can raise the macro average - confirm this is intended.
    macro = {'average': 'macro', 'zero_division': 1}
    return (
        precision_score(y_true, y_pred, **macro),
        recall_score(y_true, y_pred, **macro),
        f1_score(y_true, y_pred, **macro),
        accuracy_score(y_true, y_pred),
    )


# Load the dataset and the per-feature importance scores used by RWCBA.
dataHandler = DataHandler(dataset_id)
data = dataHandler.loadData()
# NOTE(review): importance is computed on the full dataset before the folds are
# split, so rows that later serve as test data take part in it - confirm.
importance = dataHandler.getFeaturesImportance(data)

# Rescale every score in place to 1 + score * (number of scored entries).
for i in importance:
    importance[i] = importance[i] * len(importance) + 1

y = data['class']
X = data.drop('class', axis=1)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Evaluate each fold in a separate joblib job.
results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_rwcba)(fold_train, fold_test, data, importance)
    for fold_train, fold_test in kf.split(X)
)

# Split the per-fold tuples into one list per metric, then report the means.
precision, recall, f1, accuracy = map(list, zip(*results))

print('Precision:', sum(precision) / len(precision))
print('Recall:', sum(recall) / len(recall))
print('F1:', sum(f1) / len(f1))
print('Accuracy:', sum(accuracy) / len(accuracy))

Precision: 0.8214870826055037
Recall: 0.5252972561880821
F1: 0.5019110122657211
Accuracy: 0.7263184843830005


# APR

In [49]:
def perform_cross_validation_apr(train_index, test_index, data):
    """Train APR on one training fold and score it on the matching test fold.

    The thresholds ``min_sup`` and ``min_conf`` are read from module scope.

    Args:
        train_index: Positional row indices of the training fold.
        test_index: Positional row indices of the test fold.
        data: DataFrame holding the predictors and a ``class`` label column.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``. Precision, recall and F1
        are macro-averaged.
    """
    fold_train = data.iloc[train_index].reset_index(drop=True)
    fold_test = data.iloc[test_index].reset_index(drop=True)

    classifier = APR(fold_train, min_sup, min_conf)

    def rank(rule):
        # Ordering key: confidence first, then support, then condition size.
        return rule['confidence'], rule['support'], len(rule['condition'])

    ranked_rules = list(classifier.apriori())
    ranked_rules.sort(key=rank, reverse=True)
    kept_rules, fallback_class = classifier.prune(ranked_rules)

    y_pred = classifier.predict(fold_test, kept_rules, fallback_class)
    y_true = fold_test['class']

    # NOTE(review): zero_division=1 makes a per-class score whose denominator is
    # zero count as 1, which can raise the macro average - confirm this is intended.
    macro = {'average': 'macro', 'zero_division': 1}
    return (
        precision_score(y_true, y_pred, **macro),
        recall_score(y_true, y_pred, **macro),
        f1_score(y_true, y_pred, **macro),
        accuracy_score(y_true, y_pred),
    )


# Load the dataset that the APR folds are drawn from.
dataHandler = DataHandler(dataset_id)
data = dataHandler.loadData()

# NOTE(review): only 2 folds are used here, whereas the CBAWL, CBA and RWCBA
# sections use 10 - confirm this is intentional before comparing the scores.
kf = KFold(n_splits=2, shuffle=True, random_state=42)

# Evaluate each fold in a separate joblib job.
results = Parallel(n_jobs=-1)(
    delayed(perform_cross_validation_apr)(fold_train, fold_test, data)
    for fold_train, fold_test in kf.split(data)
)

# Split the per-fold tuples into one list per metric, then report the means.
precision, recall, f1, accuracy = map(list, zip(*results))

print('Precision:', sum(precision) / len(precision))
print('Recall:', sum(recall) / len(recall))
print('F1:', sum(f1) / len(f1))
print('Accuracy:', sum(accuracy) / len(accuracy))

Precision: 0.8085552740643726
Recall: 0.40654676258992806
F1: 0.3338098525449767
Accuracy: 0.5681473744572786
