In [13]:
from DataProcessing import DataProcessing
from Car import Car
from Cba import Cba
from Validation import Validation
import time
MIN_SUPPORT = 0.01
MIN_CONFIDENCE = 0.7
MIN_LIFT = 1.2

In [2]:
def compute_runtime(start_time, end_time):
    return end_time - start_time

## 原版 CBA


In [14]:
data_procesing = DataProcessing("car.data")
data_procesing.read_data()
data_procesing.process()
data = data_procesing.encoded_data
train_data, test_data = data_procesing.split_train_test_data()
start_time = time.time()
car = Car(train_data, MIN_SUPPORT, MIN_CONFIDENCE)
car.generate_frequent()
car.sort_rule(1)
cba = Cba(train_data, car.rule)
cba.cover()
rules = cba.final_rules_
default = cba.default
end_time = time.time()
print("Runtime: ", compute_runtime(start_time, end_time))
test = Validation(test_data, rules, default, 'single')

Runtime:  5.449311017990112
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        63
           1       0.00      0.00      0.00        46
           2       0.77      1.00      0.87       198
           3       0.42      0.95      0.58        39

    accuracy                           0.68       346
   macro avg       0.30      0.49      0.36       346
weighted avg       0.49      0.68      0.56       346



In [6]:
for r in rules:
    r.print_ruleitem()
print("Default: ", default)

3=0 => class=2 (support=0.34370477568740954, confidence=1.0, lift=1.4059003051881993)
5=1 => class=2 (support=0.3379160636758321, confidence=1.0, lift=1.4059003051881993)
0=3 1=3 => class=2 (support=0.0658465991316932, confidence=1.0, lift=1.4059003051881993)
0=0 1=3 => class=2 (support=0.06439942112879884, confidence=1.0, lift=1.4059003051881993)
0=3 1=0 => class=2 (support=0.06150506512301013, confidence=1.0, lift=1.4059003051881993)
0=0 4=2 5=2 => class=2 (support=0.028219971056439943, confidence=1.0, lift=1.4059003051881993)
0=3 4=2 5=2 => class=2 (support=0.027496382054992764, confidence=1.0, lift=1.4059003051881993)
1=3 4=2 5=2 => class=2 (support=0.02604920405209841, confidence=1.0, lift=1.4059003051881993)
1=3 2=3 4=2 => class=2 (support=0.019536903039073805, confidence=0.9642857142857142, lift=1.3556895800029063)
0=3 2=0 4=2 => class=2 (support=0.020984081041968163, confidence=0.935483870967742, lift=1.3151970596921865)
0=3 2=2 4=1 => class=2 (support=0.019536903039073805, con

## WCBA 隨機森林


In [None]:
data_procesing.get_feature_importances()
feature_importances = data_procesing.feature_importances
start_time = time.time()
car = Car(data=train_data, min_support=MIN_SUPPORT,
          min_confidence=MIN_CONFIDENCE, weights=feature_importances)
car.generate_frequent()
car.sort_rule(3, True)

cba = Cba(train_data, car.rule)
cba.cover()
rules = cba.final_rules_
default = cba.default
end_time = time.time()
print("Runtime: ", compute_runtime(start_time, end_time))
test = Validation(test_data, rules, default, 'group')

Runtime:  1.6326134204864502
Macro F1 Score: 0.3654265873015873
Accuracy: 0.684971098265896


## APR 演算法


In [None]:
start_time = time.time()
car = Car(train_data, MIN_SUPPORT, MIN_CONFIDENCE)
car.generate_frequent()
car.sort_rule(1)
cba = Cba(train_data, car.rule)
cba.apr_cover()
rules = cba.strong_rules
spare_rules = cba.spare_rules
default = cba.default
end_time = time.time()
print("Runtime: ", compute_runtime(start_time, end_time))
test = Validation(test_data, rules, default, 'group', spare_rules)

Runtime:  0.32816243171691895
Macro F1 Score: 0.18198529411764705
Accuracy: 0.5722543352601156


## CBA 加上 LIFT(排序使用 confidence+support+lift 做 sort)


In [None]:
MIN_LIFT = 1
data_procesing = DataProcessing("car.data")
data_procesing.read_data()
data_procesing.process()
data = data_procesing.encoded_data
train_data, test_data = data_procesing.split_train_test_data()
start_time = time.time()


car = Car(train_data, MIN_SUPPORT, MIN_CONFIDENCE, MIN_LIFT)


car.generate_frequent()


car.sort_rule(2)


cba = Cba(train_data, car.rule)


cba.cover()


rules = cba.final_rules_


default = cba.default
end_time = time.time()


print("Runtime: ", compute_runtime(start_time, end_time))


test = Validation(test_data, rules, default, 'single')

Runtime:  1.430823564529419
Macro F1 Score: 0.3583944113605131
Accuracy: 0.6676300578034682


## CBA 加上隨機森林並拿掉重要性低的屬性


In [None]:
data_procesing = DataProcessing("car.data")
data_procesing.read_data()
data_procesing.process()
data_procesing.delete_low_importance()
data = data_procesing.encoded_data
train_data, test_data = data_procesing.split_train_test_data()
start_time = time.time()
car = Car(train_data, MIN_SUPPORT, MIN_CONFIDENCE)
car.generate_frequent()
car.sort_rule(1)
cba = Cba(train_data, car.rule)
cba.cover()
rules = cba.final_rules_
default = cba.default
end_time = time.time()
print("Runtime: ", compute_runtime(start_time, end_time))
test = Validation(test_data, rules, default, 'single')

Runtime:  1.205981731414795
Macro F1 Score: 0.3758941344778255
Accuracy: 0.6763005780346821


## CBA 加上隨機森林並拿掉重要性低的屬性加上 lift


In [None]:
data_procesing = DataProcessing("car.data")
data_procesing.read_data()
data_procesing.process()
data_procesing.delete_low_importance()
data = data_procesing.encoded_data
train_data, test_data = data_procesing.split_train_test_data()
start_time = time.time()
car = Car(train_data, MIN_SUPPORT, MIN_CONFIDENCE, MIN_LIFT)
car.generate_frequent()
car.sort_rule(2)
cba = Cba(train_data, car.rule)
cba.cover()
rules = cba.final_rules_
default = cba.default
end_time = time.time()
print("Runtime: ", compute_runtime(start_time, end_time))
test = Validation(test_data, rules, default, 'single')

Runtime:  0.6818227767944336
Macro F1 Score: 0.3886809772630984
Accuracy: 0.7312138728323699


## CBA 加上 APR 的剪枝方法


In [None]:
# data_procesing = DataProcessing("car.data")
# data_procesing.read_data()
# data_procesing.process()
# data = data_procesing.encoded_data
# train_data, test_data = data_procesing.split_train_test_data()
start_time = time.time()
car = Car(train_data, MIN_SUPPORT, MIN_CONFIDENCE)
car.generate_frequent()
car.sort_rule(1)
cba = Cba(train_data, car.rule)
cba.apr_cover()
rules = cba.strong_rules
default = cba.default
end_time = time.time()
print("Runtime: ", compute_runtime(start_time, end_time))
test = Validation(test_data, rules, default, 'single')

Runtime:  0.3262200355529785
Macro F1 Score: 0.18198529411764705
Accuracy: 0.5722543352601156


## CBA 加上隨機森林並拿掉重要性低的屬性加上 lift 加上 APR 剪枝方法


In [None]:
data_procesing = DataProcessing("car.data")
data_procesing.read_data()
data_procesing.process()
data_procesing.delete_low_importance()
data = data_procesing.encoded_data
train_data, test_data = data_procesing.split_train_test_data()
start_time = time.time()
car = Car(train_data, MIN_SUPPORT, MIN_CONFIDENCE, MIN_LIFT)
car.generate_frequent()
car.sort_rule(2)
cba = Cba(train_data, car.rule)
cba.apr_cover()
rules = cba.strong_rules
default = cba.default
end_time = time.time()
print("Runtime: ", compute_runtime(start_time, end_time))
test = Validation(test_data, rules, default, 'single')

Runtime:  0.10307860374450684
Macro F1 Score: 0.18198529411764705
Accuracy: 0.5722543352601156


In [None]:
data_procesing = DataProcessing("car.data")
data_procesing.read_data()
data_procesing.process()
train_data, test_data = data_procesing.split_train_test_data()

train_data_without_class = train_data.drop(columns=['class'])

# do apriori


class Apriori:
    def __init__(self, data, min_support):
        self.data = data
        self.min_support = min_support
        self.support_count = round(data.shape[0] * min_support)
        self.frequent_one = []
        self.frequent_itemsets = []
        self.k = 1

    def getFrequentOne(self, data, min_support):
        frequent_itemsets = []
        for column in data.columns:
            itemset = data[column].value_counts()
            for index, value in itemset.items():
                if value >= self.support_count:
                    frequent_itemsets.append({column: index})
        self.frequent_one = frequent_itemsets
        self.frequent_itemsets = frequent_itemsets

    def combineItemsets(self, frequent_itemset1, frequent_itemset2):
        pass

    def getFrequentItemsets(self):
        self.getFrequentOne(self.data, self.min_support)  # 取得frequent one
        return self.frequent_itemsets


apriori = Apriori(train_data_without_class, MIN_SUPPORT)

frequent_itemsets = apriori.getFrequentItemsets()

print(frequent_itemsets)

for i in range(0, len(frequent_itemsets)):
    for j in range(i + 1, len(frequent_itemsets)):
        key1 = sorted(list(frequent_itemsets[i].keys()))
        key2 = sorted(list(frequent_itemsets[j].keys()))
        if (key1[0] != key2[0]):  # 屬性不同
            new_itemset = frequent_itemsets[i].copy()
            new_itemset.update(frequent_itemsets[j])  # 合併
            # 確認support
            support = train_data_without_class[list(new_itemset.keys())].apply(
                lambda x: all(x == list(new_itemset.values())), axis=1).sum()
            if support >= apriori.support_count and new_itemset not in apriori.frequent_itemsets:
                apriori.frequent_itemsets.append(new_itemset)

[{0: 0}, {0: 2}, {0: 1}, {0: 3}, {1: 1}, {1: 3}, {1: 2}, {1: 0}, {2: 3}, {2: 1}, {2: 2}, {2: 0}, {3: 2}, {3: 0}, {3: 1}, {4: 2}, {4: 0}, {4: 1}, {5: 2}, {5: 1}, {5: 0}]


In [None]:
len(apriori.frequent_itemsets)

48