In [1]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split

import pandas as pd

# 提取 Car Evaluation 數據集
car_evaluation = fetch_ucirepo(id=19)

# 提取特徵和目標
features = car_evaluation.data.features
target = car_evaluation.data.targets

# 轉換為 pandas DataFrame
df = pd.DataFrame(features, columns=car_evaluation.feature_names)
df['target'] = target

X = df.drop('target', axis=1)
y = df['target']

# 分割數據集並重構index
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# 合併
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [4]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# 將特徵轉換為所需格式，排除 'target' 列
features_list = []
for index, row in X_train.iterrows():
    features = []
    for col, value in row.items():
        features.append(f"{col}: {value}")
    features_list.append(features)
# 使用 TransactionEncoder 進行編碼
print(features_list)
te = TransactionEncoder()
te_ary = te.fit(features_list).transform(features_list)
# 將編碼後的數據轉換為 DataFrame
te_df = pd.DataFrame(te_ary, columns=te.columns_)

# 使用 apriori 算法找出頻繁項集
frequent_itemsets = apriori(te_df, min_support=0.01, use_colnames=True)

[['buying: vhigh', 'maint: vhigh', 'doors: 5more', 'persons: more', 'lug_boot: big', 'safety: high'], ['buying: med', 'maint: vhigh', 'doors: 3', 'persons: 4', 'lug_boot: small', 'safety: med'], ['buying: low', 'maint: low', 'doors: 5more', 'persons: 2', 'lug_boot: big', 'safety: high'], ['buying: high', 'maint: med', 'doors: 4', 'persons: 2', 'lug_boot: med', 'safety: med'], ['buying: high', 'maint: med', 'doors: 3', 'persons: 2', 'lug_boot: med', 'safety: low'], ['buying: high', 'maint: med', 'doors: 4', 'persons: more', 'lug_boot: small', 'safety: high'], ['buying: vhigh', 'maint: low', 'doors: 5more', 'persons: more', 'lug_boot: big', 'safety: low'], ['buying: high', 'maint: med', 'doors: 3', 'persons: 2', 'lug_boot: med', 'safety: med'], ['buying: med', 'maint: vhigh', 'doors: 3', 'persons: more', 'lug_boot: big', 'safety: high'], ['buying: med', 'maint: low', 'doors: 3', 'persons: 4', 'lug_boot: big', 'safety: low'], ['buying: vhigh', 'maint: high', 'doors: 2', 'persons: more', '

In [5]:
def formatFrequentItemsets(frequent_itemsets):
    formatted_itemsets = []
    for index, row in frequent_itemsets.iterrows():
        itemset = []
        for i, item in enumerate(row['itemsets']):
            key = item.split(': ')[0]
            value = item.split(': ')[1]
            itemset.append({key: value})
        formatted_itemsets.append(itemset)
    return formatted_itemsets


format_frequent_itemsets = formatFrequentItemsets(frequent_itemsets)
print(format_frequent_itemsets)

[[{'buying': 'high'}], [{'buying': 'low'}], [{'buying': 'med'}], [{'buying': 'vhigh'}], [{'doors': '2'}], [{'doors': '3'}], [{'doors': '4'}], [{'doors': '5more'}], [{'lug_boot': 'big'}], [{'lug_boot': 'med'}], [{'lug_boot': 'small'}], [{'maint': 'high'}], [{'maint': 'low'}], [{'maint': 'med'}], [{'maint': 'vhigh'}], [{'persons': '2'}], [{'persons': '4'}], [{'persons': 'more'}], [{'safety': 'high'}], [{'safety': 'low'}], [{'safety': 'med'}], [{'buying': 'high'}, {'doors': '2'}], [{'buying': 'high'}, {'doors': '3'}], [{'buying': 'high'}, {'doors': '4'}], [{'doors': '5more'}, {'buying': 'high'}], [{'lug_boot': 'big'}, {'buying': 'high'}], [{'buying': 'high'}, {'lug_boot': 'med'}], [{'buying': 'high'}, {'lug_boot': 'small'}], [{'buying': 'high'}, {'maint': 'high'}], [{'buying': 'high'}, {'maint': 'low'}], [{'buying': 'high'}, {'maint': 'med'}], [{'maint': 'vhigh'}, {'buying': 'high'}], [{'persons': '2'}, {'buying': 'high'}], [{'buying': 'high'}, {'persons': '4'}], [{'buying': 'high'}, {'pe

In [30]:
# 1. 特徵的頻繁項集與原始資料集做配對
# 2. 將包含該項集的樣本取其類別與項集做配對
# 3. 形成特徵->類別的形式的規則 {feature: (itemset), target: (class), support: (support), confidence: (confidence)}


# 定義函數來生成規則
# target_with_count = df['target'].value_counts()


def generate_rule(dataset, frequent_itemset):
    targets = dataset['target'].unique()
    rule = None
    max_harmonic_mean = 0
    for target in targets:
        candidate_rule = {'features': frequent_itemset, 'target': target}
        support, confidence, lift = computeSupportAndConfidenceAndLift(
            dataset, candidate_rule)
        if (confidence >= 0.5 and lift >= 1):
            # harmonic_mean = 2 / (1/confidence + 1/lift)
            harmonic_mean = confidence
            if harmonic_mean > max_harmonic_mean:
                max_harmonic_mean = harmonic_mean
                rule = candidate_rule
                rule['support'] = support
                rule['confidence'] = confidence
                rule['lift'] = lift
                rule['hm'] = harmonic_mean
    return rule


def check_itemset_in_instance(itemset, instance):
    for item in itemset:
        item_keys = item.keys()
        for item_key in item_keys:
            if item_key not in instance:
                return False
            if item_key in instance and instance[item_key] != item[item_key]:
                return False
    return True


def computeSupportAndConfidenceAndLift(df, rule):
    features = rule['features']
    target = rule['target']

    condition = df['target'] == target
    condition_without_target = None
    for item in features:
        key = list(item.keys())[0]
        value = list(item.values())[0]
        condition = condition & (df[key] == value)
        condition_without_target = condition_without_target & (
            df[key] == value) if condition_without_target is not None else (df[key] == value)
    if df[condition_without_target].shape[0] == 0:
        return 0, 0, 0
    support = df[condition].shape[0] / df.shape[0]
    confidence = df[condition].shape[0] / df[condition_without_target].shape[0]
    lift = confidence / (df[df['target'] == target].shape[0] / df.shape[0])
    return support, confidence, lift


rules = []

for i in range(len(format_frequent_itemsets)):
    # print(f"Processing {i+1}/{len(format_frequent_itemsets)}")
    frequent_itemset = format_frequent_itemsets[i]
    new_rule = generate_rule(train, frequent_itemset)
    if new_rule:
        rules.append(new_rule)

rules = sorted(rules, key=lambda x: x['hm'], reverse=True)

In [31]:
# databset cover

def deataset_cover(rules, df):
    now_df = df
    strong_rules = []
    weak_rules = []
    for rule in rules:
        if now_df.shape[0] == 0:
            weak_rules.append(rule)
            continue
        cover_df = cover(rule, now_df)
        if cover_df is not None:
            now_df.drop(cover_df.index, inplace=True)
            strong_rules.append(rule)
        else:
            weak_rules.append(rule)

    default_class = None

    if now_df.shape[0] != 0:
        default_class = now_df['target'].value_counts().idxmax()
    else:
        # strong_rules裡面最多的target
        if len(strong_rules) != 0:
            default_class = max(strong_rules, key=lambda x: x['target'])
        else:
            default_class = max(weak_rules, key=lambda x: x['target'])

    if default_class is None:
        default_class = df['target'].value_counts().idxmax()

    return strong_rules, weak_rules, default_class


def cover(rule, df):
    features = rule['features']
    target = rule['target']
    condition = df['target'] == target
    for item in features:
        key = list(item.keys())[0]
        value = list(item.values())[0]
        condition = condition & (df[key] == value)
    if df[condition].shape[0] == 0:
        return None
    return df[condition]

In [32]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
train_ = train.copy()
strong_rules, weak_rules, default_class = deataset_cover(rules, train_)

# print("strong_rules:")
# for rule in strong_rules:
#     print(rule)

# print("weak_rules:")
# for rule in weak_rules:
#     print(rule)

# print("default_class:", default_class)


def verify(test_data, strong_rules, weak_rules, default_class):
    print("test_data.shape[0]:", test_data.shape[0])
    results = []
    for i in range(test_data.shape[0]):
        check = False
        instance = test_data.loc[i, :]
        for rule in strong_rules:
            if check_itemset_in_instance(rule['features'], instance):
                results.append(rule['target'])
                check = True
                break

        if not check:
            for rule in weak_rules:
                if check_itemset_in_instance(rule['features'], instance):
                    results.append(rule['target'])
                    check = True
                    break

        if not check:
            results.append(default_class)

    return results


results = verify(test, strong_rules, weak_rules, default_class)


classification_report_ = classification_report(y_test, results)

print(classification_report_)

test_data.shape[0]: 346
              precision    recall  f1-score   support

         acc       0.72      0.77      0.74        83
        good       0.00      0.00      0.00        11
       unacc       0.92      1.00      0.96       235
       vgood       1.00      0.12      0.21        17

    accuracy                           0.87       346
   macro avg       0.66      0.47      0.48       346
weighted avg       0.85      0.87      0.84       346



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
print((1+0.78+0.32)/3)

0.7000000000000001
