In [1]:
import sklearn
import numpy as np

In [2]:
dataset_filename = 'dataset/affinity_dataset.txt'
X = np.loadtxt(dataset_filename)
n_samples, n_features = X.shape
print("This dataset has {0} samples and {1} features".format(n_samples, n_features))

This dataset has 100 samples and 5 features


In [3]:
print(X[:5])

[[0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0.]
 [0. 0. 1. 0. 1.]
 [1. 1. 0. 0. 0.]
 [0. 0. 1. 1. 1.]]


In [4]:
# First, how many rows contains our permise: that a person is buying apples
num_apple_purchases = 0
for sample in X:
    if sample[3] == 1:
        num_apple_purchases += 1
print("{0} people bought Apples".format(num_apple_purchases))

43 people bought Apples


In [5]:
from collections import defaultdict

valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurances = defaultdict(int)

In [7]:
for sample in X:
    for premise in range(4):
        if sample[premise] == 0:
            continue
        num_occurances[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:
                continue
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1

In [8]:
support = valid_rules
features = ["bread", "milk", "cheese", "apples", "bananas"]

In [9]:
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    rule = (premise, conclusion)
    confidence[rule] = valid_rules[rule] / num_occurances[premise]

In [10]:
def print_rule(premise, conclusion, support, confidence, features):
    # We get the names of the features for the premise and conclusion
    # and print out the rule in a readable format
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(
        premise_name,
        conclusion_name
    ))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))

In [11]:
premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

Rule: If a person buys milk they will also buy apples
 - Support: 18
 - Confidence: 0.346


## Ranking to find the best rules

找到最好的规则，即最佳的购物组合。需要对支持度字典 `support` 做排序。但字典本身是不支持排序的，因此需要使用 `itemgetter` 来辅助进行

In [12]:
from operator import itemgetter

sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)

In [13]:
# 然后利用 print_rule 函数打印每个组合的支持度与置信度
for index in range(5):
    print("Rule #{0}".format(index + 1))
    premise, conclusion = sorted_support[index][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys apples they will also buy bananas
 - Support: 27
 - Confidence: 0.628
Rule #2
Rule: If a person buys milk they will also buy bananas
 - Support: 27
 - Confidence: 0.519
Rule #3
Rule: If a person buys cheese they will also buy apples
 - Support: 22
 - Confidence: 0.564
Rule #4
Rule: If a person buys apples they will also buy cheese
 - Support: 22
 - Confidence: 0.512
Rule #5
Rule: If a person buys cheese they will also buy bananas
 - Support: 20
 - Confidence: 0.513


类似地，也可以利用置信度排名，打印出最佳组合

In [14]:
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)

for index in range(5):
    print("Rule #{0}".format(index+1))
    premise, conclusion = sorted_confidence[index][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys apples they will also buy bananas
 - Support: 27
 - Confidence: 0.628
Rule #2
Rule: If a person buys bread they will also buy bananas
 - Support: 16
 - Confidence: 0.571
Rule #3
Rule: If a person buys cheese they will also buy apples
 - Support: 22
 - Confidence: 0.564
Rule #4
Rule: If a person buys milk they will also buy bananas
 - Support: 27
 - Confidence: 0.519
Rule #5
Rule: If a person buys cheese they will also buy bananas
 - Support: 20
 - Confidence: 0.513
