In [1]:
from collections import defaultdict, Counter
from itertools import combinations
from operator import itemgetter
import pandas as pd

# Confidence threshold
THRESHOLD = 0.5

# Only consider rules for items appearing at least `MIN_COUNT` times.
MIN_COUNT = 5

In [2]:
class AssociationRules:
    def __init__(self, list_of_sets, threshold, min_count):
        """
        
        """
        assert isinstance(list_of_sets, list), "list_of_sets must be a list of sets"
        assert isinstance(list_of_sets[0], set), "list_of_sets must be a list of sets"
        assert isinstance(threshold, float) and threshold > 0  and threshold < 1, "threshold must be between 0 and 1"
        assert isinstance(min_count, int), "min_count must be an int"
        
        self.list_of_sets = list_of_sets
        self.threshold = threshold
        self.min_count = min_count
        
        self.pair_counts = defaultdict(int)
        self.item_counts = defaultdict(int)
        self.single_item_counter = defaultdict(int)
        
        self.rules = dict()
        self.count_single_item_appearances()
        self.find_assoc_rules()
        
        self.pairwise_confidence = { pair : self.rules[pair] for pair in self.rules.keys() \
                             if self.item_counts[pair[0]] >= self.min_count }
        
        self.pairwise_support = { (a,b) : self.pair_counts[ (a,b) ] / len(self.list_of_sets) \
                                 for (a,b) in self.pair_counts \
                                if self.pair_counts[(a,b)] >= self.min_count}
        
        self.single_item_support = { item : self.single_item_counter[item] / len(self.list_of_sets) \
                                 for item in self.single_item_counter \
                                if self.single_item_counter[item] >= self.min_count}
        
        self.pairwise_lift = { pair : self.rules[pair] / self.single_item_support[pair[1]] \
                              for pair in self.rules.keys() if self.item_counts[pair[0]] >= self.min_count }
        
        self.print_dict = { 'Confidence': self.pairwise_confidence,
                              'Support': self.pairwise_support,
                              'Lift': self.pairwise_lift }
        
        
    def count_single_item_appearances(self):
        """
        Updates a dictionary of counts for
        each individual item in a given list of itemsets.
        """
        for tup in self.list_of_sets:
            self.single_item_counter[tuple(tup)[0]] += 1
            self.single_item_counter[tuple(tup)[1]] += 1
        
    def update_pair_counts(self, itemset):
        """
        Updates a dictionary of pair counts for
        all pairs of items in a given itemset.
        """
        for a,b in combinations(itemset,2):
            self.pair_counts[(a,b)] += 1
            self.pair_counts[(b,a)] += 1
            
    def update_item_counts(self, itemset):
        """
        Updates a dictionary of item counts for
        all pairs of items in a given itemset.
        """
        for item in itemset:
            self.item_counts[item] += 1
            
    def filter_rules_by_conf(self):
        """
        Filters out pairs whose confidence is
        below the user defined threshold.
        """
        for (a,b) in self.pair_counts:
            confidence = self.pair_counts[(a,b)] / self.item_counts[a]
            if confidence >= self.threshold:
                self.rules[(a,b)] = confidence

    def find_assoc_rules(self):
        """
        Set final rules dictionary using
        pairs that appear together with
        confidence greater than or equal to
        the user defined threshold.
        """
        for itemset in self.list_of_sets:
            self.update_pair_counts(itemset)
            self.update_item_counts(itemset)
        rules = self.filter_rules_by_conf()
        return rules
    
    @staticmethod
    def gen_rule_str(a, b, val=None, val_fmt='{:.3f}', sep=" = ", prefix='conf', directed_or_undirected="=>"):
        text = "{} {} {}".format(a, directed_or_undirected, b)
        if val:
            text = "{}(".format(prefix) + text + ")"
            text += sep + val_fmt.format(val)
        return text

    def print_top_n_metric_rules(self, metric="confidence", top_n=10):
        """
        Pretty print pairwise ( n_i , n_j ) by metric
        """
        directed_or_undirected = { 'Confidence': '=>', 'Support': '<=>', 'Lift': '<=>' }
        print('\n==== Top {} Associations by {} Metric ===\n\n'.format(top_n, metric))
        ordered_rules = sorted(self.print_dict[metric].items(), key=itemgetter(1), reverse=True)
        dict_already_printed_a_b_not_gonna_print_b_a = dict()
        count_until_we_hit_top_n = 0
        for (a, b), conf_ab in ordered_rules:
            if count_until_we_hit_top_n < top_n:
                if dict_already_printed_a_b_not_gonna_print_b_a.get((a, b), True):
                    print(self.gen_rule_str(a, b, conf_ab, prefix=metric, directed_or_undirected=directed_or_undirected[metric]))
                    dict_already_printed_a_b_not_gonna_print_b_a[(b,a)] = None
                    count_until_we_hit_top_n+=1
            else:
                break

In [3]:
def main():
    df = pd.read_csv('BreadBasket_DMS.csv')
    checkout_list = defaultdict(list)
    trans = dict()
    for row in df.groupby(by='Transaction').filter(lambda x: len(set(x['Item'])) > 1)[['Transaction','Item']].itertuples():
        checkout_list[row.Transaction].append(row.Item)
        
    grocery_itemset = [set(lst) for lst in checkout_list.values()]
    arules = AssociationRules(grocery_itemset, THRESHOLD, MIN_COUNT)
    arules.print_top_n_metric_rules(metric="Support", top_n=10)
    arules.print_top_n_metric_rules(metric="Confidence", top_n=10)
    arules.print_top_n_metric_rules(metric="Lift", top_n=10)

In [4]:
if __name__ == "__main__":
    main()


==== Top 10 Associations by Support Metric ===


Support(Coffee <=> Bread) = 0.148
Support(Coffee <=> Cake) = 0.090
Support(Coffee <=> Tea) = 0.082
Support(Coffee <=> Pastry) = 0.078
Support(Coffee <=> NONE) = 0.070
Support(Coffee <=> Sandwich) = 0.063
Support(Medialuna <=> Coffee) = 0.058
Support(Hot chocolate <=> Coffee) = 0.049
Support(Bread <=> Pastry) = 0.048
Support(Coffee <=> Cookies) = 0.046

==== Top 10 Associations by Confidence Metric ===


Confidence(Postcard => Tshirt) = 0.857
Confidence(Extra Salami or Feta => Coffee) = 0.838
Confidence(Mighty Protein => Coffee) = 0.818
Confidence(Keeping It Local => Coffee) = 0.810
Confidence(Basket => Coffee) = 0.800
Confidence(Nomad bag => Bread) = 0.750
Confidence(Duck egg => NONE) = 0.750
Confidence(Toast => Coffee) = 0.737
Confidence(Tartine => Coffee) = 0.690
Confidence(Art Tray => Coffee) = 0.684

==== Top 10 Associations by Lift Metric ===


Lift(Postcard <=> Tshirt) = 548.286
Lift(Duck egg <=> Spanish Brunch) = 33.471
Lift(Duck