In [42]:
import pandas as pd
import itertools
import ast

In [4]:
def findFrequentItemsets(transactions, itemsets, minSupport):
    frequentItemsets = {}
    for itemset in itemsets.keys():
            # counts the number of times that the itemset can be found in the transactions array
            for transaction in transactions:
                if isinstance(itemset, str):
                    if itemset in set(transaction):
                        itemsets[itemset] += 1
                elif set(itemset).issubset(set(transaction)):
                    itemsets[itemset] += 1
            # check to see if it is frequent or not. if it is then add the itemset to the frequent sets array
            support = itemsets[itemset]/len(transactions)
            if support >= minSupport:
                frequentItemsets[itemset] = support
    return frequentItemsets

In [5]:
class AssociationRule:
    def __init__(self, a, b, confidence):
        self.a = a
        self.b = b
        self.confidence = confidence
    def __str__(self):
        retstr = ''
        if len(self.a)==1:
            retstr+=self.a[0]
        else:
            retstr+=str(self.a)
        retstr += " implies "
        if len(self.b)==1:
            retstr+=self.b[0]
        else:
            retstr+=str(self.b)
        retstr+=f' with {self.confidence} confidence'
        return retstr

In [39]:
def generateAssociationRules(frequentItemsets, minConfidence):
    associationRules = set()
    for itemSet in frequentItemsets.keys():
        if not isinstance(itemSet, str):
            sets = []
            for i in range(1, len(itemSet)):
                sets.extend(list(itertools.combinations(itemSet, i)))
            for s in sets:
                if len(s)==1:
                    s=s[0]
                confidence = frequentItemsets[itemSet]/frequentItemsets[s]
                if confidence >= minConfidence:
                    associationRules.add(AssociationRule(s, tuple(set(itemSet)-set(list(s))), confidence))
    return associationRules

In [7]:
# transactions should be a 2 dimensional array of all the transactions
# min support and min confidence should be as a decimal, not a percentage
def bruteForce(transactions, minSupport, minConfidence):
    frequentSets = {}
    items = set()
    for transaction in transactions:
        for item in transaction:
            items.add(item)
    k = 1
    itemsets = {item: 0 for item in items}
    while True:
        f = findFrequentItemsets(transactions, itemsets, minSupport)
        if len(f.keys()) == 0:
            break
        frequentSets.update(f)
        k+=1
        itemsets = {item: 0 for item in set(itertools.combinations(items, k))}
    return generateAssociationRules(frequentSets, minConfidence)
    # do the confidence equations for all the combinations and return the association rules

In [44]:
file_path = './DicksSportingGoods.csv'
df = pd.read_csv(file_path)
transactions = df['Items Bought'].apply(ast.literal_eval).tolist()
for rule in bruteForce(transactions, 0.05, 0.25):
    print(rule)

('Running Shoes', 'Electrolyte Gels') implies Protein Powder with 0.25 confidence
Running Shoes implies ('Running Shoes', 'Electrolyte Gels') with 0.8 confidence
Swim Cap implies ('Swim Cap', 'Gatorade Bottle') with 0.5 confidence
Swim Goggles implies ('Swim Cap', 'Gatorade Bottle', 'Swim Goggles') with 0.25 confidence
Swim Cap implies ('Swim Cap', 'Swim Goggles') with 0.7499999999999999 confidence
Basketball implies ('Basketball Shoes', 'Basketball') with 0.5 confidence
Basketball Shoes implies ('Basketball Shoes', 'Basketball') with 0.6666666666666667 confidence
('Basketball Shoes', 'Basketball') implies Gatorade Bottle with 0.5 confidence
Electrolyte Gels implies ('Running Shoes', 'Electrolyte Gels') with 0.8 confidence
Basketball implies ('Gatorade Bottle', 'Basketball') with 0.5 confidence
Running Shoes implies ('Gatorade Bottle', 'Running Shoes') with 0.4 confidence
('Protein Powder', 'Electrolyte Gels') implies Running Shoes with 0.5 confidence
Basketball Shoes implies ('Basketb