In [65]:
import pandas as pd

def getReceiptMbs(path="data/1000i.csv"):
    """Build the market baskets from a receipts CSV file.

    The file is read with the explicit column names 'recpt_id', 'quantity'
    and 'item'; because the names are supplied, the first line of the file is
    treated as data rather than as a header. The quantity column is read but
    not used.

    path: location of the CSV file; defaults to the notebook's data set.

    return: dict mapping each receipt id to the set of all items that appear
        on that receipt.
    """
    df = pd.read_csv(path, names=['recpt_id', 'quantity', 'item'])

    # mbs: market baskets; maps the receipt number to a set of all the items purchased
    mbs = {}
    for row in df.values:
        # row layout follows the column order above: [recpt_id, quantity, item]
        mbs.setdefault(row[0], set()).add(row[2])

    return mbs

In [66]:
def getItemSets(path="data/1000i.csv"):
    """Return the set of distinct items that occur in a receipts CSV file.

    Despite the name, the result is a flat set of individual item values
    (the 'item' column with duplicates removed), not a collection of itemsets.

    path: location of the CSV file; defaults to the notebook's data set.
    """
    df = pd.read_csv(path, names=['recpt_id', 'quantity', 'item'])

    return set(df['item'])

In [67]:
def getSupport(itemset, mbs):
    """Return the support of itemset: the fraction of market baskets containing it.

    itemset: set (or frozenset) of items to look for.
    mbs: sized collection of market baskets, each one a collection of items
        (for example the values() view of the receipt -> basket dict).

    return: number of baskets that contain every item of itemset, divided by
        the number of baskets. Returns 0.0 when there are no baskets at all,
        instead of dividing by zero.
    """
    total = len(mbs)
    if total == 0:
        return 0.0

    hits = sum(1 for mb in mbs if itemset.issubset(mb))
    return hits / total

In [68]:
"""
mbs: marketbaskets; map of receipt number to market basket
itemset: set of all items
minSup: minimum support number

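return: list F where F[i] is the list of frequent itemsets of size i+1 (the last entry is always an empty list)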
"""
def apriori(mbs, itemset, minSup):
    """Find the frequent itemsets level by level (Apriori).

    mbs: market baskets; dict mapping receipt number to the set of items bought.
    itemset: iterable of all individual items.
    minSup: minimum support, as a fraction of all baskets. An itemset is kept
        when its support is greater than or equal to this value.

    return: list F where F[i] is the list of frequent itemsets of size i+1.
        F[0] holds plain sets; later levels hold whatever candidateGen yields.
        The last entry of F is always an empty list: it is the level at which
        no candidate survived, which is what ends the search.
    """
    # Without baskets no support can be computed and nothing can be frequent;
    # return the same shape the search produces when F1 comes out empty.
    if len(mbs) == 0:
        return [[]]

    baskets = mbs.values()

    # F1: all itemsets of length 1 whose support is >= minSup
    F1 = [set([item]) for item in itemset
          if getSupport(set([item]), baskets) >= minSup]

    F = [F1]  # list of F1, F2, ..., Fn

    k = 1  # F[k-1] holds the frequent itemsets of size k, e.g. F[0] == F1
    while len(F[k-1]) > 0:
        # candidate frequent itemsets of size k+1, built from those of size k
        Ck = candidateGen(F[k-1], k)
        Fk = [candidate for candidate in Ck
              if getSupport(candidate, baskets) >= minSup]

        F.append(Fk)
        k += 1

    return F

In [69]:
def candidateGen(Fk, k):
    """Generate candidate frequent itemsets of size k+1 from those of size k.

    Fk: list of frequent itemsets (sets or frozensets), expected to all have
        the same size.
    k: the size of the itemsets in Fk. Not used by the computation; kept so
        the signature stays the same for callers.

    return: set of frozensets. Each one is the union of two itemsets of Fk
        that is exactly one item larger than the first of them, and every
        subset obtained by dropping a single item is itself present in Fk.
    """
    # Hashable copy of Fk so each prune lookup is a set membership test
    # rather than a scan over the whole list.
    frequent = {frozenset(s) for s in Fk}

    # Join step: generate candidates of length k+1. A set joined with itself
    # never grows, so the length test alone rules out identical pairs.
    candidates = set()
    for itemset1 in Fk:
        for itemset2 in Fk:
            union = itemset1.union(itemset2)
            if len(union) == len(itemset1) + 1:
                candidates.add(frozenset(union))

    # Prune step: keep a candidate only if all of its one-item-smaller subsets
    # are frequent; all() stops at the first missing subset.
    finalCandidates = set()
    for cand in candidates:
        if all(cand - {item} in frequent for item in cand):
            finalCandidates.add(cand)

    return finalCandidates

In [70]:
def main():
    """Load the baskets and the item universe, mine them, and print the result.

    Runs apriori with a minimum support of .03 and prints the list it returns.
    """
    baskets = getReceiptMbs()
    all_items = getItemSets()
    frequent = apriori(baskets, all_items, .03)
    print(frequent)

In [71]:
# Run the pipeline defined in main(); the printed result is the cell output below.
main()

[[{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, {15}, {16}, {17}, {18}, {19}, {20}, {21}, {22}, {23}, {24}, {25}, {26}, {27}, {28}, {29}, {30}, {31}, {32}, {33}, {34}, {35}, {36}, {37}, {39}, {40}, {41}, {42}, {43}, {44}, {45}, {46}, {47}, {48}, {49}], [frozenset({5, 22}), frozenset({48, 31}), frozenset({33, 42}), frozenset({36, 31}), frozenset({16, 45}), frozenset({11, 7}), frozenset({27, 28}), frozenset({9, 4}), frozenset({2, 46}), frozenset({12, 31}), frozenset({37, 7}), frozenset({44, 14}), frozenset({24, 23}), frozenset({1, 19}), frozenset({0, 2}), frozenset({18, 3}), frozenset({48, 12}), frozenset({18, 35}), frozenset({32, 45}), frozenset({40, 23}), frozenset({12, 36}), frozenset({24, 41}), frozenset({16, 32}), frozenset({35, 3}), frozenset({48, 36}), frozenset({24, 40}), frozenset({0, 46}), frozenset({15, 7})], [frozenset({16, 32, 45}), frozenset({48, 36, 31}), frozenset({35, 18, 3}), frozenset({48, 12, 36}), frozenset({36, 12, 31}), frozenset({