In [70]:
import pandas as pd
from matplotlib import pyplot as plt

def getReceiptMbs():
    """
    Build the market baskets from data/75000i.csv.

    The file is read without a header row; its three columns are named
    recpt_id, quantity and item.

    return: dict mapping each receipt id to the set of item ids on that receipt
    """
    receipts = pd.read_csv("data/75000i.csv", names=['recpt_id', 'quantity', 'item'])

    # mbs: market baskets, one set of purchased items per receipt number
    mbs = {}
    for receipt_id, _quantity, purchased in receipts.values:
        mbs.setdefault(receipt_id, set()).add(purchased)

    return mbs

In [71]:
def getItemSets():
    """
    Collect the distinct item ids found in data/1000i.csv.

    NOTE(review): the baskets in getReceiptMbs come from data/75000i.csv while the
    item universe is read from data/1000i.csv; confirm both files share the same ids.

    return: set of every value in the 'item' column
    """
    column_names = ['recpt_id', 'quantity', 'item']
    receipts = pd.read_csv("data/1000i.csv", names=column_names)
    item_column = receipts['item']

    return set(item_column)

In [72]:
def getSupport(itemset, mbs):
    """
    Return the support of an itemset: the fraction of market baskets containing it.

    itemset: set (or frozenset) of items to look for
    mbs: iterable of market baskets; each basket is any iterable of items.
         It does not need to support len(), so generators work too.

    return: float in [0, 1]; 0.0 when there are no baskets at all
    """
    total = 0
    hits = 0
    for mb in mbs:
        total += 1
        if itemset.issubset(mb):
            hits += 1

    # With no baskets the ratio is undefined; report 0.0 instead of dividing by zero.
    if total == 0:
        return 0.0
    return hits / total

In [84]:
"""
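mbs: market baskets; a sized collection of baskets (e.g. the values of the receipt-number -> basket map)
itemset: set of all items
minSup: minimum support, as a fraction of all baskets

return: list [F1, F2, ...] where Fk holds the frequent itemsets of size k; the last entry is an empty list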
"""
def apriori(mbs, itemset, minSup):
    """
    Find every frequent itemset with a level-wise Apriori search.

    mbs: sized, re-iterable collection of market baskets (for example the
         ``.values()`` of the receipt -> basket dict, or a list of lists);
         each basket is an iterable of items
    itemset: iterable of all individual items to consider
    minSup: minimum support as a fraction of the baskets; an itemset is kept
            when its support is >= minSup

    return: list F where F[i] is the list of frequent itemsets of size i + 1.
            F[0] holds plain sets, later levels hold frozensets produced by
            candidateGen. The last entry is always an empty list (the level at
            which the search stopped).
    """
    # Support is undefined without baskets; nothing can be frequent.
    if len(mbs) == 0:
        return [[]]

    # F1: all single-item sets whose support reaches minSup
    F1 = [set([item]) for item in itemset if getSupport(set([item]), mbs) >= minSup]
    F = [F1]  # list of F1, F2, ..., Fn

    k = 1  # index to iterate F, e.g. F[0] == F1
    while len(F[k-1]) > 0:
        Ck = candidateGen(F[k-1], k-1)  # candidate frequent itemsets one item larger
        Fk = [candidate for candidate in Ck if getSupport(candidate, mbs) >= minSup]

        F.append(Fk)
        k += 1

    return F

In [74]:
def candidateGen(Fk, k):
    """
    Generate candidate frequent itemsets that are one item larger than those in Fk.

    Fk: list of frequent itemsets (sets or frozensets) that all have the same size
    k: size/level indicator supplied by the caller; not used by the computation
       and kept only so existing calls keep working

    return: set of frozensets; each is the union of two itemsets of Fk that
            differ by exactly one item, and every subset obtained by dropping
            one item from it is itself in Fk
    """
    # Hashable copy of Fk so each prune lookup is O(1) instead of a list scan.
    frequent = {frozenset(itemset) for itemset in Fk}

    # generate candidates of length k+1
    candidates = set()
    for itemset1 in Fk:
        for itemset2 in Fk:
            if itemset1 is itemset2:
                continue
            union = itemset1.union(itemset2)
            if len(union) == len(itemset1) + 1:
                candidates.add(frozenset(union))

    # prune candidates: all() stops at the first missing subset
    finalCandidates = set()
    for cand in candidates:
        if all((cand - {item}) in frequent for item in cand):
            finalCandidates.add(cand)

    return finalCandidates

In [75]:
def maximal(itemsets):
    """
    Keep only the maximal itemsets: those with no proper superset among the input.

    itemsets: list of lists of itemsets (sets or frozensets), e.g. the
              [F1, F2, ...] structure returned by apriori; empty levels are fine

    return: list of sets, in first-appearance order, one per distinct maximal itemset
    """
    # Flatten the levels, dropping repeated itemsets so a duplicate cannot
    # disqualify its own twin.
    seen = set()
    all_itemsets = []
    for itemset_list in itemsets:
        for itemset in itemset_list:
            key = frozenset(itemset)
            if key not in seen:
                seen.add(key)
                all_itemsets.append(set(itemset))

    # `a < b` is the proper-subset test; any() stops at the first superset found.
    return [
        candidate
        for candidate in all_itemsets
        if not any(candidate < other for other in all_itemsets)
    ]

In [76]:
def genRules(mbs, F, minConf):
    """
    Generate association rules with a single-item right-hand side.

    mbs: dict whose values are the market baskets (only ``mbs.values()`` is used)
    F: iterable of itemsets (sets or frozensets); itemsets with fewer than two
       items are skipped because they cannot form a rule
    minConf: minimum confidence; a rule is kept when
             support(itemset) / support(itemset - {item}) >= minConf

    return: list of [antecedent, item] pairs, where antecedent is the itemset
            without ``item``
    """
    baskets = mbs.values()
    rules = []

    for itemset in F:
        if len(itemset) < 2:
            continue

        # The numerator does not depend on which item is split off; scan once.
        itemsetSup = getSupport(itemset, baskets)

        for item in itemset:
            antecedent = itemset - set([item])
            antecedentSup = getSupport(antecedent, baskets)
            # Confidence is undefined when the antecedent never occurs.
            if antecedentSup == 0:
                continue
            if itemsetSup / antecedentSup >= minConf:
                rules.append([antecedent, item])

    return rules

In [77]:
def get_goods():
    """
    Read data/goods.csv and build one "<Flavor> <Food>" label per row.

    Single-quote characters are removed from both fields before joining.

    return: list of label strings in file row order
    """
    catalog = pd.read_csv("data/goods.csv")
    flavor_food = catalog[['Flavor','Food']]
    return [
        flavor.replace("'","") + " " + food.replace("'","")
        for flavor, food in flavor_food.values
    ]

In [78]:
def report_rules(maxRules):
    """
    Translate rules from item ids to human-readable goods labels.

    maxRules: iterable of rules; rule[0] is an iterable of item ids (left side)
              and rule[1] is a single item id (right side)

    Item ids are used directly as positions in the list from get_goods()
    (assumes goods.csv rows are ordered by id starting at 0 - TODO confirm).

    return: list of [left_labels, right_label] pairs, one per rule
    """
    labels = get_goods()
    return [
        [[labels[item] for item in list(rule[0])], labels[rule[1]]]
        for rule in maxRules
    ]

In [79]:
def report_itemsets(maxItemsets):
    """
    Translate itemsets from item ids to human-readable goods labels.

    maxItemsets: iterable of itemsets, each an iterable of item ids

    Item ids are used directly as positions in the list from get_goods()
    (assumes goods.csv rows are ordered by id starting at 0 - TODO confirm).

    return: list with one list of labels per itemset, in the same order
    """
    labels = get_goods()
    return [[labels[item] for item in list(itemset)] for itemset in maxItemsets]
            

In [89]:
def bakery_main(minSup=.02, minConf=.75):
    """
    Run the bakery analysis end to end and print the results.

    Loads the baskets and the item universe, mines frequent itemsets with
    apriori, reduces them to maximal itemsets, derives rules with genRules,
    then prints every maximal itemset with its support and every rule with
    its support and confidence (all rounded to 3 decimals).

    minSup: minimum support passed to apriori (default .02)
    minConf: minimum confidence passed to genRules (default .75)

    return: None; the results are only printed
    """
    mbs = getReceiptMbs()
    baskets = mbs.values()
    itemsets = getItemSets()
    frequent_itemsets = apriori(baskets, itemsets, minSup)
    maximal_itemsets = maximal(frequent_itemsets)
    labeled_itemsets = report_itemsets(maximal_itemsets)
    #Optimal confidence and support will make 10-50 maximal rules
    maximal_rules = genRules(mbs, maximal_itemsets, minConf)
    labeled_rules = report_rules(maximal_rules)

    #Report itemsets and supports
    for itemset, labels in zip(maximal_itemsets, labeled_itemsets):
        support = getSupport(itemset, baskets)
        print(labels, " has support: ", round(support,3))

    #Report rules, support of rules, and confidence of the rules
    for (antecedent, consequent), (left_labels, right_label) in zip(maximal_rules, labeled_rules):
        itemset = antecedent.union([consequent])
        # One scan for the full itemset serves both the support and the confidence.
        support = getSupport(itemset, baskets)
        confidence = support / getSupport(antecedent, baskets)

        print("rule: ", left_labels, "--> ",
              right_label,
              "\nSupport:",
              round(support,3),
              "Confidence:",
              round(confidence,3))

In [90]:
# Run the bakery analysis; it prints the maximal itemsets and the rules and returns None.
bakery_main()

['Chocolate Eclair']  has support:  0.042
['Vanilla Eclair']  has support:  0.043
['Almond Tart']  has support:  0.042
['Apricot Tart']  has support:  0.042
['Pecan Tart']  has support:  0.043
['Ganache Cookie']  has support:  0.043
['Chocolate Meringue']  has support:  0.042
['Vanilla Meringue']  has support:  0.042
['Almond Croissant']  has support:  0.043
['Chocolate Croissant']  has support:  0.043
['Almond Bear Claw']  has support:  0.042
['Blueberry Danish']  has support:  0.044
['Truffle Cake', 'Gongolais Cookie']  has support:  0.044
['Cheese Croissant', 'Orange Juice']  has support:  0.043
['Marzipan Cookie', 'Tuile Cookie']  has support:  0.051
['Napoleon Cake', 'Strawberry Cake']  has support:  0.043
['Bottled Water', 'Berry Tart']  has support:  0.038
['Lemon Cake', 'Lemon Tart']  has support:  0.037
['Blueberry Tart', 'Apricot Croissant', 'Hot Coffee']  has support:  0.033
['Single Espresso', 'Coffee Eclair', 'Blackberry Tart']  has support:  0.027
['Chocolate Cake', 'Casi

In [91]:
# Author lookup table: pipe-separated file without a header row, columns id and name.
authors = pd.read_csv("data/authorList.psv", sep="|",names= ['id','name'])

# Each line of bingoBaskets.csv is comma-separated; the first field is dropped
# (presumably a basket id - TODO confirm) and the rest are parsed as integer ids.
baskets = []
with open("data/bingoBaskets.csv", "r") as f:  # closed automatically, even on a parse error
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        tokens = line.split(",")
        # int() needs surrounding whitespace/newlines stripped from each field.
        basket = [int(item.strip()) for item in tokens[1:]]
        baskets.append(basket)

In [92]:
# Item universe for the author dataset: every id listed in the authors table.
itemsets = set(authors['id'])

In [95]:
# Mine frequent author-id sets from the bingo baskets with a minimum support of 0.20.
frequent_itemsets = apriori(baskets,itemsets, .20)
# NOTE(review): the disabled steps below would not label this dataset correctly as written:
# report_itemsets translates ids through get_goods() (data/goods.csv), not the authors table.
#maximal_itemsets = maximal(frequent_itemsets)
#labeled_itemsets = report_itemsets(maximal_itemsets)

In [96]:
# Display the mined levels [F1, F2, ...]; apriori always ends the list with an empty level.
frequent_itemsets

[[{2},
  {13},
  {48},
  {88},
  {91},
  {166},
  {240},
  {368},
  {445},
  {576},
  {644},
  {743},
  {747},
  {880},
  {960},
  {1029},
  {1085},
  {1109},
  {1217},
  {1279},
  {1283}],
 [frozenset({91, 1109})],
 []]