In [54]:
import pandas as pd
import itertools
'''
Get Seed and repeat
'''
# Generate seed C1
def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    return [frozenset(i) for i in C1]


#Scan through datasets, remove the ones below min support
def scanD(D, Ck, minSupport):
    # Generate counts for the C
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if can not in ssCnt:
                    ssCnt[can] = 1
                else:
                    ssCnt[can] += 1
    #print('Cn with count:',pd.DataFrame.from_dict(ssCnt,orient='index'))
    # Check weather meet minimum support or not, remove the ones below
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]#/numItems
        #print('support',support,'min_support',minSupport,'Bool',support >= minSupport)
        if support >= minSupport*numItems:
            retList.insert(0, key)
            supportData[key] = support
    #print('supportData:',pd.DataFrame.from_dict(supportData,orient='index'))
    #print('-'*30)
    return retList, supportData

def notRedundant(mergedlist,Lkm1,km1,l1,l2):
    sub_mergedlist=[set(i) for i in itertools.combinations(mergedlist,km1) if set(i) not in [l1,l2]]
    for i in sub_mergedlist:
        if i not in Lkm1:
            return False
    return True


#Generate Ck according to Lk
def aprioriGen(Lk, k):
    #print('AprioriGen Lk',Lk)
    #print('AprioriGen k',k)
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[: k-2]
            L2 = list(Lk[j])[: k-2]
            L1.sort()
            L2.sort()
            #If the all elements exept the last are same, merge two elements
            if L1 == L2:
                mergedlist=Lk[i] | Lk[j]
                #print('---------',mergedlist,k-1)
                if notRedundant(mergedlist,Lk,k-1,Lk[i],Lk[j]):
                    retList.append(mergedlist)
                #else:
                #    print('----Remove-----',mergedlist,k-1)
    return retList


def apriori(dataSet, minSupport=0.5):
    #Generate C1 
    C1 = createC1(dataSet)
    D=list(map(set,dataSet))
    L1, supportData = scanD(D, C1, minSupport)

    #print('supportData',pd.DataFrame(supportData.values(),supportData.keys()))
    #print('L1',(L1))
    #print('------Seed------'*5,'\n')

    #repeatively generate L and C
    L = [L1]
    k = 2 #help to put values in L
    while (len(L[k-2]) > 0):
        #print('L_',k-2,L[k-2])
        Ck = aprioriGen(L[k-2], k) 
        #print(pd.DataFrame({'Ck':Ck}))
        Lk, supK = scanD(D, Ck, minSupport) 
        #print(pd.DataFrame(supK.values(),supK.keys()))
        supportData.update(supK)
        #print('*'*10)
        if len(Lk) == 0:
            break
        # all the Lk generated along the way are put into L.
        L.append(Lk)
        k += 1
    return L, supportData


'''
Mine Association Rule
'''
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = []
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq]
        #print('freqSet:',freqSet)
        #print('conseq:',conseq)
        #print('freqSet-conseq',freqSet-conseq)
        #print('conf:',conf)
        if conf >= minConf:
            #print(freqSet-conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet,freqSet-conseq, conseq,supportData[freqSet-conseq],supportData[freqSet],supportData[conseq]))
            prunedH.append(conseq)
    return prunedH

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    #print('*'*10*m,'H[0]',H[0])
    #print('H:',H)
    if (len(freqSet) >(m+1)):
        H = calcConf(freqSet, H, supportData, brl, minConf)
        Hmp1 = aprioriGen(H, m+1)
        #print('*'*20*m,Hmp1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        #print('Hmp1=', Hmp1)
        #print('len(Hmp1)=', len(Hmp1), 'len(freqSet)=', len(freqSet))
        if (len(Hmp1) >= 1):
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)

# Generate Association rules
def generateRules(L, supportData, minConf=0.7):
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:#wont calculate with frequent item=1
            H1 = [frozenset([item]) for item in freqSet]
            #print('H1',H1)
            if (i > 1):#if only has two items, no need to split the relation
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            #print('-'*30)
    return bigRuleList

def printrules(rules,total):
    C=['Frequent Itemset(AUB)','From(A)','To(B)','Confident Number(num of A)','Support Number(num of AUB)','Support Number(num of B)']
    Ruletable=pd.DataFrame(rules)
    Ruletable.columns=C
    Ruletable['Confidence']=Ruletable['Support Number(num of AUB)']/Ruletable['Confident Number(num of A)']
    Ruletable['Support']=Ruletable['Support Number(num of AUB)']/total
    Ruletable['Lift']=Ruletable['Support Number(num of AUB)']/Ruletable['Support Number(num of B)']
    return Ruletable

def printsupport(S):
    S_table=pd.DataFrame.from_dict(S,orient='index').reset_index()
    S_table.columns=['itemset','support']
    S_table
    return S_table

if __name__ == "__main__":
    dataSet = [['I1','I2','I5'],['I2','I4'],['I2','I3'],['I1','I2','I4'],['I1','I3'],['I2','I3'],['I1','I3'],['I1','I2','I3','I5'],['I1','I2','I3']]
    LI, S = apriori(dataSet, minSupport=1.9/9)
    rules = generateRules(LI, S, minConf=0)
    printrules(rules,len(dataSet))


In [13]:
#step by step:
dataSet = [['I1','I2','I5'],['I2','I4'],['I2','I3'],['I1','I2','I4'],['I1','I3'],['I2','I3'],['I1','I3'],['I1','I2','I3','I5'],['I1','I2','I3']]

In [56]:
T=printrules(rules,len(dataSet))
#T.sort_value()
T.sort_values('Frequent Itemset(AUB)',ascending=False)

Unnamed: 0,Frequent Itemset(AUB),From(A),To(B),Confident Number(num of A),Support Number(num of AUB),Support Number(num of B),Confidence,Support,Lift
12,"(I2, I1, I3)","(I1, I3)",(I2),4,2,7,0.5,0.222222,0.285714
1,"(I1, I3)",(I1),(I3),6,4,6,0.666667,0.444444,0.666667
22,"(I2, I1, I5)",(I1),"(I2, I5)",6,2,2,0.333333,0.222222,1.0
21,"(I2, I1, I5)",(I5),"(I2, I1)",2,2,4,1.0,0.222222,0.5
20,"(I2, I1, I5)","(I2, I1)",(I5),4,2,2,0.5,0.222222,1.0
19,"(I2, I1, I5)","(I2, I5)",(I1),2,2,6,1.0,0.222222,0.333333
18,"(I2, I1, I5)","(I1, I5)",(I2),2,2,7,1.0,0.222222,0.285714
17,"(I2, I1, I3)",(I2),"(I1, I3)",7,2,4,0.285714,0.222222,0.5
16,"(I2, I1, I3)",(I1),"(I2, I3)",6,2,4,0.333333,0.222222,0.5
15,"(I2, I1, I3)",(I3),"(I2, I1)",6,2,4,0.333333,0.222222,0.5


In [41]:
SelectT=T[['Confident number(num of A)','From(A)','To(B)']]
SelectT.head()


Unnamed: 0,Confident number(num of A),From(A),To(B)
0,6,(I3),(I1)
1,6,(I1),(I3)
2,6,(I3),(I2)
3,7,(I2),(I3)
4,2,(I4),(I2)


In [46]:
setcounts={k:SelectT.loc[i,'Confident number(num of A)'] for i,k in enumerate(SelectT['From(A)'])}
Bcounts=[setcounts[i] for i in SelectT['To(B)']]
print(setcounts,Bcounts)

{frozenset({'I3'}): 6, frozenset({'I1'}): 6, frozenset({'I2'}): 7, frozenset({'I4'}): 2, frozenset({'I5'}): 2, frozenset({'I1', 'I3'}): 4, frozenset({'I2', 'I3'}): 4, frozenset({'I2', 'I1'}): 4, frozenset({'I1', 'I5'}): 2, frozenset({'I2', 'I5'}): 2} [6, 6, 7, 6, 7, 2, 7, 6, 6, 2, 7, 2, 7, 6, 6, 4, 4, 4, 7, 6, 2, 4, 2, 2]


In [18]:
printsupport(S)

Unnamed: 0,itemset,support
0,(I1),6
1,(I2),7
2,(I5),2
3,(I4),2
4,(I3),6
5,"(I5, I2)",2
6,"(I5, I1)",2
7,"(I1, I2)",4
8,"(I4, I2)",2
9,"(I3, I2)",4


In [56]:
C=['Frequent itemset','From','To','Confident number','Support Number']
Ruletable=pd.DataFrame(rules)
Ruletable.columns=C
Ruletable['Confidence']=Ruletable['Support Number']/Ruletable['Confident number']
Ruletable

Unnamed: 0,Frequent itemset,From,To,Confident number,Support Number,Confidence
0,"(I3, I1)",(I1),(I3),0.666667,0.444444,0.666667
1,"(I3, I1)",(I3),(I1),0.666667,0.444444,0.666667
2,"(I3, I2)",(I2),(I3),0.777778,0.444444,0.571429
3,"(I3, I2)",(I3),(I2),0.666667,0.444444,0.666667
4,"(I2, I4)",(I4),(I2),0.222222,0.222222,1.0
5,"(I2, I4)",(I2),(I4),0.777778,0.222222,0.285714
6,"(I2, I1)",(I1),(I2),0.666667,0.444444,0.666667
7,"(I2, I1)",(I2),(I1),0.777778,0.444444,0.571429
8,"(I5, I1)",(I1),(I5),0.666667,0.222222,0.333333
9,"(I5, I1)",(I5),(I1),0.222222,0.222222,1.0


In [25]:
#print(LI)

c=0
for i in LI:
    for ii in i:
        c+=1
        print(ii)
print(c)

frozenset({'I3'})
frozenset({'I4'})
frozenset({'I5'})
frozenset({'I2'})
frozenset({'I1'})
frozenset({'I3', 'I1'})
frozenset({'I3', 'I2'})
frozenset({'I2', 'I4'})
frozenset({'I2', 'I1'})
frozenset({'I5', 'I1'})
frozenset({'I5', 'I2'})
frozenset({'I3', 'I2', 'I1'})
frozenset({'I5', 'I2', 'I1'})
13


In [10]:
with_S=pd.DataFrame(S.values(),S.keys()).reset_index()
with_S[0]=with_S[0]*len(dataSet)
with_S


Unnamed: 0,index,0
0,(I1),6.0
1,(I2),7.0
2,(I5),2.0
3,(I4),2.0
4,(I3),6.0
5,"(I5, I2)",2.0
6,"(I5, I1)",2.0
7,"(I2, I1)",4.0
8,"(I2, I4)",2.0
9,"(I3, I2)",4.0
