In [9]:
import pandas as pd
import itertools

'''
Get Seed and repeat
'''
# Generate seed C1
def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    return [frozenset(i) for i in C1]


#Scan through datasets, remove the ones below min support
def scanD(D, Ck, minSupport):
    # Generate counts for the C
    #print('D',D)
    #print('Ck',Ck)
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if can not in ssCnt:
                    ssCnt[can] = 1
                else:
                    ssCnt[can] += 1
    #print('Cn with count:',pd.DataFrame.from_dict(ssCnt,orient='index'))
    
    # Check weather meet minimum support or not, remove the ones below
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]
        #print('support',support,'min_support',minSupport,'Bool',support >= minSupport)
        if support >= minSupport*numItems:
            retList.insert(0, key)
            supportData[key] = support
    #print('supportData:',pd.DataFrame.from_dict(supportData,orient='index'))
    #print('-'*30)
    return retList, supportData

def notRedundant(mergedlist,Lkm1,km1,l1,l2):
    sub_mergedlist=[set(i) for i in itertools.combinations(mergedlist,km1) if set(i) not in [l1,l2]]
    for i in sub_mergedlist:
        if i not in Lkm1:
            return False
    return True


#Generate Ck according to Lk
def aprioriGen(Lk, k):
    #print('AprioriGen Lk',Lk)
    #print('AprioriGen k',k)
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[: k-2]
            L2 = list(Lk[j])[: k-2]
            L1.sort()
            L2.sort()
            #If the all elements exept the last are same, merge two elements
            if L1 == L2:
                mergedlist=Lk[i] | Lk[j]
                #print('---------',mergedlist,k-1)
                if notRedundant(mergedlist,Lk,k-1,Lk[i],Lk[j]):
                    retList.append(mergedlist)
                #else:
                #    print('----Remove-----',mergedlist,k-1)
    return retList


def apriori(dataSet, minSupport=0.5):
    #Generate C1 
    C1 = createC1(dataSet)
    D=list(map(set,dataSet))
    L1, supportData = scanD(D, C1, minSupport)

    #print('supportData',pd.DataFrame(supportData.values(),supportData.keys()))
    #print('L1',(L1))
    #print('------Seed------'*5,'\n')

    #repeatively generate L and C
    L = [L1]
    k = 2 #help to put values in L
    while (len(L[k-2]) > 0):
        #print('L_',k-2,L[k-2])
        Ck = aprioriGen(L[k-2], k) 
        #print(pd.DataFrame({'Ck':Ck}))
        Lk, supK = scanD(D, Ck, minSupport) 
        #print(pd.DataFrame(supK.values(),supK.keys()))
        supportData.update(supK)
        #print('*'*10)
        if len(Lk) == 0:
            break
        # all the Lk generated along the way are put into L.
        L.append(Lk)
        k += 1
    return L, supportData


'''
Mine Association Rule
'''
#the real calculation from the table
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = []
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq]
        #print('freqSet:',freqSet)
        #print('conseq:',conseq)
        #print('freqSet-conseq',freqSet-conseq)
        #print('conf:',conf)
        if conf >= minConf:
            #print(freqSet-conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet,freqSet-conseq, conseq,supportData[freqSet-conseq],supportData[freqSet],supportData[conseq]))
            prunedH.append(conseq)
    return prunedH

# Generate set
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    #print('*'*10,H[0])
    m = len(H[0])
    if (len(freqSet) > (m + 1)):
        H = calcConf(freqSet, H, supportData, brl, minConf)
        Hmp1 = aprioriGen(H, m+1)
        #print('*'*20,Hmp1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        #print('Hmp1=', Hmp1)
        #print('len(Hmp1)=', len(Hmp1), 'len(freqSet)=', len(freqSet))
        if (len(Hmp1) >= 1):
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)

# Combine all rules
def generateRules(L, supportData, minConf=0.7):
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            #print(H1)
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            #print('-'*30)
    return bigRuleList

def printrules(rules,total):
    C=['Frequent Itemset(AUB)','From(A)','To(B)','Confident Number(num of A)','Support Number(num of AUB)','Support Number(num of B)']
    Ruletable=pd.DataFrame(rules)
    Ruletable.columns=C
    Ruletable['Confidence']=Ruletable['Support Number(num of AUB)']/Ruletable['Confident Number(num of A)']
    Ruletable['Support']=Ruletable['Support Number(num of AUB)']/total
    Ruletable['Lift']=Ruletable['Support Number(num of AUB)']/Ruletable['Support Number(num of B)']
    return Ruletable

def printsupport(S):
    S_table=pd.DataFrame.from_dict(S,orient='index').reset_index()
    S_table.columns=['itemset','support']
    S_table
    return S_table


#partition
def partition(fulldataSet,n,minSupport):#n specific how many partitions you want
    split=[i for i in range(0,len(fulldataSet),round(len(fulldataSet)/n))]
    A=[(split[i],split[i+1]) if i+1!=len(split) else (split[i],len(fulldataSet)) for i in range(len(split))]
    partitionL=[]
    #print(partitionL)
    for i in A:
        sub_dataset=fulldataSet[i[0]:i[1]]
        LI,_ = apriori(sub_dataset, minSupport)
        flat_list = [item for sublist in LI for item in sublist]
        partitionL=partitionL+flat_list
    #print(len(set(partitionL)))
    #print('***last scan***'*8)
    alllist, allsupport=scanD(fulldataSet,list(set(partitionL)),minSupport)
    return alllist,allsupport

if __name__ == "__main__":
    dataSet = [['I1','I2','I5'],['I2','I4'],['I2','I3'],['I1','I2','I4'],['I1','I3'],['I2','I3'],['I1','I3'],['I1','I2','I3','I5'],['I1','I2','I3']]
    LI, S = apriori(dataSet, minSupport=1.9/9)
    #print(LI)
    LP,SP=partition(dataSet,3,minSupport=1.9/9)
    rules = generateRules(LI, S, minConf=0.5)
    #printrules(rules,len(dataSet))

In [10]:
printrules(rules,len(dataSet))



Unnamed: 0,Frequent Itemset(AUB),From(A),To(B),Confident Number(num of A),Support Number(num of AUB),Support Number(num of B),Confidence,Support,Lift
0,"(I3, I1)",(I1),(I3),6,4,6,0.666667,0.444444,0.666667
1,"(I3, I1)",(I3),(I1),6,4,6,0.666667,0.444444,0.666667
2,"(I3, I2)",(I2),(I3),7,4,6,0.571429,0.444444,0.666667
3,"(I3, I2)",(I3),(I2),6,4,7,0.666667,0.444444,0.571429
4,"(I4, I2)",(I4),(I2),2,2,7,1.0,0.222222,0.285714
5,"(I1, I2)",(I2),(I1),7,4,6,0.571429,0.444444,0.666667
6,"(I1, I2)",(I1),(I2),6,4,7,0.666667,0.444444,0.571429
7,"(I1, I5)",(I5),(I1),2,2,6,1.0,0.222222,0.333333
8,"(I2, I5)",(I5),(I2),2,2,7,1.0,0.222222,0.285714
9,"(I3, I1, I2)","(I1, I2)",(I3),4,2,6,0.5,0.222222,0.333333


In [3]:
split=[i for i in range(0,len(dataSet),round(len(dataSet)/3))]
A=[(split[i],split[i+1]) if i+1!=len(split) else (split[i],len(dataSet)) for i in range(len(split))]
partitionL=[]
#print(partitionL)
for i in A:
    sub_dataset=dataSet[i[0]:i[1]]
    LI,_ = apriori(sub_dataset, minSupport=1.9/9)
    flat_list = [item for sublist in LI for item in sublist]
    partitionL=partitionL+flat_list
print(len(set(partitionL)))
print('***last scan***'*8)
alllist, allsupport=scanD(dataSet,list(set(partitionL)),1.9/9)
'''
#print(pd.DataFrame({'key':allsupport.keys(),'value':allsupport.values()}))
print(pd.DataFrame(allsupport.values(),allsupport.keys()))'''

D [{'I5', 'I2', 'I1'}, {'I4', 'I2'}, {'I2', 'I3'}]
Ck [frozenset({'I1'}), frozenset({'I2'}), frozenset({'I3'}), frozenset({'I4'}), frozenset({'I5'})]
Cn with count:       0
(I1)  1
(I2)  3
(I5)  1
(I4)  1
(I3)  1
supportData:              0
(I1)  0.333333
(I2)  1.000000
(I5)  0.333333
(I4)  0.333333
(I3)  0.333333
------------------------------
D [{'I5', 'I2', 'I1'}, {'I4', 'I2'}, {'I2', 'I3'}]
Ck [frozenset({'I4', 'I3'}), frozenset({'I5', 'I3'}), frozenset({'I2', 'I3'}), frozenset({'I1', 'I3'}), frozenset({'I4', 'I5'}), frozenset({'I4', 'I2'}), frozenset({'I4', 'I1'}), frozenset({'I5', 'I2'}), frozenset({'I5', 'I1'}), frozenset({'I2', 'I1'})]
Cn with count:           0
(I5, I2)  1
(I5, I1)  1
(I2, I1)  1
(I4, I2)  1
(I2, I3)  1
supportData:                  0
(I5, I2)  0.333333
(I5, I1)  0.333333
(I2, I1)  0.333333
(I4, I2)  0.333333
(I2, I3)  0.333333
------------------------------
D [{'I5', 'I2', 'I1'}, {'I4', 'I2'}, {'I2', 'I3'}]
Ck [frozenset({'I5', 'I2', 'I1'})]
Cn with count:   

"\n#print(pd.DataFrame({'key':allsupport.keys(),'value':allsupport.values()}))\nprint(pd.DataFrame(allsupport.values(),allsupport.keys()))"

In [4]:
def printtable(support):
    subdatasetrun=pd.DataFrame.from_dict(support,orient='index')
    subdatasetrun.reset_index(inplace=True)
    c=['itemset','support']
    subdatasetrun.columns=c
    subdatasetrun['len']=[len(i) for i in subdatasetrun['itemset']]
    return subdatasetrun.sort_values('len').reset_index(drop=True)

In [5]:
printtable(S)

Unnamed: 0,itemset,support,len
0,(I1),0.666667,1
1,(I2),0.777778,1
2,(I5),0.222222,1
3,(I4),0.222222,1
4,(I3),0.666667,1
5,"(I5, I2)",0.222222,2
6,"(I5, I1)",0.222222,2
7,"(I2, I1)",0.444444,2
8,"(I4, I2)",0.222222,2
9,"(I2, I3)",0.444444,2


In [6]:
printtable(allsupport)

Unnamed: 0,itemset,support,len
0,(I2),0.777778,1
1,(I1),0.666667,1
2,(I5),0.222222,1
3,(I4),0.222222,1
4,(I3),0.666667,1
5,"(I2, I1)",0.444444,2
6,"(I5, I1)",0.222222,2
7,"(I5, I2)",0.222222,2
8,"(I4, I2)",0.222222,2
9,"(I2, I3)",0.444444,2


In [9]:
all([S[i]==allsupport[i] for i in S if i in allsupport])

True