#  使用Apriori算法进行关联分析

In [10]:
def loadDataSet():
    return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]
def createC1(dataSet):
    C1=[]
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    print(C1)
    # forzenset生成不可变的集合，如果不提供任何参数默认会产生空集合。
    #如果提供一个参数，则该参数必须是可迭代的，即，一个序列，或迭代器，或支持迭代的一个对象。
    return list(map(frozenset,C1))


In [11]:
dataSet=loadDataSet()
dataSet

[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [14]:
C1=createC1(dataSet)
C1

[[1], [2], [3], [4], [5]]


[frozenset({1}),
 frozenset({2}),
 frozenset({3}),
 frozenset({4}),
 frozenset({5})]

In [22]:
def scanD(D,Ck,minSupport):
    # 三个参数分别为：数据集，候选项集列表和最小支持度
    ssCnt={}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if  can not  in ssCnt:
                    ssCnt[can]=1
                else:
                    ssCnt[can]+=1
    numItems=float(len(D))
    retList=[]
    supportData={}
    for key in ssCnt:
        support=ssCnt[key]/numItems # 计算每项的支持度
        if support>=minSupport:
            retList.insert(0,key)
        supportData[key]=support
    return retList,supportData
                        
                    

In [23]:
D=list(map(set,dataSet))
D

[{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]

In [24]:
# 去掉哪些不满足最小支持度的项集
L1,supportData0 =scanD(D,C1,0.5) # 去除了4，值出现了一次
L1 

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

In [34]:
# 组织完整的Apriori算法
def aprioriGen(Lk,k) :# creats Ck
    # Lk 频繁项集列表，k 项集元素个数
    retList=[]
    lenLk=len(Lk)
    for i in range(lenLk):
         for j in range(i+1, lenLk): 
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
            L1.sort(); L2.sort()
            if L1==L2:
                retList.append(Lk[i] | Lk[j]) #set union
    return retList
def apriori(dataSet,minSupport=0.5):
    C1=createC1(dataSet)
    D=list(map(set,dataSet))
    L1,supportData=scanD(D,C1,minSupport)
    L=[L1]
    k=2
    while (len(L[k-2])>0):
        Ck=aprioriGen(L[k-2],k)
        Lk,supK=scanD(D,Ck,minSupport)
        supportData.update(supK)
        L.append(Lk)
        k+=1
    return L,supportData

In [35]:
L,supportData=apriori(dataSet)
L

[[1], [2], [3], [4], [5]]


[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})],
 [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})],
 [frozenset({2, 3, 5})],
 []]

In [37]:
# 从频繁项集中挖掘关联规则
def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD
    # 三个参数 ：频繁项集列表、包含哪些频繁项集支持数据的字典，最小可信度阈值。
    bigRuleList = []
    for i in range(1, len(L)):#only get the sets with two or more items
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]# 包含单个元素集合的列表H1
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList  
#生成候规则集
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = [] #create new list to return
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
        if conf >= minConf: 
            print(freqSet-conseq,'-->',conseq,'conf:',conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH
# 对规则进行评估
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)): #try further merging
        Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):    #need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)

In [38]:
rules=generateRules(L, supportData, minConf=0.5)
rules

frozenset({3}) --> frozenset({2}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({5}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({3}) --> frozenset({1}) conf: 0.6666666666666666
frozenset({1}) --> frozenset({3}) conf: 1.0
frozenset({5}) --> frozenset({2, 3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({2, 5}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3, 5}) conf: 0.6666666666666666


[(frozenset({3}), frozenset({2}), 0.6666666666666666),
 (frozenset({2}), frozenset({3}), 0.6666666666666666),
 (frozenset({5}), frozenset({3}), 0.6666666666666666),
 (frozenset({3}), frozenset({5}), 0.6666666666666666),
 (frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({3}), frozenset({1}), 0.6666666666666666),
 (frozenset({1}), frozenset({3}), 1.0),
 (frozenset({5}), frozenset({2, 3}), 0.6666666666666666),
 (frozenset({3}), frozenset({2, 5}), 0.6666666666666666),
 (frozenset({2}), frozenset({3, 5}), 0.6666666666666666)]

# 发现毒蘑菇的相似特征

In [39]:
# 收集数据
mushDatSet = [line.split() for line in open("./mushroom.dat").readlines()]
print(len(mushDataSet))

FileNotFoundError: [Errno 2] No such file or directory: './mushroom.dat'

In [40]:

L, suppData = apriori(mushDatSet, minSupport=0.3)
for item in L[1]:
    if item.intersection('2'):
        print(item)

NameError: name 'mushDatSet' is not defined