In [1]:
# Load a small sample transaction set used by the test cells.
def loadDataSet():
    """Return a fresh list of four sample transactions.

    Each transaction is a list of integer item ids. A new list object is
    built on every call, so callers may mutate the result freely.
    """
    transactions = ((1, 3, 4), (2, 3, 5), (1, 2, 3, 5), (2, 5))
    return [list(transaction) for transaction in transactions]

In [2]:
# Smoke test: load the sample transactions and display them as the cell output.
dataset=loadDataSet()
dataset

[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [3]:
# Build the candidate 1-itemsets (C1) from the transactions in `dataset`.
def createCandidateSet(dataset):
    """Return every distinct item of `dataset` as a single-item frozenset.

    Parameters:
        dataset: iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list such as [frozenset({1}), frozenset({2}), ...], ordered by item.
        frozenset (not set) is used because the itemsets are later used as
        dictionary keys and therefore must be immutable and hashable.

    A concrete list is returned instead of a lazy ``map`` object: in Python 3
    a ``map`` can be consumed only once, so a stored result would appear
    empty the second time it is iterated.
    """
    uniqueItems = set()
    for transaction in dataset:
        uniqueItems.update(transaction)
    return [frozenset([item]) for item in sorted(uniqueItems)]
    

In [4]:
candidateset=createCandidateSet(dataset)
# Inspection prints are left disabled. If createCandidateSet hands back a lazy
# iterator (for example a Python 3 map object), listing it here would consume
# it and leave nothing for the cells that use `candidateset` afterwards.
#print(candidateset)
#print(list(candidateset))

In [5]:
def scanDataSet(dataSet, candidateSet,minSupport=0.5):
    """Count candidate itemsets over the transactions and keep the frequent ones.

    support(itemset) = (transactions containing itemset) / (total transactions)

    Parameters:
        dataSet: sized collection of transactions (each an iterable of items).
        candidateSet: iterable of candidate itemsets. It is materialised with
            list(), so a one-shot iterator is accepted. Each candidate must be
            hashable and provide issubset().
        minSupport: minimum support for a candidate to count as frequent.

    Returns:
        (resultList, supportData)
        resultList: candidates whose support is >= minSupport, in reverse
            order of the moment each was first counted.
        supportData: dict mapping every candidate found in at least one
            transaction to its support, including those below minSupport.
    """
    candidates = list(candidateSet)
    totalTransactions = float(len(dataSet))

    # Tally, per candidate, the number of transactions that contain it.
    occurrences = {}
    for transaction in dataSet:
        for itemset in candidates:
            if not itemset.issubset(transaction):
                continue
            if itemset in occurrences:
                occurrences[itemset] += 1
            else:
                occurrences[itemset] = 1

    supportData = {itemset: hits / totalTransactions
                   for itemset, hits in occurrences.items()}

    # Frequent itemsets are reported newest-first (reverse of counting order).
    resultList = [itemset for itemset in supportData
                  if supportData[itemset] >= minSupport]
    resultList.reverse()
    return resultList, supportData

In [6]:
def aprioriGeneratorRequenceList(requenceList,k):
    """Generate candidate k-itemsets by joining frequent (k-1)-itemsets.

    Apriori outline:
        while the current list of frequent itemsets is not empty:
            build the list of candidate itemsets with k items (k starts at 1)
            compute their support and drop the infrequent ones
            build the list of candidate itemsets with k+1 items

    Parameters:
        requenceList: list of frequent (k-1)-itemsets (frozensets) whose items
            are mutually orderable.
        k: number of items in each candidate to generate.

    Returns:
        List of frozensets, one per pair of input itemsets that share the same
        first k-2 items in sorted order.
    """
    prefixLength = k - 2
    # Sort BEFORE slicing. Set iteration order is arbitrary, so slicing the
    # unsorted items would compare arbitrary prefixes and could miss valid
    # candidates or emit the same candidate more than once.
    prefixes = [sorted(itemset)[:prefixLength] for itemset in requenceList]

    retList = []
    itemsetCount = len(requenceList)
    for i in range(itemsetCount):
        for j in range(i + 1, itemsetCount):
            if prefixes[i] == prefixes[j]:
                retList.append(requenceList[i] | requenceList[j])
    return retList

In [7]:
# Scan the sample transactions with the candidate 1-itemsets (default minSupport)
# and show the frequent itemsets together with the support dictionary.
resultList,supportData=scanDataSet(dataset,candidateset)
print(resultList,supportData)

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})] {frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75}


In [8]:
def apriori(dataSet,minSupport=0.5):
    """Run the Apriori level-wise search for frequent itemsets.

    Parameters:
        dataSet: collection of transactions.
        minSupport: minimum support passed to every scanDataSet call.

    Returns:
        (L, supportData)
        L: list of levels; L[i] holds the frequent itemsets found at the
            (i+1)-th level. The search stops after a level comes back empty,
            and that empty level is kept as the last element of L.
        supportData: the support dictionaries of all levels merged together.
    """
    # Level 1: candidate single-item sets filtered by minimum support.
    frequentLevel, supportData = scanDataSet(
        dataSet, createCandidateSet(dataSet), minSupport)
    L = [frequentLevel]

    size = 2
    while L[-1]:
        # Build the candidates of the next size from the latest frequent level,
        # then prune them by minimum support.
        candidates = aprioriGeneratorRequenceList(L[-1], size)
        frequentLevel, levelSupport = scanDataSet(dataSet, candidates, minSupport)
        supportData.update(levelSupport)
        L.append(frequentLevel)
        size += 1
    return L, supportData
    

In [9]:
# Run the full Apriori search on the sample transactions (default minSupport)
# and print the per-level frequent itemsets and the merged support dictionary.
L,supportData=apriori(dataset)
print(L)
print(supportData)

[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], [frozenset({2, 3, 5})], []]
{frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75, frozenset({1, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({1, 5}): 0.25, frozenset({1, 2}): 0.25, frozenset({2, 3, 5}): 0.5}


In [12]:
def calcConf(freqSet,premise,supportData,rules,minConf=0.7):
    """Compute rule confidences and keep the rules meeting the minimum confidence.

    For each element ``conseq`` of ``premise`` the rule
    ``(freqSet - conseq) --> conseq`` is scored as
    ``supportData[freqSet] / supportData[freqSet - conseq]``.

    Parameters:
        freqSet: frequent itemset (frozenset) the rules are derived from.
        premise: iterable of candidate itemsets; despite the name, each element
            is used as the right-hand side (consequent) of a rule.
        supportData: mapping from itemset to support; must contain freqSet and
            every freqSet - conseq, otherwise KeyError is raised.
        rules: list extended in place with (antecedent, consequent, confidence)
            tuples for each accepted rule.
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        List of the consequents whose rule was accepted.
    """
    prunedH = []
    for conseq in premise:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            print("前提：",antecedent,'-->',conseq,'此置信度为:',conf)
            rules.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet,premise,supportData,rules,minConf=0.7):
    """Generate rules from one frequent itemset, growing the consequent level by level.

    Parameters:
        freqSet: frequent itemset the rules are derived from.
        premise: non-empty list of equally sized candidate consequents for the
            first level (an empty list raises IndexError on premise[0]).
        supportData: mapping from itemset to support, forwarded to calcConf.
        rules: list that calcConf extends in place with accepted rules.
        minConf: minimum confidence, forwarded to calcConf.

    Returns:
        None; results are delivered through ``rules``.
    """
    consequentSize = len(premise[0])
    candidates = premise
    # A consequent must leave at least one item for the antecedent, so stop
    # once the consequent size reaches the size of the itemset.
    while consequentSize < len(freqSet):
        candidates = calcConf(freqSet, candidates, supportData, rules, minConf)
        if len(candidates) <= 1:
            # Fewer than two surviving consequents cannot be merged into a
            # larger one, so no further level exists.
            return
        consequentSize += 1
        # Merge the surviving consequents into candidates one item larger.
        candidates = aprioriGeneratorRequenceList(candidates, consequentSize)
            
            
def generateRules(freqSet,supportData,minConf=0.7):
    """Generate association rules from the levels of frequent itemsets.

    Parameters:
        freqSet: list of levels of frequent itemsets. The first level is
            skipped; rule generation starts from the second one.
        supportData: mapping from itemset to support.
        minConf: minimum confidence a rule must reach.

    Returns:
        List of (antecedent, consequent, confidence) tuples.
    """
    bigRuleList = []
    for level in freqSet[1:]:
        for fs in level:
            # Seed the search with every single item as a one-item consequent,
            # e.g. {2, 3} -> [frozenset({2}), frozenset({3})].
            singletons = [frozenset((item,)) for item in fs]
            rulesFromConseq(fs, singletons, supportData, bigRuleList, minConf)
    return bigRuleList

In [13]:
# End-to-end test
# 1. Load the sample data set.
dataSet=loadDataSet()
# 2. Use apriori to obtain the frequent itemsets and the support dictionary.
requenceList,supporData=apriori(dataSet,minSupport=0.5)
# 3. Derive the rules that meet the minimum confidence from those results.
rules=generateRules(requenceList,supporData,minConf=0.7)
print(rules)

前提： frozenset({5}) --> frozenset({2}) 此置信度为: 1.0
前提： frozenset({2}) --> frozenset({5}) 此置信度为: 1.0
前提： frozenset({1}) --> frozenset({3}) 此置信度为: 1.0
前提： frozenset({3, 5}) --> frozenset({2}) 此置信度为: 1.0
前提： frozenset({2, 3}) --> frozenset({5}) 此置信度为: 1.0
[(frozenset({5}), frozenset({2}), 1.0), (frozenset({2}), frozenset({5}), 1.0), (frozenset({1}), frozenset({3}), 1.0), (frozenset({3, 5}), frozenset({2}), 1.0), (frozenset({2, 3}), frozenset({5}), 1.0)]


#### 案例  发现蘑菇的相似特征   
#### 需求：只对包含某个特定元素项感兴趣

In [17]:
# Read the mushroom data: one transaction per line, whitespace-separated feature ids.
# The context manager closes the file handle as soon as the lines are parsed;
# the bare open(...).readlines() form left it open until garbage collection.
with open("mushroom.dat") as dataFile:
    mushDataSet=[line.split() for line in dataFile]
requenceList,suppData=apriori(mushDataSet,minSupport=0.3)
print(requenceList)

[[frozenset({'58'}), frozenset({'56'}), frozenset({'116'}), frozenset({'6'}), frozenset({'110'}), frozenset({'94'}), frozenset({'53'}), frozenset({'28'}), frozenset({'24'}), frozenset({'10'}), frozenset({'39'}), frozenset({'2'}), frozenset({'93'}), frozenset({'90'}), frozenset({'9'}), frozenset({'86'}), frozenset({'85'}), frozenset({'76'}), frozenset({'67'}), frozenset({'63'}), frozenset({'59'}), frozenset({'52'}), frozenset({'38'}), frozenset({'36'}), frozenset({'34'}), frozenset({'3'}), frozenset({'23'}), frozenset({'1'})], [frozenset({'85', '58'}), frozenset({'1', '24'}), frozenset({'36', '53'}), frozenset({'36', '116'}), frozenset({'36', '56'}), frozenset({'39', '56'}), frozenset({'24', '110'}), frozenset({'53', '110'}), frozenset({'34', '116'}), frozenset({'85', '116'}), frozenset({'86', '116'}), frozenset({'90', '116'}), frozenset({'34', '56'}), frozenset({'56', '85'}), frozenset({'86', '56'}), frozenset({'90', '56'}), frozenset({'56', '116'}), frozenset({'6', '36'}), frozenset({

In [18]:
# Search the second level of frequent itemsets for those containing feature id '2'
# (the poisonous marker). requenceList is [[level-1 itemsets], [level-2 itemsets], ...].
# Membership is tested with `in`: intersection('2') iterates the characters of its
# string argument, which matches the intended test only for a one-character id.
for item in requenceList[1]:
    if '2' in item:
        print(item)

frozenset({'2', '28'})
frozenset({'2', '53'})
frozenset({'2', '23'})
frozenset({'2', '34'})
frozenset({'2', '36'})
frozenset({'2', '59'})
frozenset({'2', '63'})
frozenset({'67', '2'})
frozenset({'2', '76'})
frozenset({'2', '85'})
frozenset({'2', '86'})
frozenset({'2', '90'})
frozenset({'2', '93'})
frozenset({'2', '39'})


In [19]:
# Repeat the search on the next level of (larger) frequent itemsets.
# `in` is used instead of intersection("2"), which would compare against the
# individual characters of the string rather than the id as a whole.
for item in requenceList[2]:
    if "2" in item:
        print(item)

frozenset({'2', '39', '53'})
frozenset({'2', '53', '90'})
frozenset({'2', '53', '86'})
frozenset({'2', '85', '53'})
frozenset({'2', '53', '34'})
frozenset({'2', '28', '39'})
frozenset({'2', '28', '90'})
frozenset({'2', '28', '86'})
frozenset({'2', '28', '85'})
frozenset({'2', '28', '63'})
frozenset({'2', '28', '59'})
frozenset({'2', '28', '34'})
frozenset({'2', '28', '53'})
frozenset({'2', '39', '93'})
frozenset({'2', '39', '90'})
frozenset({'2', '93', '90'})
frozenset({'2', '39', '86'})
frozenset({'2', '93', '86'})
frozenset({'2', '90', '86'})
frozenset({'2', '39', '85'})
frozenset({'2', '93', '85'})
frozenset({'2', '85', '90'})
frozenset({'2', '85', '86'})
frozenset({'2', '39', '76'})
frozenset({'2', '76', '86'})
frozenset({'2', '85', '76'})
frozenset({'67', '2', '86'})
frozenset({'67', '2', '85'})
frozenset({'67', '2', '34'})
frozenset({'67', '2', '39'})
frozenset({'2', '39', '63'})
frozenset({'2', '93', '63'})
frozenset({'2', '63', '90'})
frozenset({'2', '63', '86'})
frozenset({'2'

In [20]:
# Number of levels returned by apriori (one list per itemset size k).
# NOTE(review): apriori as written in this notebook stops only after appending an
# empty level, so presumably the largest frequent itemset size is this value
# minus one; confirm before reporting it.
len(requenceList)

10

In [15]:
def aprioriGeneratorRequenceList(requenceList,resultListk,k):
    """Generate candidate k-itemsets with the F(k-1) x F(1) method.

    Every frequent (k-1)-itemset is extended with every frequent 1-itemset it
    does not already contain. The result is the list of CANDIDATES; it has not
    been filtered by support.

    Parameters:
        requenceList: F(k-1), iterable of frequent (k-1)-itemsets.
        resultListk: F(1), iterable of frequent single-item itemsets
            (each an iterable holding one item, e.g. frozenset({2})).
        k: number of items each generated candidate must contain.

    Returns:
        List of distinct frozensets of size k, in order of first generation.

    NOTE(review): this definition reuses the name of the two-argument
    generator defined earlier in this notebook and replaces it once this cell
    runs; apriori and rulesFromConseq call the name with two arguments and
    would then raise TypeError. Consider giving this variant its own name.
    """
    retList = []
    seen = set()
    for itemset in requenceList:
        base = frozenset(itemset)
        for single in resultListk:
            candidate = base | frozenset(single)
            # Skip extensions that add nothing new (wrong size) and candidates
            # already produced from another (k-1)-itemset.
            if len(candidate) != k or candidate in seen:
                continue
            seen.add(candidate)
            retList.append(candidate)
    return retList

In [None]:
def apriorChoose(requenceList,dataset,minsupport=0.6):
    """Apply the support threshold to candidate itemsets.

    Candidates whose support does not reach ``minsupport`` are dropped.

    Parameters:
        requenceList: iterable of candidate itemsets (any iterable of hashable
            items; each is normalised to a frozenset so it can be a dict key).
        dataset: sized collection of transactions (each an iterable of items).
        minsupport: minimum support; a candidate is kept when its support is
            greater than or equal to this value, matching scanDataSet.

    Returns:
        (resultList, supportDate)
        resultList: frequent itemsets as frozensets, most recently counted first.
        supportDate: dict mapping each frequent itemset to its support.
    """
    numItem = float(len(dataset))
    # Convert each transaction once instead of once per candidate.
    transactions = [set(data) for data in dataset]

    candidateCounter = {}
    for requence in requenceList:
        candidate = frozenset(requence)
        for transaction in transactions:
            if candidate.issubset(transaction):
                candidateCounter[candidate] = candidateCounter.get(candidate, 0) + 1

    resultList = []   # frequent itemsets
    supportDate = {}  # support of each frequent itemset
    for key in candidateCounter:
        support = candidateCounter[key] / numItem
        if support >= minsupport:
            resultList.insert(0, key)
            supportDate[key] = support
    return resultList, supportDate
    

In [25]:
# Scratch check of set containment: is {2, 7} contained in the second transaction?
z = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

print('yes' if {2, 7} <= set(z[1]) else 'no')

no
