## 频繁项集

In [2]:

from numpy import *

import pandas as pd

# 将所有元素各自转换为只含单个元素的frozenset（不可变集合），存放到列表中

def createC1(dataSet):
    """Build the size-1 candidate itemsets (C1) from a collection of transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list with one ``frozenset([item])`` per distinct item, ordered by
        ascending item value.  Frozensets are hashable, so they can be used
        as dictionary keys when supports are counted.
    """
    # Collect distinct items in a set; the previous list-membership test
    # rescanned the whole candidate list for every item of every transaction.
    unique_items = set()
    for transaction in dataSet:
        unique_items.update(transaction)
    return [frozenset([item]) for item in sorted(unique_items)]

 

# 过滤掉不符合支持度的集合

# 返回 频繁项集列表retList 所有元素的支持度字典

def scanD(D, Ck, minSupport):
    """Count candidate itemsets over the transactions and keep the frequent ones.

    Args:
        D: sized collection of transactions (sets).
        Ck: iterable of candidate itemsets (frozensets, used as dict keys).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        (retList, supportData):
        retList lists the candidates whose support is >= minSupport, in the
        reverse of the order in which they were first matched.
        supportData maps every candidate that matched at least one
        transaction to its support; candidates that never matched are absent.
    """
    hits = {}
    for transaction in D:
        for candidate in Ck:
            if not candidate.issubset(transaction):
                continue
            hits[candidate] = hits.get(candidate, 0) + 1

    total = float(len(D))
    supportData = {candidate: count / total for candidate, count in hits.items()}

    # Equivalent to inserting each qualifying key at the front of the list.
    frequent = [candidate for candidate in supportData
                if supportData[candidate] >= minSupport]
    frequent.reverse()
    return frequent, supportData

 

# 生成所有可以组合的集合

# 频繁项集列表Lk 项集元素个数k  [frozenset({2, 3}), frozenset({2, 5})] -> [frozenset({2, 3, 5})]

def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are joined when their k-2 smallest items are identical, so
    each candidate is produced by exactly one pair.

    Args:
        Lk: list of frozensets, each expected to hold k-1 mutually
            comparable items.
        k: size of the candidates to generate.

    Returns:
        List of frozensets, the unions of every joinable pair, in pair order
        (i < j).
    """
    # Sort BEFORE slicing: a set's iteration order is arbitrary, so slicing
    # the unsorted list (as the previous code did) could compare the wrong
    # items and silently miss candidates, e.g. {1, 8} and {1, 2} for k=3.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList

 

# 封装所有步骤的函数

# 返回 所有满足大于阈值的组合 集合支持度列表

def apriori(dataSet, minSupport):
    """Run the level-wise Apriori search for frequent itemsets.

    Args:
        dataSet: re-iterable collection of transactions (it is traversed
            twice: once to build the transaction sets, once by createC1).
        minSupport: minimum support passed through to scanD.

    Returns:
        (L, supportData):
        L[i] is the list of frequent itemsets of size i+1; the final entry
        of L is always an empty list, because the loop stops on the first
        level that yields no frequent itemset.
        supportData maps every counted itemset (frequent or not) to its
        support.
    """
    transactions = [set(record) for record in dataSet]
    singles = createC1(dataSet)
    frequent, supportData = scanD(transactions, singles, minSupport)

    L = [frequent]
    k = 2  # size of the itemsets generated in the next round
    while L[-1]:
        candidates = aprioriGen(L[-1], k)
        frequent, level_support = scanD(transactions, candidates, minSupport)
        supportData.update(level_support)
        L.append(frequent)
        k += 1
    return L, supportData



## 关联规则

In [3]:

# 获取关联规则的封装函数

def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: list of levels as returned by apriori; L[i] holds the frequent
            itemsets of size i+1.  Level 0 is skipped because a rule needs
            at least two items.
        supportData: dict mapping itemsets (frozensets) to their support.
        minConf: minimum confidence a rule must reach to be reported.

    Returns:
        List of (antecedent, consequent, confidence) tuples.  Accepted rules
        are also printed by calcConf as they are found.
    """
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            # Single-item consequents, e.g. frozenset({2, 3}) -> [{2}, {3}].
            H1 = [frozenset([item]) for item in freqSet]

            # Always score the single-item consequents first.  Previously,
            # itemsets with three or more items went straight to
            # rulesFromConseq, which only scores the merged (2+ item)
            # consequents, so rules such as {a, b} -> {c} were never produced.
            kept = calcConf(freqSet, H1, supportData, bigRuleList, minConf)

            # Larger consequents can only pass if every item in them passed
            # on its own (a larger consequent means a smaller antecedent with
            # higher support, hence lower confidence), so merge only the
            # survivors.  At least two are needed to form a merged consequent.
            if i > 1 and len(kept) > 1:
                rulesFromConseq(freqSet, kept, supportData, bigRuleList, minConf)
    return bigRuleList

 

# 对规则进行评估 获得满足最小可信度的关联规则

def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Score candidate consequents of one frequent itemset and keep the confident ones.

    For each consequent in H the rule (freqSet - consequent) -> consequent is
    scored as support(freqSet) / support(freqSet - consequent).

    Args:
        freqSet: the frequent itemset (frozenset) the rules are drawn from.
        H: iterable of candidate consequents (frozensets).
        supportData: dict mapping itemsets to support; must contain freqSet
            and every antecedent, otherwise KeyError is raised.
        brl: list that accepted (antecedent, consequent, confidence) tuples
            are appended to (mutated in place).
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        List of the consequents whose rule reached minConf.  Each accepted
        rule is also printed.
    """
    accepted = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            print(antecedent, '-->', conseq, 'conf:', conf)
            brl.append((antecedent, conseq, conf))
            accepted.append(conseq)
    return accepted

 

# 生成候选规则集合

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Grow rule consequents one item at a time and score each round.

    Starting from the consequents in H (all assumed to have the same size m),
    the consequents are merged into (m+1)-item sets with aprioriGen, scored
    with calcConf, and the survivors are merged again, until the consequent
    would swallow the whole itemset or fewer than two survivors remain.
    The consequents passed in as H are not scored themselves.

    Args:
        freqSet: the frequent itemset (frozenset) the rules are drawn from.
        H: non-empty list of equally sized consequents (frozensets).
        supportData: dict mapping itemsets to support.
        brl: list that accepted rules are appended to (via calcConf).
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        None; results are delivered through brl.
    """
    current = H
    while True:
        size = len(current[0])
        if len(freqSet) <= size + 1:
            # A larger consequent would leave no antecedent.
            break
        merged = aprioriGen(current, size + 1)
        current = calcConf(freqSet, merged, supportData, brl, minConf)
        if len(current) <= 1:
            # At least two consequents are needed for another merge.
            break

## 申万一级行业涨跌分析

In [6]:
# Load the sector-index table and recode every cell as an integer item id.
# Per the original author's note, each row is one trading day of the Shenwan
# level-1 sector indices.  A cell becomes (column position + 1) * 10 + 1 when
# the raw value is >= 0 (close up or flat versus the previous close, per that
# note) and (column position + 1) * 10 + 2 otherwise, so the leading digit(s)
# identify the sector and the last digit the direction.
sector_index = pd.read_csv('申万一级行业指数涨跌180101-200327d.csv', encoding="gbk")
sector_index['交易日'] = pd.to_datetime(sector_index['交易日'])
sector_index = sector_index.set_index('交易日')
for sector in sector_index.columns.to_list():
    # The sector's base code is the same for the whole column; compute it once.
    base = (sector_index.columns.get_loc(sector) + 1) * 10
    sector_index[sector] = sector_index[sector].apply(
        lambda x, base=base: base + 1 if x >= 0 else base + 2
    )
sector_index

Unnamed: 0_level_0,农林牧渔1,采掘2,化工3,钢铁4,有色金属5,电子6,家用电器7,食品饮料8,纺织服装9,轻工制造10,...,建筑装饰19,电气设备20,国防军工21,计算机22,传媒23,通信24,银行25,非银金融26,汽车27,机械设备28
交易日,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-27,12,21,32,41,52,62,71,81,91,101,...,191,202,212,222,232,242,251,261,272,281
2020-03-26,11,22,32,42,52,62,72,82,92,102,...,192,202,212,222,232,242,251,262,272,282
2020-03-25,11,21,31,41,51,61,71,81,91,101,...,191,201,211,221,231,241,251,261,271,281
2020-03-24,11,21,31,41,51,61,71,81,91,101,...,191,201,211,221,231,241,251,261,271,281
2020-03-23,12,22,32,42,52,62,72,82,92,102,...,192,202,212,222,232,242,252,262,272,282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-01-08,11,21,31,41,51,62,71,81,92,101,...,191,202,212,222,232,242,252,261,271,282
2018-01-05,11,22,31,42,52,62,71,81,92,101,...,191,202,212,222,231,242,251,262,272,282
2018-01-04,11,21,31,41,51,61,71,81,91,101,...,192,202,211,222,232,241,252,262,272,281
2018-01-03,11,21,31,41,51,61,71,81,91,101,...,191,201,211,221,231,241,251,262,271,281


In [5]:
def support_conf(df,minSupport,minConf):
    """Mine association rules from a coded DataFrame and tabulate them.

    Every row of df is treated as one transaction (the set of its cell
    values).  Frequent itemsets are found with apriori, rules with
    generateRules; the accepted rules and the full support dictionary are
    printed along the way.

    Args:
        df: DataFrame whose cell values are the items.
        minSupport: minimum support for frequent itemsets.
        minConf: minimum confidence for rules.

    Returns:
        DataFrame with columns '前项' (antecedent), '后项' (consequent),
        '支持度' (support of antecedent and consequent together) and
        '置信度' (confidence), one row per rule in the order generateRules
        returned them.  Only one element of each side is kept (the first one
        yielded by iterating the frozenset), so a rule with several items on
        one side is shown by a single representative item.  When no rule is
        found the frame is empty but still has all four columns.
    """
    # One set per row; all sets are collected in a list.
    sector_DatSet = [set(df.iloc[i, :]) for i in range(df.shape[0])]

    # L: itemsets meeting the support threshold; suppData: itemset -> support.
    L, suppData = apriori(sector_DatSet, minSupport=minSupport)
    print('置信度：')
    rules = generateRules(L, suppData, minConf=minConf)

    # Build the table from plain lists with explicit columns.  The previous
    # DataFrame(rules) + rename + get_loc sequence raised KeyError whenever
    # rules was empty, because the frame then had no columns at all.
    out_put = pd.DataFrame(
        {
            '前项': [next(iter(antecedent)) for antecedent, _, _ in rules],
            '后项': [next(iter(consequent)) for _, consequent, _ in rules],
            '支持度': [suppData[antecedent | consequent]
                    for antecedent, consequent, _ in rules],
            '置信度': [conf for _, _, conf in rules],
        },
        columns=['前项', '后项', '支持度', '置信度'],
    )
    print("支持度：",'\n',suppData)
    return out_put

In [6]:
# Column labels of sector_index with every ASCII digit removed, in column
# order; translate() indexes this list by sector number - 1.
sectors_nonumber = [
    ''.join(ch for ch in sector if ch not in '0123456789')
    for sector in sector_index.columns
]

In [7]:
def translate(df_output):
    """Replace the numeric item codes in a rules table with readable text, in place.

    A code's leading digit(s) select the sector name from the module-level
    sectors_nonumber list (1-based), and its last digit selects the
    direction: 1 gives '<sector>指数上升' (index rose), anything else gives
    '<sector>指数下降' (index fell).  For example, 282 reads as sector 28
    falling.

    Args:
        df_output: DataFrame with numeric '前项' and '后项' columns; both
            columns are overwritten with strings.

    Returns:
        None.
    """
    def describe(code):
        sector_no = int(code / 10)
        name = sectors_nonumber[sector_no - 1]
        if code - sector_no * 10 == 1:
            return name + '指数上升'
        return name + '指数下降'

    for column in ('前项', '后项'):
        df_output[column] = df_output[column].apply(describe)

## 结果展示

#### 1 长时间分析

In [8]:
# Long-window analysis: every row of sector_index is used as input (no date
# filter is applied here), with support threshold minSupport and confidence
# threshold minConf.
# NOTE(review): the original note described the window as 2019-01-01 to
# 2020-03-27, but nothing in this cell restricts the dates - confirm intent.
print('————长时间分析————')
long_input = sector_index
long_out_put = support_conf(long_input, minSupport=0.4, minConf=0.9)
print("长时间关联分析最终结果：")
# Strongest rules first, then turn the codes into readable text and save.
long_out_put = long_out_put.sort_values(by='置信度', axis=0, ascending=False)
translate(df_output=long_out_put)
long_out_put.to_csv("long_outputs.csv", index=False)

————长时间分析————
置信度：
frozenset({281}) --> frozenset({31}) conf: 0.9025270758122743
frozenset({282}) --> frozenset({192}) conf: 0.9097744360902256
frozenset({191}) --> frozenset({281}) conf: 0.902834008097166
支持度： 
 {frozenset({12}): 0.4990791896869245, frozenset({21}): 0.49355432780847147, frozenset({32}): 0.4714548802946593, frozenset({41}): 0.5009208103130756, frozenset({52}): 0.49171270718232046, frozenset({62}): 0.5027624309392266, frozenset({71}): 0.47882136279926335, frozenset({81}): 0.49171270718232046, frozenset({91}): 0.5211786372007366, frozenset({101}): 0.5248618784530387, frozenset({112}): 0.4953959484346225, frozenset({121}): 0.4972375690607735, frozenset({132}): 0.5285451197053407, frozenset({141}): 0.4714548802946593, frozenset({151}): 0.5101289134438306, frozenset({162}): 0.49171270718232046, frozenset({172}): 0.5046040515653776, frozenset({181}): 0.5193370165745856, frozenset({191}): 0.4548802946593002, frozenset({202}): 0.4972375690607735, frozenset({212}): 0.4861878453

In [9]:
long_out_put

Unnamed: 0,前项,后项,支持度,置信度
1,机械设备指数下降,建筑装饰指数下降,0.445672,0.909774
2,建筑装饰指数上升,机械设备指数上升,0.410681,0.902834
0,机械设备指数上升,化工指数上升,0.460405,0.902527


#### 2 短时间分析

In [10]:
# Short-window analysis: rows dated 2019-12-26 through 2020-03-27 inclusive
# (described by the original author as the COVID-19 outbreak period), with
# support threshold minSupport and confidence threshold minConf.
#
# The window is selected with an explicit date mask.  The previous
# truncate(after='2019-12-26').truncate(before='2020-03-27') chain had the
# two bounds reversed and only produced this window when the index was
# sorted newest-first AND truncate sliced positionally.
# NOTE(review): recent pandas releases appear to swap before/after for a
# decreasing index, which would make that chain return no rows - confirm
# against the installed version.
short_input = sector_index[
    (sector_index.index >= pd.Timestamp('2019-12-26'))
    & (sector_index.index <= pd.Timestamp('2020-03-27'))
]
print('————短时间分析————')
short_out_put=support_conf(short_input,minSupport=0.5,minConf=0.9)
print("短时间关联分析最终结果：")
# Strongest rules first, then turn the codes into readable text and save.
short_out_put.sort_values(by='置信度',axis=0,ascending=False, inplace=True)
translate(df_output=short_out_put)
short_out_put.to_csv("short_outputs.csv",index=False)

————短时间分析————
置信度：
frozenset({31}) --> frozenset({281}) conf: 0.9411764705882353
frozenset({281}) --> frozenset({31}) conf: 0.9142857142857143
frozenset({171}) --> frozenset({31}) conf: 0.9375
frozenset({201}) --> frozenset({281}) conf: 0.911764705882353
frozenset({281}) --> frozenset({221}) conf: 0.9142857142857143
frozenset({201}) --> frozenset({221}) conf: 0.9411764705882353
frozenset({231}) --> frozenset({281}) conf: 0.9696969696969696
frozenset({281}) --> frozenset({231}) conf: 0.9142857142857143
frozenset({231}) --> frozenset({31}) conf: 0.9393939393939394
frozenset({31}) --> frozenset({231}) conf: 0.911764705882353
frozenset({231}) --> frozenset({201}) conf: 0.9090909090909091
frozenset({231}) --> frozenset({221}) conf: 0.9090909090909091
frozenset({241}) --> frozenset({221}) conf: 0.9428571428571428
frozenset({271}) --> frozenset({281}) conf: 0.9090909090909091
frozenset({271}) --> frozenset({221}) conf: 0.9090909090909091
frozenset({271}) --> frozenset({241}) conf: 0.909090909

In [11]:
short_out_put

Unnamed: 0,前项,后项,支持度,置信度
6,传媒指数上升,机械设备指数上升,0.533333,0.969697
18,公用事业指数上升,建筑装饰指数上升,0.5,0.967742
12,通信指数上升,计算机指数上升,0.55,0.942857
5,电气设备指数上升,计算机指数上升,0.533333,0.941176
0,化工指数上升,机械设备指数上升,0.533333,0.941176
8,传媒指数上升,化工指数上升,0.516667,0.939394
2,综合指数上升,化工指数上升,0.5,0.9375
4,机械设备指数上升,计算机指数上升,0.533333,0.914286
7,机械设备指数上升,传媒指数上升,0.533333,0.914286
1,机械设备指数上升,化工指数上升,0.533333,0.914286
