# 關聯規則（apriori演算法）
### 思路：
### 1. 以個別商品做關聯規則太分散，因此改以標籤（較大項的分類）來做：先將商品 id join 到 tag
### 2. 每筆資料的發票 id 只對應 1 個商品 id，因此先以發票 id 做 group，再用 group_concat() 把同一張發票的 tag 組合起來
### 3. 刪除發票 id、商品 id 欄位，只留下各筆交易所含的 tag

In [23]:
def load_data_set(path="3years_only_tag_over4200.csv", sep=';', encoding=None):
    """Read a delimited transaction file into a list of item lists.

    Each line of the file is treated as one transaction; the trailing
    newline is removed and the line is split on ``sep``.

    Args:
        path: File to read. Defaults to the full data file; pass
            ``"123.csv"`` to use the small test data set instead.
        sep: Field separator between items on a line.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what the original code used.

    Returns:
        A list with one entry per line, each entry being the list of
        strings obtained by splitting that line on ``sep``.
    """
    # The context manager guarantees the file handle is closed, even if
    # reading or decoding fails part-way through.
    with open(path, 'r', encoding=encoding) as data_file:
        return [line.replace('\n', '').split(sep) for line in data_file]

In [9]:
# ---------------------------- frozenset vs. set ----------------------------
# A set is an unordered collection that never holds duplicates.
# A set is mutable (add(), remove(), ...) and therefore has no hash value.
# A frozenset is an immutable set with a hash value, so it can be used as a
# dict key or as an element of another set; the trade-off is that it cannot
# be changed once created (no add()/remove()).

data_set = [
    ['a', 'b', 'd'],
    ['a', 'b', 'c', 'd'],
    ['a', 'b', 'd', 'e'],
    ['b', 'e', 'f'],
    ['a', 'b', 'd', 'f'],
    ['a', 'c', 'd', 'e'],
]

# Every distinct item, as plain values ...
C_set = {entry for basket in data_set for entry in basket}
# ... and the same items, each wrapped in its own single-element frozenset.
C_frSet = {frozenset([entry]) for basket in data_set for entry in basket}

C_frSet
#C_set

{frozenset({'e'}),
 frozenset({'b'}),
 frozenset({'f'}),
 frozenset({'a'}),
 frozenset({'d'}),
 frozenset({'c'})}

In [14]:
def create_C1(data_set):
    """Build the candidate 1-itemsets.

    Args:
        data_set: Iterable of transactions, each an iterable of items.

    Returns:
        A set holding one single-element frozenset per distinct item that
        appears in any transaction.
    """
    return {frozenset([entry]) for basket in data_set for entry in basket}

#節錄C1：
#  {frozenset({'軟糖\t糖'}),
#   frozenset({'吸管\t背心袋'}),
#   frozenset({'冰棒工藝\t筆芯\t便條紙\t筆芯'})}

In [17]:
def is_apriori(Ck_item, Lksub1):
    """Check whether a candidate k-itemset satisfies the Apriori property.

    Args:
        Ck_item: Candidate itemset (a frozenset).
        Lksub1: Collection of frequent (k-1)-itemsets.

    Returns:
        True when every subset obtained by removing exactly one item from
        ``Ck_item`` is contained in ``Lksub1``; False otherwise. An empty
        candidate yields True.
    """
    return all(
        (Ck_item - frozenset([member])) in Lksub1
        for member in Ck_item
    )

In [18]:
def create_Ck(Lksub1, k):
    """Create the candidate k-itemsets by joining L(k-1) with itself.

    Two (k-1)-itemsets are joined when their first ``k - 2`` items (in
    sorted order) are equal; the union is kept only if it passes
    ``is_apriori`` against ``Lksub1``.

    Args:
        Lksub1: L(k-1), the set of all frequent (k-1)-itemsets (frozensets).
        k: Number of items in the itemsets to generate.

    Returns:
        Ck, the set of all candidate k-itemsets.
    """
    prefix_len = k - 2
    # Sort every itemset once up front rather than inside the inner loop.
    keyed = [(sorted(itemset)[:prefix_len], itemset) for itemset in Lksub1]

    Ck = set()
    for i, (prefix_a, set_a) in enumerate(keyed):
        # Only pair with later entries: the join is symmetric, and joining
        # an itemset with itself just gives back that (k-1)-itemset, which
        # cannot pass is_apriori against a collection of (k-1)-itemsets.
        for prefix_b, set_b in keyed[i + 1:]:
            if prefix_a == prefix_b:
                Ck_item = set_a | set_b
                if is_apriori(Ck_item, Lksub1):
                    Ck.add(Ck_item)
    return Ck

In [15]:
def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """Filter the candidates in Ck down to the frequent itemsets Lk.

    Args:
        data_set: Sequence of transactions, each an iterable of items.
        Ck: Candidate itemsets (frozensets).
        min_support: Minimum support (fraction of transactions) required.
        support_data: Dict updated in place with ``itemset -> support`` for
            every itemset that is kept.

    Returns:
        The set of candidates whose support is >= ``min_support``.
    """
    # Count, for every candidate, the transactions that contain it.
    hits = {}
    for transaction in data_set:
        for candidate in Ck:
            if candidate.issubset(transaction):
                hits[candidate] = hits.get(candidate, 0) + 1

    total = float(len(data_set))
    Lk = set()
    for candidate, count in hits.items():
        support = count / total
        if support >= min_support:
            Lk.add(candidate)
            support_data[candidate] = support
    return Lk

In [16]:
def generate_L(data_set, k, min_support):
    """Generate the frequent itemsets of sizes 1 through k.

    Args:
        data_set: Sequence of transactions, each an iterable of items.
        k: Largest itemset size to generate.
        min_support: Minimum support passed to ``generate_Lk_by_Ck``.

    Returns:
        A tuple ``(L, support_data)``: ``L`` is a list whose first entry is
        the frequent 1-itemsets, followed by one entry per size from 2 to
        ``k``; ``support_data`` maps each kept itemset to its support.
    """
    support_data = {}

    # Size 1: candidates are simply every distinct item in the data.
    current = generate_Lk_by_Ck(
        data_set, create_C1(data_set), min_support, support_data
    ).copy()
    L = [current]

    # Sizes 2..k: join the previous level, then keep the frequent candidates.
    for size in range(2, k + 1):
        candidates = create_Ck(current, size)
        current = generate_Lk_by_Ck(
            data_set, candidates, min_support, support_data
        ).copy()
        L.append(current)

    return L, support_data

In [19]:
def generate_big_rules(L, support_data, min_conf):
    """Generate association rules from the frequent itemsets.

    For each frequent itemset ``F`` and each previously visited frequent
    itemset ``S`` that is a subset of ``F``, the rule ``(F - S) -> S`` is
    considered, with confidence ``support(F) / support(F - S)``.

    Args:
        L: List of collections of frequent itemsets (frozensets), as
            returned by ``generate_L``.
        support_data: Dict mapping itemsets to their support; it must
            contain every ``F`` and ``F - S`` that is looked up.
        min_conf: Minimum confidence a rule needs to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples, without
        duplicates, in the order the rules were found.
    """
    big_rule_list = []
    # Mirror of big_rule_list used only for duplicate checks: membership in
    # a set is O(1), whereas scanning the list for every new rule makes the
    # whole pass quadratic in the number of rules.
    emitted = set()
    visited_sets = []
    for level in L:
        for freq_set in level:
            for sub_set in visited_sets:
                if not sub_set.issubset(freq_set):
                    continue
                antecedent = freq_set - sub_set
                conf = support_data[freq_set] / support_data[antecedent]
                big_rule = (antecedent, sub_set, conf)
                if conf >= min_conf and big_rule not in emitted:
                    emitted.add(big_rule)
                    big_rule_list.append(big_rule)
            # Added only after its own rules are built, so an itemset is
            # never used as a consequent of itself.
            visited_sets.append(freq_set)
    return big_rule_list

In [24]:
if __name__ == "__main__":
    # Script entry point: mine the frequent itemsets and association rules
    # from the data file, then print both.
    data_set = load_data_set()
    # Minimum support 0.001, itemsets of up to 3 items.
    L, support_data = generate_L(data_set, k=3, min_support=0.001)
    # Minimum confidence 0.03.
    big_rules_list = generate_big_rules(L, support_data, min_conf=0.03)
    # generate_L returns the levels in order of itemset size starting at 1,
    # so the position in L gives the size for the header. Taking it from
    # enumerate (rather than from an element of Lk) also works when a level
    # is empty.
    for size, Lk in enumerate(L, start=1):
        print("=" * 50)
        print("frequent " + str(size) + "-itemsets\t\tsupport")
        print("=" * 50)
        for freq_set in Lk:
            print(freq_set, support_data[freq_set])
    print()
    print("Big Rules")
    for item in big_rules_list:
        print(item[0], "->", item[1], "conf: ", item[2])

frequent -itemsets		support
frozenset({'護墊'}) 0.002763041702467938
frozenset({'訂書針'}) 0.01148244402755363
frozenset({'膠帶台'}) 0.0022414133216474455
frozenset({'蔬菜餅'}) 0.0010803267988566542
frozenset({'牙線'}) 0.004958117477595796
frozenset({'牙膏'}) 0.01493790106801425
frozenset({'祝福卡'}) 0.007500062886670276
frozenset({'馬桶刷'}) 0.0014642664700189455
frozenset({'瓦楞板'}) 0.0038936778375458577
frozenset({'玩具'}) 0.002425439577825233
frozenset({'玻璃'}) 0.0030635737898949728
frozenset({'彩色筆'}) 0.014508947780232932
frozenset({'電池'}) 0.018030601315721536
frozenset({'拖鞋'}) 0.007609949068416726
frozenset({'紙膠帶'}) 0.02264052522947015
frozenset({'瓦楞'}) 0.008114366360529942
frozenset({'米餅'}) 0.001162410452691351
frozenset({'標籤清除劑'}) 0.0021633014575144277
frozenset({'湯匙'}) 0.0010776789390555348
frozenset({'泡棉材料'}) 0.003344246928813613
frozenset({'面霜'}) 0.002278483358863115
frozenset({'便利袋'}) 0.0028676321646121483
frozenset({'刮刮卡'}) 0.0049567935476952365
frozenset({'收納盒'}) 0.0033972041248359983
frozenset({'面

frozenset({'長尾夾'}) -> frozenset({'修正帶'}) conf:  0.11548855070402503
frozenset({'泡棉'}) -> frozenset({'背心袋'}) conf:  0.23429610179761665
frozenset({'白板'}) -> frozenset({'背心袋'}) conf:  0.12680683311432325
frozenset({'橡皮擦'}) -> frozenset({'背心袋'}) conf:  0.12223144188421524
frozenset({'魔擦筆'}) -> frozenset({'自動鉛筆'}) conf:  0.031820095425203476
frozenset({'自動鉛筆'}) -> frozenset({'魔擦筆'}) conf:  0.05084365715567016
frozenset({'訂書針'}) -> frozenset({'背心袋'}) conf:  0.1551942811022714
frozenset({'海綿'}) -> frozenset({'背心袋'}) conf:  0.28582089552238804
frozenset({'釘書機'}) -> frozenset({'原子筆'}) conf:  0.1585441085749537
frozenset({'白板筆'}) -> frozenset({'修正帶'}) conf:  0.0791361744870139
frozenset({'中油筆'}) -> frozenset({'原子筆'}) conf:  0.18793665579878902
frozenset({'奇異筆'}) -> frozenset({'背心袋'}) conf:  0.12276004119464469
frozenset({'巧克力'}) -> frozenset({'筆芯'}) conf:  0.048368298368298375
frozenset({'牛奶'}) -> frozenset({'巧克力'}) conf:  0.16490891658676896
frozenset({'巧克力'}) -> frozenset({'牛奶'}) conf:  0.046