# 關聯規則分析

三個指標：

* support
* confidence
* lift

使用的套件：

* [mlxtend](https://pypi.org/project/mlxtend/)

In [None]:
# 安裝套件
!pip install mlxtend

In [None]:
import pandas as pd

# 傳入模型的資料需要滿足特定的格式，可以用這種方法來轉換為
# boolean值，也可以用函式轉換為 0、1
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

## 建立測試資料

In [1]:
# Five sample shopping baskets; each inner list is one transaction.
dataset = [
    ['牛奶', '麵包', '餅乾', '柳橙汁'],
    ['麵包', '餅乾', '汽水', '泡麵'],
    ['牛奶', '餅乾', '水果'],
    ['牛奶', '麵包', '柳橙汁', '泡麵', '水果'],
    ['餅乾', '汽水', '水果'],
]

---

### 複習一下Python的list

In [None]:
# `+` on two lists builds a new list holding the elements of both, in order.
x = [1, 2, 3]
y = [4, 5, 6]
x + y

[1, 2, 3, 4, 5, 6]

---

In [None]:
# Flatten the transactions into one list of every purchased item.
x = []
for y in dataset:
  x += y # same result as x = x + y: appends the items of y to x

In [None]:
# Check how many items the list contains
len(x)

19

In [None]:
# De-duplicate through a set to get the distinct items (a set has no defined order).
y = list(set(x))

In [None]:
len(y)

7

In [None]:
y

['餅乾', '柳橙汁', '牛奶', '汽水', '水果', '麵包', '泡麵']

## 資料格式轉換

In [None]:
# mlxtend's models only accept data in a specific format.
# TransactionEncoder() turns the transactions into a boolean array with
# one column per distinct item.
oht = TransactionEncoder()  # encoder object (one-hot encoding)
oht_ary = oht.fit(dataset).transform(dataset)

# Display as True/False
df = pd.DataFrame(oht_ary, columns=oht.columns_)
df

Unnamed: 0,柳橙汁,水果,汽水,泡麵,牛奶,餅乾,麵包
0,True,False,False,False,True,True,True
1,False,False,True,True,False,True,True
2,False,True,False,False,True,True,False
3,True,True,False,True,True,False,True
4,False,True,True,False,False,True,False


In [None]:
# Display as 1/0
df2 = pd.DataFrame(oht_ary.astype("int"), columns=oht.columns_)
df2

Unnamed: 0,柳橙汁,水果,汽水,泡麵,牛奶,餅乾,麵包
0,1,0,0,0,1,1,1
1,0,0,1,1,0,1,1
2,0,1,0,0,1,1,0
3,1,1,0,1,1,0,1
4,0,1,1,0,0,1,0


In [None]:
# Compute the support of {牛奶, 麵包} by hand:
# orders containing both items / total number of orders
df2[['牛奶','麵包']].all(axis=1).sum()/len(df2)

0.4

In [None]:
# Orders that contain 牛奶 (milk)
df2[df2['牛奶']==1]

Unnamed: 0,柳橙汁,水果,汽水,泡麵,牛奶,餅乾,麵包
0,1,0,0,0,1,1,1
2,0,1,0,0,1,1,0
3,1,1,0,1,1,0,1


In [None]:
# How many of the orders containing 牛奶 also contain 麵包 (bread)
df2[df2['牛奶']==1]['麵包'].sum()

2

In [None]:
# Compute the confidence of 牛奶 -> 麵包 by hand:
# orders containing both / orders containing 牛奶
df2[df2['牛奶']==1]['麵包'].sum()/len(df2[df2['牛奶']==1])

0.6666666666666666

In [None]:
# Share of all orders that contain 麵包 (its support)
df2['麵包'].sum()/len(df2)

0.6

In [None]:
# Compute the lift of 牛奶 -> 麵包 by hand:
# confidence of the rule (c) divided by the support of 麵包 (s)
c = df2[df2['牛奶']==1]['麵包'].sum()/len(df2[df2['牛奶']==1])
s = df2['麵包'].sum()/len(df2)
c / s

1.1111111111111112

## 計算頻繁項目集

In [2]:
# use_colnames=True reports itemsets by item (column) name; with the default
# False they are reported by column index instead.
# max_len=None: maximum number of items per itemset; the default None means no limit.
# To consider combinations of at most three items, set max_len to 3.
frequent_itemsets = apriori(df, min_support=0.4, use_colnames=True, max_len=None)
frequent_itemsets

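Unnamed: 0,support,itemsets
0,0.4,(柳橙汁)
1,0.6,(水果)
2,0.4,(汽水)
3,0.4,(泡麵)
4,0.6,(牛奶)
5,0.8,(餅乾)
6,0.6,(麵包)
7,0.4,"(柳橙汁, 牛奶)"
8,0.4,"(柳橙汁, 麵包)"
9,0.4,"(水果, 牛奶)"
10,0.4,"(餅乾, 水果)"
11,0.4,"(餅乾, 汽水)"
12,0.4,"(泡麵, 麵包)"
13,0.4,"(餅乾, 牛奶)"
14,0.4,"(麵包, 牛奶)"
15,0.4,"(餅乾, 麵包)"
16,0.4,"(柳橙汁, 麵包, 牛奶)"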

In [None]:
# Frequent itemsets can be sorted by support
frequent_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
5,0.8,(餅乾)
4,0.6,(牛奶)
6,0.6,(麵包)
1,0.6,(水果)
0,0.4,(柳橙汁)
11,0.4,"(餅乾, 汽水)"
15,0.4,"(餅乾, 麵包)"
14,0.4,"(麵包, 牛奶)"
13,0.4,"(餅乾, 牛奶)"
12,0.4,"(泡麵, 麵包)"


In [None]:
# Each entry of the itemsets column is a frozenset of item names
frequent_itemsets.itemsets[0]

frozenset({'柳橙汁'})

In [None]:
# Number of items in each itemset
frequent_itemsets.itemsets.apply(lambda x: len(x))

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     2
8     2
9     2
10    2
11    2
12    2
13    2
14    2
15    2
16    3
Name: itemsets, dtype: int64

In [None]:
# Boolean mask: True for itemsets with at least two items
frequent_itemsets.itemsets.apply(lambda x: len(x)) >= 2

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
Name: itemsets, dtype: bool

In [None]:
# Select the frequent itemsets with length >= 2
frequent_itemsets[frequent_itemsets.itemsets.apply(lambda x: len(x)) >= 2]

Unnamed: 0,support,itemsets
7,0.4,"(柳橙汁, 牛奶)"
8,0.4,"(柳橙汁, 麵包)"
9,0.4,"(水果, 牛奶)"
10,0.4,"(餅乾, 水果)"
11,0.4,"(餅乾, 汽水)"
12,0.4,"(泡麵, 麵包)"
13,0.4,"(餅乾, 牛奶)"
14,0.4,"(麵包, 牛奶)"
15,0.4,"(餅乾, 麵包)"
16,0.4,"(柳橙汁, 麵包, 牛奶)"


---

### 什麼是 lambda 函式 (匿名函式)

In [None]:
def square(x):
  """Return x raised to the power of 2."""
  return x**2

In [None]:
square(2)

4

In [None]:
# Single-line def: returns x raised to the power of 2.
def square1(x): return x**2

In [None]:
square1(2)

4

In [None]:
# Lambda (anonymous function) bound to a name: returns x raised to the power of 2.
square2 = lambda x: x**2

In [None]:
square2(2)

4

In [None]:
x = [1, 2, 3]

In [None]:
# Apply square() to every element of x and collect the results in a list
list(map(square, x))

[1, 4, 9]

In [None]:
# Square every element of x with an inline lambda instead of a named function
list(map(lambda x: x**2, x))

[1, 4, 9]

In [None]:
import numpy as np

In [None]:
x1 = np.array(x)

In [None]:
# On a NumPy array ** is applied element-wise, so no map() is needed
x1**2

array([1, 4, 9])

---

## 計算關聯規則

In [None]:
from mlxtend.frequent_patterns import association_rules

# metric selects the measure used to filter the rules; measures reported in
# the result, e.g. "confidence" or "lift", can be passed here.
# Only rules whose metric value is at least min_threshold are kept.
rules = association_rules(frequent_itemsets,
                          metric="lift",
                          min_threshold=1.2)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(柳橙汁),(牛奶),0.4,0.6,0.4,1.0,1.666667,0.16,inf
1,(牛奶),(柳橙汁),0.6,0.4,0.4,0.666667,1.666667,0.16,1.8
2,(柳橙汁),(麵包),0.4,0.6,0.4,1.0,1.666667,0.16,inf
3,(麵包),(柳橙汁),0.6,0.4,0.4,0.666667,1.666667,0.16,1.8
4,(餅乾),(汽水),0.8,0.4,0.4,0.5,1.25,0.08,1.2
5,(汽水),(餅乾),0.4,0.8,0.4,1.0,1.25,0.08,inf
6,(泡麵),(麵包),0.4,0.6,0.4,1.0,1.666667,0.16,inf
7,(麵包),(泡麵),0.6,0.4,0.4,0.666667,1.666667,0.16,1.8
8,"(柳橙汁, 麵包)",(牛奶),0.4,0.6,0.4,1.0,1.666667,0.16,inf
9,"(柳橙汁, 牛奶)",(麵包),0.4,0.6,0.4,1.0,1.666667,0.16,inf


In [None]:
# Orders that contain 牛奶 (milk)
df2[df2['牛奶']==1]

Unnamed: 0,柳橙汁,水果,汽水,泡麵,牛奶,餅乾,麵包
0,1,0,0,0,1,1,1
2,0,1,0,0,1,1,0
3,1,1,0,1,1,0,1


In [None]:
# Orders that contain 柳橙汁 (orange juice)
df2[df2['柳橙汁']==1]

Unnamed: 0,柳橙汁,水果,汽水,泡麵,牛奶,餅乾,麵包
0,1,0,0,0,1,1,1
3,1,1,0,1,1,0,1


In [None]:
# Association rules can be sorted by confidence/lift/...
rules.sort_values(by='confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(柳橙汁),(牛奶),0.4,0.6,0.4,1.0,1.666667,0.16,inf
2,(柳橙汁),(麵包),0.4,0.6,0.4,1.0,1.666667,0.16,inf
5,(汽水),(餅乾),0.4,0.8,0.4,1.0,1.25,0.08,inf
6,(泡麵),(麵包),0.4,0.6,0.4,1.0,1.666667,0.16,inf
8,"(柳橙汁, 麵包)",(牛奶),0.4,0.6,0.4,1.0,1.666667,0.16,inf
9,"(柳橙汁, 牛奶)",(麵包),0.4,0.6,0.4,1.0,1.666667,0.16,inf
10,"(麵包, 牛奶)",(柳橙汁),0.4,0.4,0.4,1.0,2.5,0.24,inf
11,(柳橙汁),"(麵包, 牛奶)",0.4,0.4,0.4,1.0,2.5,0.24,inf
1,(牛奶),(柳橙汁),0.6,0.4,0.4,0.666667,1.666667,0.16,1.8
3,(麵包),(柳橙汁),0.6,0.4,0.4,0.666667,1.666667,0.16,1.8


In [None]:
# Select rules with lift >= 1.5 and confidence >= 0.66, sorted by lift in descending order
rules[(rules['lift']>=1.5) & (rules['confidence']>=0.66)].sort_values("lift", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10,"(麵包, 牛奶)",(柳橙汁),0.4,0.4,0.4,1.0,2.5,0.24,inf
11,(柳橙汁),"(麵包, 牛奶)",0.4,0.4,0.4,1.0,2.5,0.24,inf
0,(柳橙汁),(牛奶),0.4,0.6,0.4,1.0,1.666667,0.16,inf
1,(牛奶),(柳橙汁),0.6,0.4,0.4,0.666667,1.666667,0.16,1.8
2,(柳橙汁),(麵包),0.4,0.6,0.4,1.0,1.666667,0.16,inf
3,(麵包),(柳橙汁),0.6,0.4,0.4,0.666667,1.666667,0.16,1.8
6,(泡麵),(麵包),0.4,0.6,0.4,1.0,1.666667,0.16,inf
7,(麵包),(泡麵),0.6,0.4,0.4,0.666667,1.666667,0.16,1.8
8,"(柳橙汁, 麵包)",(牛奶),0.4,0.6,0.4,1.0,1.666667,0.16,inf
9,"(柳橙汁, 牛奶)",(麵包),0.4,0.6,0.4,1.0,1.666667,0.16,inf
