### P073 关联规则 - 购物篮数据拆分

In [23]:
import numpy as np
import pandas as pd

In [24]:
# Six toy shopping baskets; every string lists the purchased products
# separated by single spaces.
data = dict(
    products=[
        'bread eggs',
        'bread eggs milk',
        'milk cheese',
        'bread butter cheese',
        'eggs milk',
        'bread milk butter cheese',
    ]
)


In [25]:
# Echo the raw dictionary to confirm its contents.
data

{'products': ['bread eggs',
  'bread eggs milk',
  'milk cheese',
  'bread butter cheese',
  'eggs milk',
  'bread milk butter cheese']}

In [26]:
# One row per basket, labelled with 1-based transaction ids. The index
# length is derived from the number of baskets instead of being hard-coded,
# so adding or removing entries in `data` does not cause a length mismatch.
transactions = pd.DataFrame(
    data=data,
    index=range(1, len(data['products']) + 1),
)

In [27]:
# Show the transaction table: one space-separated basket string per row.
transactions

Unnamed: 0,products
1,bread eggs
2,bread eggs milk
3,milk cheese
4,bread butter cheese
5,eggs milk
6,bread milk butter cheese


In [28]:
# Split every basket string on whitespace into one column per item;
# baskets with fewer items get missing values in the trailing columns.
expanded = (
    transactions["products"]
    .str.split(pat=None, expand=True)
)

In [29]:
# Inspect the split result: one product per column, with missing values
# where a basket has fewer items than the longest one.
expanded

Unnamed: 0,0,1,2,3
1,bread,eggs,,
2,bread,eggs,milk,
3,milk,cheese,,
4,bread,butter,cheese,
5,eggs,milk,,
6,bread,milk,butter,cheese


### P074 关联规则 - 计算购买商品的去重列表


In [34]:
# Start from an empty set; it will hold the distinct product names.
products = set()

In [35]:
# Gather every distinct product name found in any column of `expanded`.
# The set is rebuilt from scratch here so this cell can be re-run safely
# even after `products` has been rebound to a sorted list further down.
# Shorter baskets are padded with missing values (None, NaN or pd.NA,
# depending on the pandas version/dtype); `dropna()` removes all of them,
# whereas a bare truthiness test lets NaN through and raises on pd.NA.
products = {
    product
    for column in expanded.columns
    for product in expanded[column].dropna().unique()
    if product  # also skip empty strings
}
products

{'bread', 'butter', 'cheese', 'eggs', 'milk'}

In [36]:
# Turn the set into an alphabetically ordered list so every product gets a
# stable position.
products = sorted(products)
products

['bread', 'butter', 'cheese', 'eggs', 'milk']

### P075 关联规则 - 实现one-hot编码

In [37]:
# Recap the split baskets that are about to be encoded.
expanded

Unnamed: 0,0,1,2,3
1,bread,eggs,,
2,bread,eggs,milk,
3,milk,cheese,,
4,bread,butter,cheese,
5,eggs,milk,,
6,bread,milk,butter,cheese


In [38]:
# Recap the sorted product list; its order defines the column order of the
# encoded matrix.
products

['bread', 'butter', 'cheese', 'eggs', 'milk']

In [39]:
# Zero-filled int8 matrix: one row per transaction, one column per product.
transactions_encoded = np.zeros(
    shape=(expanded.shape[0], len(products)),
    dtype=np.int8,
)

In [40]:
# Check that the matrix starts out as all zeros before it is filled in.
transactions_encoded

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int8)

In [43]:
# Set cell (i, j) to 1 whenever product j occurs in transaction i.
for row_pos, basket in enumerate(expanded.values):
    print(row_pos, basket)  # trace each basket as it is encoded
    for col_pos, name in enumerate(products):
        if name in basket:
            transactions_encoded[row_pos, col_pos] = 1
transactions_encoded

0 ['bread' 'eggs' None None]
1 ['bread' 'eggs' 'milk' None]
2 ['milk' 'cheese' None None]
3 ['bread' 'butter' 'cheese' None]
4 ['eggs' 'milk' None None]
5 ['bread' 'milk' 'butter' 'cheese']


array([[1, 0, 0, 1, 0],
       [1, 0, 0, 1, 1],
       [0, 0, 1, 0, 1],
       [1, 1, 1, 0, 0],
       [0, 0, 0, 1, 1],
       [1, 1, 1, 0, 1]], dtype=int8)

In [44]:
# Wrap the 0/1 matrix in a DataFrame whose columns are the product names.
transactions_encoded_df = pd.DataFrame(data=transactions_encoded, columns=products)

In [45]:
# Display the one-hot encoded transactions with product names as headers.
transactions_encoded_df

Unnamed: 0,bread,butter,cheese,eggs,milk
0,1,0,0,1,0
1,1,0,0,1,1
2,0,0,1,0,1
3,1,1,1,0,0
4,0,0,0,1,1
5,1,1,1,0,1


### P076 关联规则挖掘 - 计算商品的支持度

In [46]:
# Column sums: the number of transactions that contain each product.
transactions_encoded_df.sum()

bread     4
butter    2
cheese    3
eggs      3
milk      4
dtype: int64

In [47]:
# Support of each single product: the share of all transactions in which
# it appears.
support = transactions_encoded_df.sum().div(len(transactions_encoded_df))
support

bread     0.666667
butter    0.333333
cheese    0.500000
eggs      0.500000
milk      0.666667
dtype: float64

### P077 关联规则挖掘 - 计算多个商品的支持度

In [48]:
# Recap the encoded table used for the multi-product support calculations.
transactions_encoded_df

Unnamed: 0,bread,butter,cheese,eggs,milk
0,1,0,0,1,0
1,1,0,0,1,1
2,0,0,1,0,1
3,1,1,1,0,0
4,0,0,0,1,1
5,1,1,1,0,1


In [50]:
# Support of {butter, bread}: transactions containing both products,
# divided by the total number of transactions.
sup_butter_bread = (
    transactions_encoded_df.query("butter==1 and bread==1").shape[0]
    / transactions_encoded_df.shape[0]
)
sup_butter_bread

0.3333333333333333

In [51]:
# Support of {butter, milk}: transactions containing both products,
# divided by the total number of transactions.
sup_butter_milk = (
    transactions_encoded_df.query("butter==1 and milk==1").shape[0]
    / transactions_encoded_df.shape[0]
)
sup_butter_milk

0.16666666666666666

### P078 关联规则挖掘 - 计算关联规则以及置信度

In [52]:
# Recap the encoded table used for the confidence calculations.
transactions_encoded_df

Unnamed: 0,bread,butter,cheese,eggs,milk
0,1,0,0,1,0
1,1,0,0,1,1
2,0,0,1,0,1
3,1,1,1,0,0
4,0,0,0,1,1
5,1,1,1,0,1


In [53]:
# Confidence of the rule cheese -> bread:
#   (transactions with cheese and bread) / (transactions with cheese)
conf_cheese_bread = (
    len(transactions_encoded_df.query("cheese==1 and bread==1"))
    /
    len(transactions_encoded_df.query("cheese==1"))
)
# Keep the misspelled name ("brand") as an alias so any existing reference
# to it still resolves.
conf_cheese_brand = conf_cheese_bread
conf_cheese_bread

0.6666666666666666

In [54]:
# Confidence of the rule butter -> cheese:
#   (transactions with butter and cheese) / (transactions with butter)
conf_butter_cheese = (
    transactions_encoded_df.query("butter==1 and cheese==1").shape[0]
    / transactions_encoded_df.query("butter==1").shape[0]
)
conf_butter_cheese

1.0