In [1]:
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import fpgrowth, association_rules

In [2]:
df = pd.read_csv('Online_Retail_5000.csv')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [None]:
# gom theo InvoiceNo va StockCode de dem so luong tung mat hang trong moi hoa don
invoice_stockcode = df.groupby(['InvoiceNo', 'StockCode']).size().reset_index(name='Count')
invoice_stockcode.head()

Unnamed: 0,InvoiceNo,StockCode,Count
0,536365,21730,1
1,536365,22752,1
2,536365,71053,1
3,536365,84029E,1
4,536365,84029G,1


In [None]:
# tính tần số xuất hiện của từng mặt hàng (StockCode) trong các hóa đơn (InvoiceNo)
stockcode_freq = df.groupby('StockCode')['InvoiceNo'].nunique().reset_index(name='Freq')
stockcode_freq.head()

Unnamed: 0,StockCode,Freq
0,10002,3
1,10124G,1
2,10125,1
3,10133,1
4,10135,2


In [None]:
# Tính support cho từng mặt hàng
total_invoices = df['InvoiceNo'].nunique()
stockcode_freq['Support'] = stockcode_freq['Freq'] / total_invoices
stockcode_freq.head()

Unnamed: 0,StockCode,Freq,Support
0,10002,3,0.01
1,10124G,1,0.003333
2,10125,1,0.003333
3,10133,1,0.003333
4,10135,2,0.006667


In [7]:
stockcode_freq_sorted = stockcode_freq.sort_values(by='Support', ascending=False)
stockcode_freq_sorted.head(10)

Unnamed: 0,StockCode,Freq,Support
949,22632,38,0.126667
1477,85123A,35,0.116667
950,22633,32,0.106667
1326,84029E,30,0.1
1174,22961,27,0.09
1033,22752,26,0.086667
1327,84029G,24,0.08
1116,22866,23,0.076667
1217,37370,22,0.073333
1115,22865,22,0.073333


In [8]:
transactions = []
invoices = df['InvoiceNo'].unique()

for iv in invoices:
    items = df[df['InvoiceNo'] == iv]['StockCode'].unique().tolist()
    transactions.append(items)

print('Total transactions:', len(transactions))

Total transactions: 300


In [10]:
for tr in transactions[:5]:
    print(tr)

['85123A ', '71053 ', '84406B ', '84029G ', '84029E ', '22752 ', '21730 ']
['22633 ', '22632 ']
['84879 ', '22745 ', '22748 ', '22749 ', '22310 ', '84969 ', '22623 ', '22622 ', '21754 ', '21755 ', '21777 ', '48187 ']
['22960 ', '22913 ', '22912 ', '22914 ']
['21756 ']


In [11]:
from mlxtend.preprocessing import TransactionEncoder
encoder = TransactionEncoder()
onehot = encoder.fit(transactions).transform(transactions)
onehot_df = pd.DataFrame(onehot, columns=encoder.columns_)
onehot_df.head()

Unnamed: 0,10002,10124G,10125,10133,10135,11001,15036,15044B,15056BL,15056N,15056P,15060B,16012,16014,16016,16046,16156S,16168M,16235,16236,16237,16238,16258A,17003,17011F,17012A,17012B,17012C,17012D,17012E,17012F,17014A,17021,17084N,17084P,17090A,17090D,17091A,17129F,17164B,...,90178A,90180A,90181B,90185B,90185C,90186A,90190C,90192,90194,90195A,90195B,90196A,90199A,90199C,90200A,90200B,90200C,90200D,90204,90206C,90209B,90209C,90210B,90211A,90214A,90214D,90214E,90214G,90214H,90214J,90214M,90214R,90214S,90214V,BANK CHARGES,C2,D,DOT,M,POST
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [12]:
# tạo tập các tập mục thường xuyên với ngưỡng support tối thiểu là 0.05
frequent_itemsets = fpgrowth(onehot_df, min_support=0.05, use_colnames=True)
fp_rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.05)
fp_rules_sorted = fp_rules.sort_values(by='support', ascending=False)
print('Number of rules generated:', len(fp_rules_sorted))

Number of rules generated: 13604


In [13]:
print(fp_rules_sorted.head(10))

     antecedents         consequents  ...  certainty  kulczynski
2992    (22632 )            (22633 )  ...   0.646504    0.748355
2993    (22633 )            (22632 )  ...   0.785305    0.748355
12     (84029G )           (84029E )  ...   0.907407    0.825000
13     (84029E )           (84029G )  ...   0.710145    0.825000
0      (85123A )           (84029E )  ...   0.555556    0.650000
15     (85123A )           (84029G )  ...   0.565217    0.737500
1      (84029E )           (85123A )  ...   0.660377    0.650000
14     (84029G )           (85123A )  ...   0.858491    0.737500
23     (84029E )  (84029G , 85123A )  ...   0.641577    0.809524
22     (85123A )  (84029G , 84029E )  ...   0.537513    0.740260

[10 rows x 14 columns]


In [15]:
for i, (index, rule) in enumerate(fp_rules_sorted.head(10).iterrows()):
    X = ', '.join(list(rule['antecedents']))
    Y = ', '.join(list(rule['consequents']))
    support_ = rule['support']
    confidence_ = rule['confidence']
    print(f'Rule {i+1}: {X} -> {Y} (Support: {support_:.4f}, Confidence: {confidence_:.4f})')

Rule 1: 22632  -> 22633  (Support: 0.0867, Confidence: 0.6842)
Rule 2: 22633  -> 22632  (Support: 0.0867, Confidence: 0.8125)
Rule 3: 84029G  -> 84029E  (Support: 0.0733, Confidence: 0.9167)
Rule 4: 84029E  -> 84029G  (Support: 0.0733, Confidence: 0.7333)
Rule 5: 85123A  -> 84029E  (Support: 0.0700, Confidence: 0.6000)
Rule 6: 85123A  -> 84029G  (Support: 0.0700, Confidence: 0.6000)
Rule 7: 84029E  -> 85123A  (Support: 0.0700, Confidence: 0.7000)
Rule 8: 84029G  -> 85123A  (Support: 0.0700, Confidence: 0.8750)
Rule 9: 84029E  -> 84029G , 85123A  (Support: 0.0667, Confidence: 0.6667)
Rule 10: 85123A  -> 84029G , 84029E  (Support: 0.0667, Confidence: 0.5714)
