In [4]:
import pandas as pd
from apyori import apriori 
from mlxtend.preprocessing import TransactionEncoder

In [55]:
# Step 1: Data Collection
df = pd.read_excel("data/online_retail_II.xlsx")
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [121]:
# Step 2: Data processing

# drop missing values
missing_data = df.dropna(subset=["Description","StockCode","Invoice"])

# remove stockCodes which start with 'C'
rm_data = missing_data[~missing_data['StockCode'].astype(str).str.startswith('C')]
transactions = rm_data.groupby('Invoice')['Description'].apply(list).values

# convert transactions to list and remove duplicates
transaction_list = [list(set(transaction)) for transaction in transactions]
print(f"======> Total Transactions: {len(transaction_list)} rows  <======", )

# find with 10 records
transaction_limit = transaction_list[:7]
print("======> Transaction Data <======")

for transaction in transaction_limit:
    print(transaction)


['SAVE THE PLANET MUG', 'RECORD FRAME 7" SINGLE SIZE ', 'STRAWBERRY CERAMIC TRINKET BOX', 'PINK DOUGHNUT TRINKET POT ', 'PINK CHERRY LIGHTS', ' WHITE CHERRY LIGHTS', '15CM CHRISTMAS GLASS BALL 20 LIGHTS', 'FANCY FONT HOME SWEET HOME DOORMAT']
['DOG BOWL , CHASING BALL DESIGN', 'HEART MEASURING SPOONS LARGE', 'CAT BOWL ', 'LUNCHBOX WITH CUTLERY FAIRY CAKES ']
['DOOR MAT BLACK FLOCK ', 'HEART IVORY TRELLIS LARGE', 'LOVE BUILDING BLOCK WORD', 'FULL ENGLISH BREAKFAST PLATE', 'SMALL MARSHMALLOWS PINK BOWL', 'ASSORTED COLOUR BIRD ORNAMENT', 'HEART FILIGREE DOVE LARGE', 'PIZZA PLATE IN BOX', 'BISCUITS SMALL BOWL LIGHT BLUE', 'BATH BUILDING BLOCK WORD', 'HOME BUILDING BLOCK WORD', ' PEACE WOODEN BLOCK LETTERS', 'CHRISTMAS CRAFT WHITE FAIRY ', 'SET OF 3 BLACK FLYING DUCKS', 'AREA PATROLLED METAL SIGN', 'BLACK DINER WALL CLOCK', 'SCOTTIE DOG HOT WATER BOTTLE', 'CLASSIC WHITE FRAME', 'PLEASE ONE PERSON  METAL SIGN']
['STRIPES DESIGN MONKEY DOLL', 'PARTY CONE CHRISTMAS DECORATION ', 'FLORAL BLUE M

In [122]:
# Step 3 : Show Binary matrix and ensure all items in each transactions no duplicate

# ensure all items are string
str_transaction = [[str(item) for item in set(transaction)] for transaction in transactions]

tranEncode = TransactionEncoder()
tranEncodeArray = tranEncode.fit(transaction_limit).transform(transaction_limit)
binary_matrix = pd.DataFrame(tranEncodeArray, columns=tranEncode.columns_)
print("======> Binary Matrix <====== \n" , binary_matrix.astype(int))

     PEACE WOODEN BLOCK LETTERS   VINTAGE DESIGN GIFT TAGS  \
0                            0                          0   
1                            0                          0   
2                            1                          0   
3                            0                          0   
4                            0                          0   
5                            0                          1   
6                            0                          0   

    WHITE CHERRY LIGHTS  15CM CHRISTMAS GLASS BALL 20 LIGHTS  \
0                     1                                    1   
1                     0                                    0   
2                     0                                    0   
3                     0                                    0   
4                     0                                    0   
5                     0                                    0   
6                     0                                    0  

In [151]:
# Step 4: Generate Frequent Itemsets Using Apriori
min_confidence = 0.35
min_lift = 1
min_support = 0.02

results = list(apriori(str_transaction, min_support=min_support,min_confidence=min_confidence,min_lift=min_lift))

print(f"Size of Result: {len(results)}")
print("=====> Association Rules <=====")
for result in results:
    items = list(result.items)  
    support = result.support
    for ordered_stat in result.ordered_statistics:
        antecedent = list(ordered_stat.items_base)
        consequent = list(ordered_stat.items_add)
        confidence = ordered_stat.confidence
        lift = ordered_stat.lift
        if antecedent and consequent:
            print(f"Rule: {antecedent} -> {consequent}")
            print(f"Support: {support:.4f}\nConfidence: {confidence:.4f}\nLift: {lift:.4f}")
            print("-" * 30)


Size of Result: 9
=====> Association Rules <=====
Rule: ['60 TEATIME FAIRY CAKE CASES'] -> ['PACK OF 60 PINK PAISLEY CAKE CASES']
Support: 0.0230
Confidence: 0.4434
Lift: 9.8917
------------------------------
Rule: ['PACK OF 60 PINK PAISLEY CAKE CASES'] -> ['60 TEATIME FAIRY CAKE CASES']
Support: 0.0230
Confidence: 0.5129
Lift: 9.8917
------------------------------
Rule: ['60 TEATIME FAIRY CAKE CASES'] -> ['PACK OF 72 RETRO SPOT CAKE CASES']
Support: 0.0233
Confidence: 0.4501
Lift: 8.2085
------------------------------
Rule: ['PACK OF 72 RETRO SPOT CAKE CASES'] -> ['60 TEATIME FAIRY CAKE CASES']
Support: 0.0233
Confidence: 0.4257
Lift: 8.2085
------------------------------
Rule: ['HEART OF WICKER LARGE'] -> ['HEART OF WICKER SMALL']
Support: 0.0213
Confidence: 0.5009
Lift: 12.4889
------------------------------
Rule: ['HEART OF WICKER SMALL'] -> ['HEART OF WICKER LARGE']
Support: 0.0213
Confidence: 0.5318
Lift: 12.4889
------------------------------
Rule: ['HOME BUILDING BLOCK WORD'] -

In [152]:
# Step 5 :  display frequent Items in DataFrame

results_df = pd.DataFrame([(tuple(result.items), result.support) 
                           for result in results], 
                          columns=['Itemset', 'Support'])
print("===> Frequent DataFrame <====\n",results_df)



===> Frequent DataFrame <====
                                              Itemset   Support
0  (PACK OF 60 PINK PAISLEY CAKE CASES, 60 TEATIM...  0.022991
1  (PACK OF 72 RETRO SPOT CAKE CASES, 60 TEATIME ...  0.023338
2     (HEART OF WICKER LARGE, HEART OF WICKER SMALL)  0.021329
3  (LOVE BUILDING BLOCK WORD, HOME BUILDING BLOCK...  0.022682
4  (PACK OF 72 RETRO SPOT CAKE CASES, PACK OF 60 ...  0.020634
5  (RED HANGING HEART T-LIGHT HOLDER, WHITE HANGI...  0.030139
6  (STRAWBERRY CERAMIC TRINKET BOX, SWEETHEART CE...  0.031607
7  (WOODEN FRAME ANTIQUE WHITE , WHITE HANGING HE...  0.020672
8  (WOODEN FRAME ANTIQUE WHITE , WOODEN PICTURE F...  0.023532
