In [1]:
import numpy as np
import pandas as pd
import requests
import timeit

from mlxtend.frequent_patterns import apriori, fpgrowth
from mlxtend.frequent_patterns import association_rules


In [2]:
url = "https://github.com/dbdmg/data-science-lab/raw/master/datasets/online_retail.csv"

file_path = "online_retail.csv"

# Download the dataset. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops the notebook on an HTTP error instead of
# silently saving an error page as the CSV.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Persist the raw bytes so later cells can read the file from disk.
with open(file_path, "wb") as file:
    file.write(response.content)

print(f"File downloaded successfully: {file_path}")



File downloaded successfully: online_retail.csv


***1. First, you need to load the dataset into memory, using the csv module. Make sure you identify all
valid rows. Also consider that rows having an InvoiceNo that starts with C should be discarded, as
they indicate that the invoice is about a cancelled purchase.***

In [3]:
# Load the downloaded transactions file into a DataFrame.
df = pd.read_csv('online_retail.csv')

# Discard cancelled purchases: their InvoiceNo starts with "C" (case-insensitive).
# astype(str) keeps the check safe when the column holds non-string values
# (numbers or missing entries), which would raise AttributeError on .lower()
# in a per-row lambda; the vectorized .str accessor also avoids the Python-level loop.
is_cancelled = df['InvoiceNo'].astype(str).str.lower().str.startswith('c')
df = df[~is_cancelled]

***2. Now that you have a dataset of items, you should aggregate it at an “invoice” level. For each invoice
(identified by InvoiceNo) there can be multiple items (from multiple rows) in the dataset.*** For example, the items of a single invoice could be collected into a list such as:

```
[ "GARDENERS KNEELING PAD KEEP CALM",
  "HOT WATER BOTTLE KEEP CALM",
  "DOORMAT KEEP CALM AND COME IN" ]
```

In [4]:
# Collect every invoice's item descriptions into one list, keyed by InvoiceNo
items_by_invoice = df.groupby('InvoiceNo')['Description'].apply(list)

# Turn the InvoiceNo index back into a regular column
grouped = items_by_invoice.reset_index()

# Preview the first few invoices with their items
grouped.head()


Unnamed: 0,InvoiceNo,Description
0,536365,"[WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET..."
1,536366,"[HAND WARMER UNION JACK, HAND WARMER RED POLKA..."
2,536367,"[ASSORTED COLOUR BIRD ORNAMENT, POPPY'S PLAYHO..."
3,536368,"[JAM MAKING SET WITH JARS, RED COAT RACK PARIS..."
4,536369,[BATH BUILDING BLOCK WORD]


***3. Convert the list of items for each invoice into a matrix format where:***
- Rows represent invoices.
- Columns represent unique items.
- Each cell is 1 if the invoice contains the item, else 0.

In [5]:
# Get the list of unique items (these become the matrix columns)
all_items = df['Description'].unique()

# Initialize an empty matrix (one 0/1 row will be appended per invoice)
pa_matrix = []

# Loop through each invoice to create the matrix.
# The invoice's item list is converted to a set first so that each membership
# test is a constant-time lookup rather than a linear scan of the list; with
# thousands of unique items per row this removes an accidental quadratic cost.
for items in grouped['Description']:
    invoice_items = set(items)
    row = [1 if item in invoice_items else 0 for item in all_items]
    pa_matrix.append(row)

# Create a DataFrame: rows follow the order of `grouped`, columns are the items
df_matrix = pd.DataFrame(pa_matrix, columns=all_items)

# Display the matrix
print(df_matrix.head())


   WHITE HANGING HEART T-LIGHT HOLDER  WHITE METAL LANTERN  \
0                                   1                    1   
1                                   0                    0   
2                                   0                    0   
3                                   0                    0   
4                                   0                    0   

   CREAM CUPID HEARTS COAT HANGER  KNITTED UNION FLAG HOT WATER BOTTLE  \
0                               1                                    1   
1                               0                                    0   
2                               0                                    0   
3                               0                                    0   
4                               0                                    0   

   RED WOOLLY HOTTIE WHITE HEART.  SET 7 BABUSHKA NESTING BOXES  \
0                               1                             1   
1                               0               

***4. Apply the FP-Growth Algorithm*** <br><br>
Use the **fpgrowth()** function from Mlxtend to find frequent itemsets.

In [6]:
# Mine frequent itemsets at progressively lower minimum-support thresholds
support_thresholds = [0.5, 0.1, 0.05, 0.02, 0.01]

for minsup in support_thresholds:
    fi = fpgrowth(df_matrix, use_colnames=True, min_support=minsup)
    # One print call, three parts: the threshold, the itemset count, the full table
    print(
        f"\nMin Support = {minsup}",
        f"Number of itemsets found: {len(fi)}",
        fi.to_string(),
        sep="\n",
    )




Min Support = 0.5
Number of itemsets found: 0
Empty DataFrame
Columns: [support, itemsets]
Index: []





Min Support = 0.1
Number of itemsets found: 1
    support                              itemsets
0  0.102429  (WHITE HANGING HEART T-LIGHT HOLDER)





Min Support = 0.05
Number of itemsets found: 23
     support                              itemsets
0   0.102429  (WHITE HANGING HEART T-LIGHT HOLDER)
1   0.065945       (ASSORTED COLOUR BIRD ORNAMENT)
2   0.051351            (JAM MAKING SET WITH JARS)
3   0.051033                             (POSTAGE)
4   0.052574     (PAPER CHAIN KIT 50'S CHRISTMAS )
5   0.070885             (LUNCH BAG RED RETROSPOT)
6   0.059826     (PACK OF 72 RETROSPOT CAKE CASES)
7   0.055203             (JUMBO BAG PINK POLKADOT)
8   0.053662              (JUMBO STORAGE BAG SUKI)
9   0.052665              (JAM MAKING SET PRINTED)
10  0.053254   (JUMBO SHOPPER VINTAGE RED PAISLEY)
11  0.056608     (NATURAL SLATE HEART CHALKBOARD )
12  0.054433               (HEART OF WICKER SMALL)
13  0.094815             (JUMBO BAG RED RETROSPOT)
14  0.052438          (LUNCH BAG SPACEBOY DESIGN )
15  0.057696             (LUNCH BAG  BLACK SKULL.)
16  0.052121                 (LUNCH BAG CARS BLUE)
17  0.065899                     




Min Support = 0.02
Number of itemsets found: 303
      support                                                                                             itemsets
0    0.102429                                                                 (WHITE HANGING HEART T-LIGHT HOLDER)
1    0.020803                                                                (KNITTED UNION FLAG HOT WATER BOTTLE)
2    0.021710                                                                             (HAND WARMER UNION JACK)
3    0.065945                                                                      (ASSORTED COLOUR BIRD ORNAMENT)
4    0.035216                                                                           (HOME BUILDING BLOCK WORD)
5    0.028508                                                                           (LOVE BUILDING BLOCK WORD)
6    0.027194                                                                                (DOORMAT NEW ENGLAND)
7    0.020350                 




Min Support = 0.01
Number of itemsets found: 1472
       support                                                                                                                       itemsets
0     0.102429                                                                                           (WHITE HANGING HEART T-LIGHT HOLDER)
1     0.020803                                                                                          (KNITTED UNION FLAG HOT WATER BOTTLE)
2     0.019443                                                                                               (RED WOOLLY HOTTIE WHITE HEART.)
3     0.017223                                                                                                 (SET 7 BABUSHKA NESTING BOXES)
4     0.013914                                                                                                          (WHITE METAL LANTERN)
5     0.012872                                                                                   

***5. Generate Association Rules*** <br><br>
Use the `association_rules()` function from Mlxtend to generate association rules from the frequent itemsets.

In [7]:
# Mine the frequent itemsets at minsup = 0.02, then derive rules from them,
# keeping only those whose confidence is at least 0.85
fi = fpgrowth(df_matrix, use_colnames=True, min_support=0.02)
rules = association_rules(fi, min_threshold=0.85, metric="confidence")

# Show the header line followed by the complete rules table
print("\nAssociation Rules with Confidence >= 0.85:", rules.to_string(), sep="\n")





Association Rules with Confidence >= 0.85:
                                                          antecedents                         consequents  antecedent support  consequent support   support  confidence       lift  representativity  leverage  conviction  zhangs_metric   jaccard  certainty  kulczynski
0  (ROSES REGENCY TEACUP AND SAUCER , PINK REGENCY TEACUP AND SAUCER)   (GREEN REGENCY TEACUP AND SAUCER)            0.027148            0.046003  0.024565    0.904841  19.669380               1.0  0.023316   10.025342       0.975647  0.505597   0.900253    0.719416
1   (PINK REGENCY TEACUP AND SAUCER, GREEN REGENCY TEACUP AND SAUCER)  (ROSES REGENCY TEACUP AND SAUCER )            0.028689            0.048314  0.024565    0.856240  17.722404               1.0  0.023179    6.619970       0.971444  0.468453   0.848942    0.682341


***6. (Optional) Compare with Apriori***<br>

Compare the running time of `fpgrowth()` with that of the `apriori()` function using the `timeit` library.

In [8]:
def _time_single_run(miner):
    """Time one call of `miner` on df_matrix (min_support=0.02) and return the seconds reported by timeit."""
    return timeit.timeit(
        lambda: miner(df_matrix, min_support=0.02, use_colnames=True),
        number=1,
    )


# Apriori is timed first
time_apriori = _time_single_run(apriori)
print(f"\nTime taken by Apriori: {time_apriori:.4f} seconds")

# FP-Growth is timed on the same matrix with the same support threshold
time_fpgrowth = _time_single_run(fpgrowth)
print(f"Time taken by FP-Growth: {time_fpgrowth:.4f} seconds")




Time taken by Apriori: 30.9593 seconds




Time taken by FP-Growth: 8.5525 seconds
