In [20]:
import numpy as np
import pandas as pd
from mlxtend import *
from mlxtend.frequent_patterns import *
import timeit


In [7]:
def list_of_list_of_items(retails: pd.DataFrame):
    """Collect the item descriptions of every invoice into a list.

    The rows of ``retails`` are grouped by the ``'InvoiceNo'`` column and the
    ``'Description'`` values of each group are gathered into a Python list.

    Args:
        retails: Frame with at least the columns ``'InvoiceNo'`` and
            ``'Description'``.

    Returns:
        A ``(groups, grouped_retails)`` tuple. ``grouped_retails`` is the
        Series produced by the group-by (indexed by invoice number, one list
        of descriptions per invoice); ``groups`` is a plain list holding those
        same lists in the same order.
    """
    grouped_retails = retails.groupby('InvoiceNo')['Description'].apply(list)
    # Read the values by position. Indexing the Series with a bare integer
    # (``grouped_retails[i]``) is a *label* lookup, which raises KeyError when
    # invoice numbers are integers and is a deprecated fallback otherwise.
    groups = grouped_retails.tolist()
    return groups, grouped_retails

In [8]:
def compute_pa_matrix(invoice_no: list, all_items: list, grouped_retails: pd.Series):
    """Build the presence/absence matrix of items per invoice.

    Args:
        invoice_no: Invoice identifiers; only its length is used, to fix the
            number of rows of the matrix.
        all_items: Item descriptions, one per matrix column.
        grouped_retails: Per-invoice collections of item descriptions. Row
            ``i`` of the result is filled from the ``i``-th entry (by
            position), so the entries must be in the same order as
            ``invoice_no``.

    Returns:
        A float ``numpy`` array of shape ``(len(invoice_no), len(all_items))``
        whose cell ``[i, j]`` is 1 when ``all_items[j]`` occurs in the
        ``i``-th entry of ``grouped_retails`` and 0 otherwise.
    """
    # Use the parameter, not a same-looking global: the function previously
    # read ``invoice_nos`` and only worked when the caller happened to define it.
    n_rows = len(invoice_no)
    pa_matrix = np.zeros((n_rows, len(all_items)))
    # Iterating yields the entries by position, whatever the index labels are
    # (``grouped_retails[i]`` would be a label lookup on a Series).
    baskets = list(grouped_retails)
    for i in range(n_rows):
        # A set makes each membership test constant-time instead of a scan.
        basket = set(baskets[i])
        for j, item in enumerate(all_items):
            if item in basket:
                pa_matrix[i, j] = 1
    return pa_matrix

In [12]:
if __name__ == '__main__':
    # Read InvoiceNo as text so that `.str` works on every row, even when a
    # stretch of the file holds purely numeric invoice numbers.
    init_retails = pd.read_csv("retails.csv", dtype={'InvoiceNo': str})
    invoice_col = init_retails['InvoiceNo']
    # na=False keeps the mask strictly boolean; a NaN in it would make `~` fail.
    contains_c = invoice_col.str.contains('C', case=False, na=False)
    # Keep rows whose invoice number has no 'C'/'c' in it. Rows without any
    # invoice number are dropped too: they cannot be assigned to an invoice.
    retails = init_retails[invoice_col.notna() & ~contains_c].reset_index(drop=True)

In [13]:
    # Gather the item descriptions of each invoice: `groups` is a plain list
    # of per-invoice lists, `grouped_retails` the same lists as a Series
    # indexed by invoice number.
    groups, grouped_retails = list_of_list_of_items(retails)

In [16]:
    # One matrix column per distinct item description, in order of first appearance.
    all_items = list(retails['Description'].drop_duplicates())
    # One matrix row per distinct invoice. Sorted so the rows line up with
    # `grouped_retails`, whose group-by sorts its keys by default.
    invoice_nos = sorted(retails['InvoiceNo'].drop_duplicates())
    pa_matrix = compute_pa_matrix(invoice_nos, all_items, grouped_retails)
    # mlxtend's frequent-pattern functions expect a boolean one-hot frame and
    # warn about other dtypes; the 0/1 float matrix converts without loss.
    df = pd.DataFrame(data=pa_matrix.astype(bool), columns=all_items)


In [17]:
    # Frequent itemsets found by FP-Growth with a minimum support of 0.02.
    # NOTE(review): itemsets are reported as column positions of `df`
    # (use_colnames is left at its default) - confirm that is intended.
    fi = fpgrowth(df, min_support=0.02)
    print(f"Number of itemsets: {fi.shape[0]}")
    print(fi.to_string())



303
      support          itemsets
0    0.102429               (0)
1    0.020803               (3)
2    0.021710               (7)
3    0.065945               (9)
4    0.035216              (17)
5    0.028508              (18)
6    0.027194              (20)
7    0.020350              (12)
8    0.051351              (21)
9    0.051033              (45)
10   0.047634              (27)
11   0.044416              (28)
12   0.040790              (35)
13   0.038479              (39)
14   0.035488              (26)
15   0.033176              (34)
16   0.032043              (36)
17   0.027556              (32)
18   0.052574              (46)
19   0.049855              (54)
20   0.044008              (55)
21   0.024610              (52)
22   0.033584              (58)
23   0.029142              (57)
24   0.070885              (66)
25   0.059826              (69)
26   0.055203              (59)
27   0.053662              (76)
28   0.042286              (60)
29   0.039385              (71)
30  

In [18]:
    # Association rules derived from the FP-Growth itemsets, keeping only the
    # rules whose confidence reaches 0.85.
    ar = association_rules(
        fi,
        metric='confidence',
        min_threshold=0.85,
        support_only=False,
    )
    print(f"Number of association rules: {ar.shape[0]}")
    print(ar.to_string())

2
   antecedents consequents  antecedent support  consequent support   support  confidence       lift  leverage  conviction
0  (2344, 619)       (624)            0.027148            0.046003  0.024565    0.904841  19.669380  0.023316   10.025342
1  (2344, 624)       (619)            0.028689            0.048314  0.024565    0.856240  17.722404  0.023179    6.619970


In [23]:
    # Same mining task as the FP-Growth cell, solved with Apriori
    # (minimum support 0.02) so the two results can be compared.
    ap = apriori(df, min_support=0.02)
    print(f"Number of itemsets: {ap.shape[0]}")
    print(ap.to_string())



303
      support          itemsets
0    0.102429               (0)
1    0.020803               (3)
2    0.021710               (7)
3    0.065945               (9)
4    0.020350              (12)
5    0.035216              (17)
6    0.028508              (18)
7    0.027194              (20)
8    0.051351              (21)
9    0.035488              (26)
10   0.047634              (27)
11   0.044416              (28)
12   0.027556              (32)
13   0.033176              (34)
14   0.040790              (35)
15   0.032043              (36)
16   0.038479              (39)
17   0.051033              (45)
18   0.052574              (46)
19   0.024610              (52)
20   0.049855              (54)
21   0.044008              (55)
22   0.029142              (57)
23   0.033584              (58)
24   0.055203              (59)
25   0.042286              (60)
26   0.032723              (62)
27   0.023885              (65)
28   0.070885              (66)
29   0.026061              (67)
30  

In [24]:
    # Association rules derived from the Apriori itemsets, keeping only the
    # rules whose confidence reaches 0.85.
    ar1 = association_rules(
        ap,
        metric='confidence',
        min_threshold=0.85,
        support_only=False,
    )
    print(f"Number of association rules: {ar1.shape[0]}")
    print(ar1.to_string())

2
   antecedents consequents  antecedent support  consequent support   support  confidence       lift  leverage  conviction
0  (624, 2344)       (619)            0.028689            0.048314  0.024565    0.856240  17.722404  0.023179    6.619970
1  (2344, 619)       (624)            0.027148            0.046003  0.024565    0.904841  19.669380  0.023316   10.025342


In [25]:
    # Wall-clock cost of a single FP-Growth run on the full matrix.
    time_fpg = timeit.timeit(
        lambda: fpgrowth(df, min_support=0.02),
        number=1,
    )
    print(f"time spent by fpgrowth algorithm: {time_fpg}")



time spent by fpgrowth algorithm: 2.720630699999674


In [26]:
    # Wall-clock cost of a single Apriori run on the full matrix.
    time_apr = timeit.timeit(
        lambda: apriori(df, min_support=0.02),
        number=1,
    )
    print(f"time spent by apriori algorithm: {time_apr}")



time spent by apriori algorithm: 71.52641889999995
