## association rule mining on synthetic data sets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

### generate syntheic data

In [2]:
sparsity = 0.0003
num_items = 10000  # 1000
num_transactions = 10000  # 1000
np.random.seed(42)
transf_trans_ndarray = np.random.choice([0, 1], size=(num_transactions, num_items), p=[1 - sparsity, sparsity])
print(transf_trans_ndarray.sum().sum()/num_transactions)
transf_trans_ndarray.shape

3.0166


(10000, 10000)

In [3]:
df = pd.DataFrame(transf_trans_ndarray, columns = ['Item_' + str(i) for i in range(transf_trans_ndarray.shape[1])],
                 index = ['TID_' + str(i) for i in range(transf_trans_ndarray.shape[0])])
df.head()

Unnamed: 0,Item_0,Item_1,Item_2,Item_3,Item_4,Item_5,Item_6,Item_7,Item_8,Item_9,...,Item_9990,Item_9991,Item_9992,Item_9993,Item_9994,Item_9995,Item_9996,Item_9997,Item_9998,Item_9999
TID_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TID_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TID_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TID_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TID_4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### inject some patterns

#### make some columns frequent itemsets

In [4]:
inject_pattern_1 = False

In [5]:
# make column Item_6 be, on average freq * 100% 1
if inject_pattern_1:
    freq_items_list = ['Item_6']  # , 'Item_8']  # make random
    freq = [0.8]  # , 0.6]  # make random
    for freq, freq_item in zip(freq, freq_items_list):
        freq_column_vector = np.random.choice([0, 1], size=(df.shape[0], ), p=[1 - freq, freq])
        df.loc[:, freq_item] = freq_column_vector

#### create a pattern off the frequent columns

In [6]:
inject_pattern_2 = False

In [7]:
# make column Item_9 replicate the 1 values in column Item_6 freq_pattern * 100% of the time
freq_pattern = 0.5
if inject_pattern_2:
    for freq_item in freq_items_list:
        df.loc[:, 'Item_9'] = np.where(df[freq_item] == 1, 
                                       np.random.choice([0, 1], size=(df.shape[0], ), 
                                                        p=[1 - freq_pattern, freq_pattern]), 
                                       0)

### generate frequent itemsets

In [8]:
min_support = 0.01  # 0.5
max_len = 2
frequent_itemsets = apriori(df.astype(bool), min_support = min_support, max_len = max_len, use_colnames = True)
frequent_itemsets

Unnamed: 0,support,itemsets


### generate association rules

In [9]:
min_threshold = 0.80
rules = association_rules(frequent_itemsets, metric = "confidence", min_threshold = min_threshold)
rules

ValueError: The input DataFrame `df` containing the frequent itemsets is empty.