In [None]:
#from mlxtend.frequent_patterns import apriori
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from scipy.io import arff

# Load the supermarket dataset: `data` holds the records and `meta` the
# attribute metadata returned by loadarff.
data, meta = arff.loadarff('./data/supermarket.arff')

# Preview only the first few records; displaying the whole array floods the
# notebook output without telling the reader anything more.
data[:5]

In [None]:
# One-hot encode every attribute of the loaded ARFF records.
supermarket_one_hot = pd.get_dummies(pd.DataFrame(data))

# Find the dummy columns whose name ends with a quoted question mark ('?'),
# presumably the ARFF missing-value marker -- TODO confirm against the data.
# The pattern is a raw string so that `\?` reaches the regex engine as an
# escaped question mark instead of being an invalid string escape sequence.
# NOTE(review): the original author reported that this regex matched no
# columns in their environment; verify the generated column names.
cols_with_interrogation_mark = supermarket_one_hot.filter(regex=r"'\?'$", axis=1).columns

# Drop those columns, producing a new frame rather than mutating in place so
# that re-running the cell always starts from the freshly encoded data.
supermarket_one_hot = supermarket_one_hot.drop(columns=cols_with_interrogation_mark)

In [None]:
from mlxtend.frequent_patterns import apriori

# Mine the frequent itemsets of the one-hot encoded transactions, passing
# min_support=0.1 to apriori, then report how many were found.
apriori_result = apriori(
    supermarket_one_hot,
    min_support=0.1,
)
print('apriori_result n of rows : {}'.format(len(apriori_result)))

In [None]:
# Bar chart of the number of frequent itemsets per itemset size
# (i.e. per number of items contained), saved to fig/itemsets_frequencies.
import os

import matplotlib
import matplotlib.pyplot as plt

# Size (number of items) of every frequent itemset found by apriori.
itemset_sizes = apriori_result['itemsets'].map(len)

# An empty apriori result has no maximum size (max() would be NaN); fall back
# to 0 so the counting loop is skipped instead of failing in range().
max_itemsets_len = int(itemset_sizes.max()) if len(itemset_sizes) else 0

# Keys are the sizes as strings (reused as tick labels), values the number of
# itemsets having that size.
frequencies = {}
for i in range(1, max_itemsets_len + 1):
    frequencies[str(i)] = int((itemset_sizes == i).sum())

x = np.arange(len(frequencies))  # the label locations
width = 0.35  # the width of the bars
fig, ax = plt.subplots()

# There is a single series, so centre each bar on its tick; the previous
# `x - width/2` offset shifted every bar to the left of its label.
rects = ax.bar(x, list(frequencies.values()), width, label='itemsets')

# Add value to each bar, using the integer count itself as the label text.
for rect, count in zip(rects, frequencies.values()):
    height = rect.get_height()
    ax.annotate('{}'.format(count),
                xy=(rect.get_x() + rect.get_width() / 2, height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha='center', va='bottom')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Frequency')
ax.set_xlabel('Items contained')
ax.set_title('itemsets frequencies')
ax.set_xticks(x)
ax.set_xticklabels(list(frequencies.keys()))
ax.legend()

# Make sure the output directory exists, otherwise savefig raises on a fresh
# checkout; save through the figure object rather than pyplot's global state.
os.makedirs('fig', exist_ok=True)
fig.savefig('fig/itemsets_frequencies')
plt.close(fig)

In [None]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets. Only min_threshold is
# passed, so the metric it applies to is whatever association_rules uses by
# default.
min_threshold = 0.7
dataset_association_rules = association_rules(
    apriori_result,
    min_threshold=min_threshold,
)

# Report the number of rules and show the first one as a sample.
rule_count = len(dataset_association_rules)
print('Number of association rules : {}'.format(rule_count))
print('-------------')
print('First association rule : ')
print(dataset_association_rules.iloc[0])

In [None]:
# Keep only the rules made of five items: four antecedents and exactly one
# consequent.
antecedent_sizes = dataset_association_rules['antecedents'].map(len)
consequent_sizes = dataset_association_rules['consequents'].map(len)

# A single boolean mask selects the rows satisfying both size conditions.
has_five_items = (antecedent_sizes == 4) & (consequent_sizes == 1)
rules_with_5_items = dataset_association_rules.loc[has_five_items]

print('Number of rules with 5 items : {}'.format(len(rules_with_5_items)))

In [None]:
# Display pandas' describe() summary of the five-item rules; left as the bare
# last expression so the notebook renders the resulting table.
rules_with_5_items.describe()

In [None]:
# Print, for each quality metric, the rule with the highest value of that
# metric among all generated association rules.
metrics = ('confidence', 'lift', 'leverage', 'conviction')

for metric in metrics:
    print('--------------------------------------------------------------------------')
    print('Best association rule for metric {}'.format(metric))
    print('-----------------------------------------------------')
    # idxmax() returns an index *label*, so the row must be fetched with .loc.
    # .iloc expects a position and only happens to agree with the label while
    # the frame keeps its default 0..n-1 index.
    best_rule_label = dataset_association_rules[metric].idxmax()
    print(dataset_association_rules.loc[best_rule_label])
    print('--------------------------------------------------------------------------')
