In [1]:
# In this notebook:



In [2]:
import numpy as np
import pandas as pd
from apyori import apriori
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
tut_url = 'https://stackabuse.com/association-rule-mining-via-apriori-algorithm-in-python/'

In [4]:
sorig = pd.read_csv('store_data.csv', header=None)

print(f'Dataset shape is {sorig.shape}')
sorig.head()
# dataset is not very long

Dataset shape is (7501, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [5]:
# Apriori library we are going to use requires our dataset to be in the form of a list of lists,
records = []  
for i in range(0, sorig.shape[0]):  
    records.append([str(sorig.values[i,j]) for j in range(0, sorig.shape[1])])
    
records[:2]

[['shrimp',
  'almonds',
  'avocado',
  'vegetables mix',
  'green grapes',
  'whole weat flour',
  'yams',
  'cottage cheese',
  'energy drink',
  'tomato juice',
  'low fat yogurt',
  'green tea',
  'honey',
  'salad',
  'mineral water',
  'salmon',
  'antioxydant juice',
  'frozen smoothie',
  'spinach',
  'olive oil'],
 ['burgers',
  'meatballs',
  'eggs',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan']]

In [6]:
# min_support - select the items with support values greater than the value specified by the parameter.
# min_confidence - filters those rules that have confidence greater than the confidence threshold specified by the parameter
association_rules = apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)  
print(f'apriori returns type {type(association_rules)}')

apriori returns type <class 'generator'>


In [7]:
# example of apriori output
association_results[0]

RelationRecord(items=frozenset({'light cream', 'chicken'}), support=0.004532728969470737, ordered_statistics=[OrderedStatistic(items_base=frozenset({'light cream'}), items_add=frozenset({'chicken'}), confidence=0.29059829059829057, lift=4.84395061728395)])

In [8]:
for item in association_results:

    # first index of the inner list
    # Contains base item and add item
    pair = item[0] 
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])

    #second index of the inner list
    print("Support: " + str(item[1]))

    #third index of the list located at 0th
    #of the third index of the inner list

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")

Rule: light cream -> chicken
Support: 0.004532728969470737
Confidence: 0.29059829059829057
Lift: 4.84395061728395
Rule: escalope -> mushroom cream sauce
Support: 0.005732568990801226
Confidence: 0.3006993006993007
Lift: 3.790832696715049
Rule: escalope -> pasta
Support: 0.005865884548726837
Confidence: 0.3728813559322034
Lift: 4.700811850163794
Rule: herb & pepper -> ground beef
Support: 0.015997866951073192
Confidence: 0.3234501347708895
Lift: 3.2919938411349285
Rule: ground beef -> tomato sauce
Support: 0.005332622317024397
Confidence: 0.3773584905660377
Lift: 3.840659481324083
Rule: whole wheat pasta -> olive oil
Support: 0.007998933475536596
Confidence: 0.2714932126696833
Lift: 4.122410097642296
Rule: pasta -> shrimp
Support: 0.005065991201173177
Confidence: 0.3220338983050847
Lift: 4.506672147735896
Rule: light cream -> nan
Support: 0.004532728969470737
Confidence: 0.29059829059829057
Lift: 4.84395061728395
Rule: chocolate -> frozen vegetables
Support: 0.005332622317024397
Confide

In [9]:
# Data explanation from tutorial:

# The second rule states that mushroom cream sauce and escalope are bought frequently. 
# The support for mushroom cream sauce is 0.0057. 
# The confidence for this rule is 0.3006 which means that out of all the transactions 
# containing mushroom, 30.06% of the transactions are likely to contain escalope as well. 
# Finally, lift of 3.79 shows that the escalope is 3.79 more likely to be bought by the 
# customers that buy mushroom cream sauce, compared to its default sale.

In [10]:
# min_support - select the items with support values greater than the value specified by the parameter.
# min_confidence - filters those rules that have confidence greater than the confidence threshold specified by the parameter
association_rules = apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)  
print(f'apriori returns type {type(association_rules)}')

apriori returns type <class 'generator'>


## Experimentation

In [11]:
sorig.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [12]:
unique_items = []  
for i in range(0, sorig.shape[0]):
    for j in range(0, sorig.shape[1]):
        unique_items.append(str(sorig.values[i,j]))
                             
unique_items = set(unique_items)

print(f'There are {len(unique_items)} unique items')

There are 121 unique items


In [13]:
# scaling up parameters yield fewer outputs
# items_base as associated with items_add
# high confidence threshold filter out the flipped versions of association
new_association_rules = apriori(records, min_support=0.004, min_confidence=0.4, min_lift=3)
new_association_results = list(new_association_rules)

print(f'Length of new asso: {len(new_association_results)}\n')
print(new_association_results[0])
print('\n')
print(new_association_results[1])

Length of new asso: 8

RelationRecord(items=frozenset({'cooking oil', 'ground beef', 'spaghetti'}), support=0.004799360085321957, ordered_statistics=[OrderedStatistic(items_base=frozenset({'cooking oil', 'ground beef'}), items_add=frozenset({'spaghetti'}), confidence=0.5714285714285714, lift=3.2819951870487856)])


RelationRecord(items=frozenset({'milk', 'frozen vegetables', 'olive oil'}), support=0.004799360085321957, ordered_statistics=[OrderedStatistic(items_base=frozenset({'frozen vegetables', 'olive oil'}), items_add=frozenset({'milk'}), confidence=0.4235294117647058, lift=3.2684095860566447)])


In [14]:
# This concludes the tutorial!