In [47]:
#Learning how to apply association using the apriori algorithm
#Example1:Retail Dataset
#First download aprirori algoritm by doing !pip install apyori

In [1]:
#Import the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats.contingency import association
%matplotlib inline
from apyori import apriori

In [49]:
#Read external data(Retail dataset)
retail = pd.read_csv("retail_dataset.csv",sep=",")
retail.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


In [50]:
retail.isnull().sum()

0      0
1     30
2     70
3    128
4    182
5    244
6    274
dtype: int64

In [51]:
transactions = retail.apply(lambda row: row.dropna().tolist(), axis=1)
print(transactions[10])

['Cheese', 'Meat', 'Eggs', 'Milk', 'Wine']


In [52]:
#Data understanding
transactions.describe()

count                      315
unique                     157
top       [Bread, Bagel, Milk]
freq                        25
dtype: object

In [None]:
transactions.head()

In [18]:
transactions.dtypes

dtype('O')

In [19]:
#association rule
association_rules = apriori(transactions, min_support=0.0045, min_confidence=0.7, min_lift=3,min_length=2)
association_results = list(association_rules)

In [21]:
#Display the rules
print(len(association_results))

2


In [22]:
#Print the first item in the association rules
print(association_results[0])

RelationRecord(items=frozenset({'Meat', 'Milk', 'Diaper', 'Bagel', 'Bread', 'Pencil', 'Eggs'}), support=0.006349206349206349, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Milk', 'Meat', 'Diaper', 'Pencil', 'Eggs'}), items_add=frozenset({'Bagel', 'Bread'}), confidence=1.0, lift=3.5795454545454546)])


In [26]:
#Print detailed rules as shown in the worksheet example
for rule in association_results:

    # Extract support
    support = rule.support

    # Each rule may contain multiple OrderedStatistics
    for stat in rule.ordered_statistics:

        base_items = list(stat.items_base)
        add_items = list(stat.items_add)

        # Print rule
        print("Rule: " + ", ".join(base_items) + " -> " + ", ".join(add_items))

        # Print metrics
        print("Support:", support)
        print("Confidence:", stat.confidence)
        print("Lift:", stat.lift)
        print("=====================================")

Rule: Milk, Meat, Diaper, Pencil, Eggs -> Bagel, Bread
Support: 0.006349206349206349
Confidence: 1.0
Lift: 3.5795454545454546
Rule: Milk, Meat, Cheese, Diaper, Pencil -> Wine, Bread
Support: 0.009523809523809525
Confidence: 1.0
Lift: 4.090909090909091


In [23]:
#playing around with f string
for rule in association_results:
    for stat in rule.ordered_statistics:
        base_items = list(stat.items_base)
        add_items = list(stat.items_add)

        print(f"Rule: {base_items} -> {add_items}")

Rule: ['Milk', 'Meat', 'Diaper', 'Pencil', 'Eggs'] -> ['Bagel', 'Bread']
Rule: ['Milk', 'Meat', 'Cheese', 'Diaper', 'Pencil'] -> ['Wine', 'Bread']


In [25]:
#my version using dictionary and df to make it cleaner
rules_list=[]

for rule in association_results:
    for stat in rule.ordered_statistics:
        rules_list.append({
            "Base":list(stat.items_base),
            "Add":list(stat.items_add),
            "Support":rule.support,
            "Confidence":stat.confidence,
            "Lift":stat.lift,
        })

rules_df = pd.DataFrame(rules_list)
print(rules_df)


                                   Base             Add   Support  Confidence  \
0    [Milk, Meat, Diaper, Pencil, Eggs]  [Bagel, Bread]  0.006349         1.0   
1  [Milk, Meat, Cheese, Diaper, Pencil]   [Wine, Bread]  0.009524         1.0   

       Lift  
0  3.579545  
1  4.090909  


In [2]:
#Example 2:Market basket Dataset
#import dataset
market_basket = pd.read_csv("Market_Basket.csv",header=None)
num_records = len(market_basket)
print(num_records)

7501


In [3]:
#display the first few rows of the dataset
market_basket.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [4]:
#Dataset description
market_basket.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,7501,5747,4389,3345,2529,1864,1369,981,654,395,256,154,87,47,25,8,4,4,3,1
unique,115,117,115,114,110,106,102,98,88,80,66,50,43,28,19,8,3,3,3,1
top,mineral water,mineral water,mineral water,mineral water,green tea,french fries,green tea,green tea,green tea,green tea,low fat yogurt,green tea,green tea,green tea,magazines,salmon,frozen smoothie,protein bar,spinach,olive oil
freq,577,484,375,201,153,107,96,67,57,31,22,15,8,4,3,1,2,2,1,1


In [5]:
#display shape
market_basket.shape

(7501, 20)

In [7]:
records = []
for i in range(len(market_basket)):
    row = []
    for j in range(7):
        val = market_basket.values[i, j]
        if pd.notna(val):       # keep only real values
            row.append(str(val))
    records.append(row)

In [8]:
print("Unique items in records:", len(set(item for row in records for item in row)))

Unique items in records: 119


In [11]:
set_records = set(item for row in records for item in row)
set_mb = set(item for row in mb_processed for item in row)

print("Items in mb_processed but NOT in records:")
print(set_mb - set_records)

print("Items in records but NOT in mb_processed:")
print(set_records - set_mb)

Items in mb_processed but NOT in records:
{' asparagus'}
Items in records but NOT in mb_processed:
set()


In [15]:
#Data preprocessing convert data
mb_processed = market_basket.apply(
    lambda row: [str(x).strip() for x in row.dropna()],axis=1).tolist()
#print([mb_processed[3]])

In [16]:
print("Unique items in mb_processed:",len(set(item for row in mb_processed for item in row)))


Unique items in mb_processed: 119


In [17]:
#Applying Apirori Algoritm, association rule apriori
association_rules = apriori(mb_processed, min_support=0.006,min_confidence=0.2,min_lift=3,min_length=2)
association_results = list(association_rules)

#print the total no.of rules mined by the apriori class
print(len(association_results))

8


In [18]:
#print the first item in the association rules
print(association_results[0])

RelationRecord(items=frozenset({'ground beef', 'herb & pepper'}), support=0.015997866951073192, ordered_statistics=[OrderedStatistic(items_base=frozenset({'herb & pepper'}), items_add=frozenset({'ground beef'}), confidence=0.3234501347708895, lift=3.2919938411349285)])


In [20]:
#Print detailed rules
rules_list=[]

for rule in association_results:
    for stat in rule.ordered_statistics:
        rules_list.append({
            "Antecedent":list(stat.items_base),
            "Consequent":list(stat.items_add),
            "Support":rule.support,
            "Confidence":stat.confidence,
            "Lift":stat.lift,
        })

rules_df = pd.DataFrame(rules_list)
print(rules_df)

                       Antecedent           Consequent   Support  Confidence  \
0                 [herb & pepper]        [ground beef]  0.015998    0.323450   
1             [whole wheat pasta]          [olive oil]  0.007999    0.271493   
2  [frozen vegetables, spaghetti]        [ground beef]  0.008666    0.311005   
3         [shrimp, mineral water]  [frozen vegetables]  0.007199    0.305085   
4  [frozen vegetables, spaghetti]           [tomatoes]  0.006666    0.239234   
5           [tomatoes, spaghetti]  [frozen vegetables]  0.006666    0.318471   
6  [mineral water, herb & pepper]        [ground beef]  0.006666    0.390625   
7      [spaghetti, herb & pepper]        [ground beef]  0.006399    0.393443   
8               [milk, spaghetti]          [olive oil]  0.007199    0.203008   

       Lift  
0  3.291994  
1  4.122410  
2  3.165328  
3  3.200616  
4  3.498046  
5  3.341054  
6  3.975683  
7  4.004360  
8  3.082509  
