<a href="https://colab.research.google.com/github/varunchandra10/datamining_algorithms/blob/main/Association_rule_mining/Apriori_Algo/Apriori_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install the following packages:

- Pandas
- mlxtend

`    pip install pandas mlxtend    `

Download the dataset **grocery_dataset.csv** from GitHub and upload it through the **Files** panel in the Colab sidebar:
- Navigate to the `sample_data` folder.
- Right-click on it.
- Click **Upload** and select the CSV file.
- After uploading, right-click the CSV file, choose **Copy path**, and paste the path wherever a file path is required below.


# Implementation of Apriori Algorithm without Libraries

In [117]:
import pandas as pd
from itertools import combinations, chain
from collections import defaultdict

In [118]:
def load_transactions(df):
    """Convert a DataFrame of baskets into a list of transactions.

    Two layouts are supported:

    * One column: each cell holds a whole basket as text, either
      comma-separated (``"Milk, Bread"``) or list-like
      (``"['Milk', 'Bread']"``). Brackets and single quotes are removed
      from list-like cells before splitting on commas.
    * Several columns: each row is a basket with one item per cell;
      missing cells are dropped.

    Items are converted to ``str`` and stripped of surrounding whitespace;
    blank items are discarded.

    Args:
        df: pandas DataFrame with one row per transaction.

    Returns:
        A list with one entry per row of ``df``; each entry is a list of
        item strings. A row with no usable items yields an empty list.
    """
    transactions = []
    if df.shape[1] == 1:
        for value in df.iloc[:, 0]:
            # A missing cell must not be stringified: str(nan) == "nan"
            # would otherwise show up as a fake item named "nan".
            is_missing = (
                value is None
                or value is pd.NA
                or (isinstance(value, float) and value != value)
            )
            if is_missing:
                transactions.append([])
                continue

            row = str(value)
            if "[" in row:
                row = row.strip("[]").replace("'", "")
            transactions.append(
                [item.strip() for item in row.split(",") if item.strip()]
            )
    else:
        for _, row in df.iterrows():
            transactions.append(
                [str(item).strip() for item in row.dropna() if str(item).strip()]
            )
    return transactions

In [119]:
# Generate C1 - Initial candidate itemsets
def create_C1(transactions):
    """Build the candidate 1-itemsets (C1) with their support counts.

    Support is the number of transactions that contain the item, so an item
    repeated inside one transaction is counted once for that transaction.
    This matches ``count_candidates``, which tests membership against
    ``set(transaction)``.

    Args:
        transactions: iterable of transactions, each an iterable of
            hashable items.

    Returns:
        dict mapping ``frozenset([item])`` to its support count, with keys
        in first-seen order.
    """
    item_count = defaultdict(int)
    for transaction in transactions:
        # dict.fromkeys removes duplicates while keeping first-seen order,
        # so the printed C1 table stays in the order items are encountered.
        for item in dict.fromkeys(transaction):
            item_count[frozenset([item])] += 1
    return dict(item_count)

In [120]:
# Filter based on support threshold
def filter_candidates(candidates, min_support_count):
    """Return the candidates whose support count reaches the threshold.

    Args:
        candidates: dict mapping itemset -> support count.
        min_support_count: smallest count that is still considered frequent.

    Returns:
        A new dict holding only the entries with
        ``count >= min_support_count``, in their original order.
    """
    frequent = {}
    for itemset, support in candidates.items():
        if support >= min_support_count:
            frequent[itemset] = support
    return frequent

In [121]:
# Generate Ck from Lk-1
def generate_candidates(prev_Lk, k):
    """Produce the candidate k-itemsets (Ck) from the frequent (k-1)-itemsets.

    Join step: every pair of itemsets in ``prev_Lk`` is merged, and the
    merge is kept only if it has exactly ``k`` items.
    Prune step: a merged itemset survives only if each of its
    (k-1)-item subsets is itself a member of ``prev_Lk``.

    Args:
        prev_Lk: collection of frozensets, the frequent (k-1)-itemsets.
        k: size of the itemsets to generate.

    Returns:
        set of frozensets, the candidate k-itemsets.
    """
    previous_level = list(prev_Lk)
    candidates = set()
    for left, right in combinations(previous_level, 2):
        merged = left.union(right)
        if len(merged) != k:
            continue
        every_subset_frequent = all(
            frozenset(subset) in prev_Lk
            for subset in combinations(merged, k - 1)
        )
        if every_subset_frequent:
            candidates.add(frozenset(merged))
    return candidates

In [122]:
# Count support for candidates
def count_candidates(transactions, candidates):
    """Count how many transactions contain each candidate itemset.

    Args:
        transactions: iterable of transactions (each an iterable of items).
        candidates: collection of set-like itemsets supporting ``issubset``.

    Returns:
        dict mapping candidate -> number of transactions containing it.
        Candidates that occur in no transaction are absent from the result.
    """
    tally = {}
    for basket in map(set, transactions):
        for candidate in candidates:
            if not candidate.issubset(basket):
                continue
            tally[candidate] = tally.get(candidate, 0) + 1
    return tally

In [123]:
# Generate association rules with support counted manually if missing
def generate_rules(frequent_itemsets, transactions, min_confidence_percent):
    """Derive association rules from the frequent itemsets.

    For every frequent itemset with at least two items, each non-empty
    proper subset is used as an antecedent and the remaining items as the
    consequent. Confidence is ``support(itemset) / support(antecedent)``
    expressed as a percentage.

    Args:
        frequent_itemsets: dict mapping frozenset -> support count.
        transactions: list of transactions; only consulted when an
            antecedent's support is missing from ``frequent_itemsets``.
        min_confidence_percent: confidence threshold (0-100) for a rule to
            be reported as strong.

    Returns:
        ``(rules, strong_rules)``. Both are lists of
        ``(antecedent_set, consequent_set, confidence_rounded_to_2dp)``
        tuples in generation order; ``strong_rules`` holds the rules whose
        confidence is at least ``min_confidence_percent``.
    """
    rules = []
    strong_rules = []
    transaction_sets = None  # built lazily, only if a fallback count is needed

    for itemset, itemset_count in frequent_itemsets.items():
        if len(itemset) < 2:
            continue
        items = list(itemset)
        # Sizes 1 .. len-1 guarantee a non-empty antecedent and consequent.
        for size in range(1, len(items)):
            for antecedent in combinations(items, size):
                antecedent = frozenset(antecedent)
                consequent = itemset - antecedent

                # Manually count support of antecedent if not found
                antecedent_count = frequent_itemsets.get(antecedent)
                if antecedent_count is None:
                    if transaction_sets is None:
                        transaction_sets = [set(t) for t in transactions]
                    antecedent_count = sum(
                        1 for t_set in transaction_sets if antecedent.issubset(t_set)
                    )

                # Confidence is undefined when the antecedent never occurs;
                # skip the rule instead of dividing by zero.
                if not antecedent_count:
                    continue

                confidence = (itemset_count / antecedent_count) * 100
                rule = (set(antecedent), set(consequent), round(confidence, 2))
                rules.append(rule)

                # Compare the exact value: rounding first would let e.g.
                # 69.997% pass a 70% threshold.
                if confidence >= min_confidence_percent:
                    strong_rules.append(rule)

    return rules, strong_rules

In [124]:
# Main Apriori function
def apriori(transactions, min_support_count, min_confidence_percent):
    """Run the level-wise Apriori search and print every intermediate table.

    Starting from the 1-itemsets, each pass generates the candidate
    k-itemsets from the previous frequent level, counts them, and keeps
    those that reach ``min_support_count``. The search stops when a level
    has no frequent itemsets. Association rules are then derived from all
    frequent itemsets collected along the way.

    Args:
        transactions: list of transactions (each a list of items).
        min_support_count: minimum support count for a frequent itemset.
        min_confidence_percent: confidence threshold (0-100) for strong rules.

    Returns:
        ``(all_frequent_itemsets, all_rules, strong_rules)`` where
        ``all_frequent_itemsets`` maps frozenset -> support count,
        ``all_rules`` is sorted by confidence and then antecedent size
        (both descending), and ``strong_rules`` is returned in the order
        ``generate_rules`` produced it.
    """
    itemset_columns = ["Itemset", "Support Count"]
    rule_columns = ["Antecedent", "Consequent", "Confidence (%)"]

    def show_itemsets(title, itemset_counts):
        # Print a heading followed by the itemset/support table.
        print(title)
        print(pd.DataFrame(itemset_counts.items(), columns=itemset_columns), "\n")

    def rule_rank(rule):
        # Sort key: confidence first, then number of items in the antecedent.
        return (rule[2], len(rule[0]))

    print(f"✅ Minimum Support Count: {min_support_count}\n")

    first_candidates = create_C1(transactions)
    show_itemsets("🔹 C1 - Candidate 1-itemsets:", first_candidates)

    first_frequent = filter_candidates(first_candidates, min_support_count)
    show_itemsets("✅ L1 - Frequent 1-itemsets:", first_frequent)

    all_frequent_itemsets = dict(first_frequent)
    current_level = first_frequent
    k = 2

    while current_level:
        level_candidates = generate_candidates(set(current_level.keys()), k)
        candidate_counts = count_candidates(transactions, level_candidates)
        show_itemsets(f"🔹 C{k} - Candidate {k}-itemsets:", candidate_counts)

        next_level = filter_candidates(candidate_counts, min_support_count)
        if not next_level:
            break
        show_itemsets(f"✅ L{k} - Frequent {k}-itemsets:", next_level)

        all_frequent_itemsets.update(next_level)
        current_level = next_level
        k += 1

    all_rules, strong_rules = generate_rules(
        all_frequent_itemsets, transactions, min_confidence_percent
    )
    all_rules.sort(key=rule_rank, reverse=True)

    print("📋 All Association Rules:")
    print(pd.DataFrame(all_rules, columns=rule_columns))

    print(f"\n🔥 Strong Association Rules (Confidence ≥ {min_confidence_percent}%):")
    print(pd.DataFrame(strong_rules, columns=rule_columns))

    return all_frequent_itemsets, all_rules, strong_rules

In [125]:
# Load the grocery dataset and turn each row into a list of items.
# The path points at the Colab runtime; change it if the CSV was uploaded elsewhere.
file_path = "/content/sample_data/grocery_dataset.csv"
# NOTE(review): pd.read_csv treats the first row as a header by default —
# confirm the CSV has a header row, otherwise the first basket is lost.
df = pd.read_csv(file_path)
transactions = load_transactions(df)
print(f"\n📦 Total Transactions: {len(transactions)}")
print("🛒 Sample Transactions:")
# Show only the first three baskets as a sanity check.
for t in transactions[:3]:
    print("  ", t)


📦 Total Transactions: 60
🛒 Sample Transactions:
   ['Milk', 'Cheese']
   ['Eggs', 'Butter', 'Bread']
   ['Cereal', 'Apples']


In [126]:
# Parameters
# Minimum support count an itemset needs to be kept as frequent.
min_support_count = 3
# Confidence threshold in percent (0-100) used to select the strong rules.
min_confidence = 70

# Run Apriori: prints every C_k / L_k table and the rule tables, and returns
# the frequent itemsets, all rules, and the strong rules.
frequent_itemsets, all_rules, strong_rules = apriori(transactions, min_support_count, min_confidence)

✅ Minimum Support Count: 3

🔹 C1 - Candidate 1-itemsets:
     Itemset  Support Count
0     (Milk)             22
1   (Cheese)             18
2     (Eggs)             23
3   (Butter)             18
4    (Bread)             22
5   (Cereal)             19
6   (Apples)             24
7   (Yogurt)             21
8    (Juice)             13
9  (Bananas)             18 

✅ L1 - Frequent 1-itemsets:
     Itemset  Support Count
0     (Milk)             22
1   (Cheese)             18
2     (Eggs)             23
3   (Butter)             18
4    (Bread)             22
5   (Cereal)             19
6   (Apples)             24
7   (Yogurt)             21
8    (Juice)             13
9  (Bananas)             18 

🔹 C2 - Candidate 2-itemsets:
              Itemset  Support Count
0      (Milk, Cheese)              5
1       (Eggs, Bread)              7
2     (Butter, Bread)              7
3      (Butter, Eggs)              7
4    (Cereal, Apples)              8
5       (Milk, Bread)              7
6      

# Implementation of Apriori algorithm with Libraries

In [127]:
# Load the dataset for the library-based section.
file_path_lib = "/content/sample_data/grocery_dataset.csv" #file path is required
df_lib = pd.read_csv(file_path_lib)
# Rebuild the transaction lists from the frame loaded in this cell, so the
# count and the listing below describe the same data and this section does
# not rely on `transactions` left over from an earlier cell.
transactions = load_transactions(df_lib)
print(f"\n📦 Total Transactions: {len(df_lib)}")
for t in transactions:
    print("  ", t)


📦 Total Transactions: 60
   ['Milk', 'Cheese']
   ['Eggs', 'Butter', 'Bread']
   ['Cereal', 'Apples']
   ['Milk', 'Bread']
   ['Eggs', 'Yogurt', 'Milk']
   ['Yogurt', 'Apples', 'Eggs']
   ['Cereal', 'Cheese', 'Milk', 'Apples', 'Bread']
   ['Juice', 'Cheese', 'Butter', 'Bread', 'Bananas']
   ['Bread', 'Apples']
   ['Juice', 'Cereal']
   ['Milk', 'Bananas', 'Bread', 'Eggs']
   ['Yogurt', 'Cheese']
   ['Cereal', 'Eggs', 'Bread', 'Milk']
   ['Cheese', 'Bread', 'Eggs']
   ['Apples', 'Cheese']
   ['Juice', 'Butter', 'Cereal', 'Yogurt', 'Bread']
   ['Bread', 'Butter', 'Eggs', 'Cereal']
   ['Apples', 'Cheese', 'Eggs', 'Juice', 'Butter']
   ['Eggs', 'Milk']
   ['Apples', 'Cheese', 'Bread', 'Bananas']
   ['Eggs', 'Bananas', 'Apples', 'Juice']
   ['Butter', 'Cheese', 'Cereal', 'Bread', 'Juice']
   ['Cereal', 'Apples', 'Yogurt', 'Butter']
   ['Butter', 'Yogurt', 'Bananas']
   ['Milk', 'Bread']
   ['Butter', 'Apples', 'Bread']
   ['Apples', 'Bananas', 'Cheese', 'Yogurt', 'Milk']
   ['Yogurt', 'Che

In [129]:
from mlxtend.preprocessing import TransactionEncoder
# Imported under an alias so the hand-written `apriori` function defined earlier
# in this notebook is not overwritten; otherwise re-running the
# "without Libraries" cell after this one would call mlxtend's function
# with the wrong arguments.
from mlxtend.frequent_patterns import apriori as mlxtend_apriori, association_rules
import pandas as pd

# STEP 3: One-hot encoding
# Each transaction becomes a boolean row with one column per distinct item.
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df_lib = pd.DataFrame(te_array, columns=te.columns_)

# STEP 4: Run Apriori Algorithm
min_support = 0.04     # e.g. 4% of total transactions
min_confidence = 0.7   # 70%

frequent_itemsets = mlxtend_apriori(df_lib, min_support=min_support, use_colnames=True)
# support is a float fraction; round before casting so a product such as
# 6.999999999 becomes 7 instead of being truncated to 6.
frequent_itemsets['support_count'] = (frequent_itemsets['support'] * len(transactions)).round().astype(int)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

# STEP 5: Optional rule filtering
# Example filter: keeps rules whose antecedent has at least 1 item (i.e. every
# rule) and whose confidence meets the threshold. Raise the bound to 2 to show
# only rules with multi-item antecedents.
filtered_rules = rules[
    (rules['antecedents'].apply(lambda x: len(x) >= 1)) &
    (rules['confidence'] >= min_confidence)
]

# STEP 6: Show Results
print("\n✅ Frequent Itemsets:")
print(frequent_itemsets.sort_values(by='support', ascending=False).reset_index(drop=True))

print("\n🔗 Association Rules (Support & Confidence Only):")
print(filtered_rules[['antecedents', 'consequents', 'support', 'confidence']]
      .sort_values(by='confidence', ascending=False)
      .reset_index(drop=True))


✅ Frequent Itemsets:
     support                  itemsets  support_count
0   0.400000                  (Apples)             24
1   0.383333                    (Eggs)             23
2   0.366667                    (Milk)             22
3   0.366667                   (Bread)             22
4   0.350000                  (Yogurt)             21
..       ...                       ...            ...
73  0.050000       (Milk, Eggs, Bread)              3
74  0.050000  (Butter, Cereal, Yogurt)              3
75  0.050000   (Butter, Juice, Cheese)              3
76  0.050000    (Milk, Cereal, Cheese)              3
77  0.050000      (Milk, Cereal, Eggs)              3

[78 rows x 3 columns]

🔗 Association Rules (Support & Confidence Only):
         antecedents consequents   support  confidence
0     (Juice, Bread)    (Butter)  0.050000        1.00
1  (Bananas, Cereal)    (Apples)  0.066667        0.80
2  (Bananas, Cheese)    (Apples)  0.050000        0.75
3  (Bananas, Cheese)     (Bread)  0.0