## Apriori Algorithm

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations

In [5]:
def print_frozenset(set_of_sets):
    """Print a collection of itemsets in brace notation, e.g. ``{{A}, {B, C}}``.

    Each inner collection is rendered as its members (converted with ``str``)
    joined by ", ". The rendered groups are joined by "}, {" and the whole
    line is wrapped in double braces, so an empty outer collection prints
    ``{{}}``. Ordering follows the iteration order of the inputs.
    """
    rendered_groups = []
    for members in set_of_sets:
        rendered_groups.append(", ".join(str(member) for member in members))
    print("{{" + "}, {".join(rendered_groups) + "}}")

In [2]:
# Load the transaction table from data.txt (parsed as CSV) and display it.
df = pd.read_csv("data.txt")
df

Unnamed: 0,TransactionId,Items
0,1,ACD
1,2,BCE
2,3,ABCE
3,4,BE


In [3]:
# Pull the second column of df (position 1) into a plain Python list,
# one entry per row in row order, instead of appending cell by cell.
transactions = df.iloc[:, 1].tolist()

print(transactions)

['ACD', 'BCE', 'ABCE', 'BE']


In [4]:
# Tally how many times each individual item occurs across all transactions.
# Iterating a transaction yields its items one at a time.
item_counts = defaultdict(int)
for single_item in (entry for basket in transactions for entry in basket):
    item_counts[single_item] += 1

# Show the tallies as a one-column table indexed by item.
pd.DataFrame.from_dict(item_counts, columns=['Freq'], orient='index')

Unnamed: 0,Freq
A,2
C,3
D,1
B,3
E,3


In [6]:
# Minimum tally an itemset needs in order to be kept as frequent.
min_support = 2

# Keep every single item whose tally reaches min_support, wrapped as a
# one-element frozenset so it can be stored inside a set.
frequent_1_itemsets = set()
for item, count in item_counts.items():
    if count >= min_support:
        frequent_1_itemsets.add(frozenset([item]))

print("1_itemset : ")
print_frozenset(frequent_1_itemsets)

1_itemset : 
{{A}, {B}, {E}, {C}}


In [7]:
def generate_candidates(prev_itemsets, k):
    """Return the set of size-``k`` unions formed from pairs of ``prev_itemsets``.

    Every ordered pair is considered, including an itemset paired with
    itself, and a union is kept only when it has exactly ``k`` members.
    """
    pairwise_unions = (
        left.union(right)
        for left in prev_itemsets
        for right in prev_itemsets
    )
    return {merged for merged in pairwise_unions if len(merged) == k}

In [8]:
def prune_itemsets(candidate_itemsets, prev_frequent_itemsets):
    """Keep only candidates whose one-smaller subsets are all already frequent.

    For each candidate, every subset obtained by dropping exactly one member
    must be present in ``prev_frequent_itemsets``; otherwise the candidate
    is discarded. Returns the surviving candidates as a set.
    """
    def every_subset_is_frequent(candidate):
        smaller_subsets = combinations(candidate, len(candidate) - 1)
        return all(
            frozenset(members) in prev_frequent_itemsets
            for members in smaller_subsets
        )

    return {
        candidate
        for candidate in candidate_itemsets
        if every_subset_is_frequent(candidate)
    }

In [37]:
def calculate_support_counts(transactions, itemsets):
    """Count, for each itemset, the transactions that contain all of its items.

    Parameters
    ----------
    transactions : iterable
        Each transaction is itself an iterable of items (for example a string
        of single-character items); it is converted to a set for the subset
        test. May be a one-shot iterable.
    itemsets : iterable
        Candidate itemsets. Each must be hashable and provide ``issubset``
        (e.g. ``frozenset``), because it is used as a key in the result.

    Returns
    -------
    collections.defaultdict
        Maps each itemset to the number of transactions containing it.
        Itemsets contained in no transaction are absent from the result.
    """
    # Build every transaction's item set once up front. Rebuilding it inside
    # the nested loop repeats the same work for each candidate, and
    # re-iterating `transactions` per candidate breaks for one-shot iterables
    # (only the first candidate would ever see any transactions).
    transaction_sets = [set(transaction) for transaction in transactions]

    item_counts = defaultdict(int)
    for itemset in itemsets:
        for transaction_items in transaction_sets:
            if itemset.issubset(transaction_items):
                item_counts[itemset] += 1

    return item_counts

In [42]:
k = 2
# Work on a copy of the 1-itemsets: the loop below grows this set in place,
# and aliasing frequent_1_itemsets would add larger itemsets to it as well,
# so re-running this cell would start from a polluted seed.
frequent_itemsets = set(frequent_1_itemsets)

while True:
    # Build size-k candidates from everything found frequent so far.
    candidate_itemsets = generate_candidates(frequent_itemsets, k)
    print(f"{k}_itemset : ")
    print_frozenset(candidate_itemsets)

    # Prune candidates that have a (k-1)-subset which is not frequent.
    candidate_itemsets = prune_itemsets(candidate_itemsets, frequent_itemsets)
    print("\npruned itemset : ")
    print_frozenset(candidate_itemsets)

    # Count how many transactions contain each surviving candidate.
    # Stored under its own name so the per-item tallies in `item_counts`
    # (used to build frequent_1_itemsets) are not overwritten.
    support_counts = calculate_support_counts(transactions, candidate_itemsets)
    print("\nfrequency of itemsets : ")
    print(pd.DataFrame.from_dict(support_counts, orient='index', columns=['freq']))

    # Keep the candidates whose count reaches min_support.
    frequent_k_itemsets = {itemset for itemset, count in support_counts.items() if count >= min_support}
    print(f"\nfrequent_{k}_itemsets : ")
    print_frozenset(frequent_k_itemsets)
    print("-" * 40)

    # No frequent itemsets of this size means no larger one can be frequent.
    if not frequent_k_itemsets:
        break

    frequent_itemsets.update(frequent_k_itemsets)
    k += 1

2_itemset : 
{{E, A}, {C, B}, {B, A}, {C, A}, {B, E}, {C, E}}

pruned itemset : 
{{E, A}, {C, B}, {B, A}, {C, A}, {B, E}, {C, E}}

frequency of itemsets : 
        freq
(E, A)     1
(C, B)     2
(B, A)     1
(C, A)     2
(B, E)     3
(C, E)     2

frequent_2_itemsets : 
{{C, A}, {C, B}, {B, E}, {C, E}}
----------------------------------------
3_itemset : 
{{E, B, A}, {C, B, A}, {C, B, E}, {C, E, A}}

pruned itemset : 
{{C, B, E}}

frequency of itemsets : 
           freq
(C, B, E)     2

frequent_3_itemsets : 
{{C, B, E}}
----------------------------------------
4_itemset : 
{{C, B, A, E}}

pruned itemset : 
{{}}

frequency of itemsets : 
Empty DataFrame
Columns: [freq]
Index: []

frequent_4_itemsets : 
{{}}
----------------------------------------


In [43]:
# Report every frequent itemset accumulated across all itemset sizes.
print("Frequent Itemsets:")
print_frozenset(frequent_itemsets)

Frequent Itemsets:
{{A}, {C}, {C, B}, {C, A}, {B, E}, {B}, {C, B, E}, {C, E}, {E}}
