In [1]:
import pandas as pd
from itertools import chain, combinations

In [35]:
# Load the raw basket data; the path is relative to the current working directory.
data = pd.read_csv('Dataset/basket.csv')

def preprocess_data(df):
    """Turn every row of ``df`` into a set of its distinct non-missing values.

    Each row is treated as one transaction: missing cells are dropped and the
    remaining values are collected into a set. Rows that end up with no
    values at all are left out of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        One transaction per row; the cells hold the items.

    Returns
    -------
    list of set
        One set per non-empty row, in row order.
    """
    # Lazily build the item set for each row; the row label is not needed.
    row_item_sets = (
        set(record.dropna().unique()) for _, record in df.iterrows()
    )
    # Keep only rows that contributed at least one item.
    return [items for items in row_item_sets if items]

# Convert the loaded frame into a list of per-row item sets.
transactions = preprocess_data(data)
# Preview the first five transactions as the cell output.
transactions[:5]

[{'pastry', 'salty snack', 'whole milk'},
 {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
 {'pickled vegetables', 'soda'},
 {'canned beer', 'misc. beverages'},
 {'hygiene articles', 'sausage'}]

In [42]:
def apriori(transactions, min_support):
    """Mine frequent itemsets from ``transactions`` with the Apriori algorithm.

    The support of an itemset is the fraction of transactions that contain
    every one of its items. An item that is repeated inside a single
    transaction is counted once for that transaction, so 1-itemset support is
    computed on the same basis as the support of larger itemsets.

    Parameters
    ----------
    transactions : sequence of iterables of hashable items
        The transactions to mine. Must support ``len()`` and be iterable
        more than once (for example a list of sets).
    min_support : float
        Inclusive lower bound on support for an itemset to be kept.

    Returns
    -------
    dict
        Maps the itemset size ``k`` to a list of the frequent k-itemsets
        (each a ``set``). Key ``1`` is always present (its list may be
        empty); larger sizes appear only when at least one frequent itemset
        of that size exists. The order of itemsets inside a list is
        unspecified.
    """
    total_transactions = len(transactions)

    def find_frequent_1_itemsets(transactions, min_support):
        # Number of transactions that contain each item.
        item_count = {}
        for transaction in transactions:
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # a repeated item contributes at most once per transaction.
            for item in dict.fromkeys(transaction):
                item_count[item] = item_count.get(item, 0) + 1

        # item_count is non-empty only if there is at least one transaction,
        # so the division below never sees a zero denominator.
        return [
            {item}
            for item, count in item_count.items()
            if count / len(transactions) >= min_support
        ]

    # Step 1: Find frequent 1-itemsets
    frequent_itemsets = find_frequent_1_itemsets(transactions, min_support)
    all_frequent_itemsets = {1: frequent_itemsets}

    k = 2
    while frequent_itemsets:
        # Hashable view of the previous level, giving constant-time
        # membership tests in the prune step.
        previous_level = {frozenset(itemset) for itemset in frequent_itemsets}

        # Step 2: Generate candidate k-itemsets by joining pairs of frequent
        # (k-1)-itemsets whose union has exactly k items.
        candidate_itemsets = set()
        for left, right in combinations(frequent_itemsets, 2):
            union_set = left | right
            if len(union_set) == k:
                candidate_itemsets.add(frozenset(union_set))

        # Step 3: Prune candidates that have any infrequent (k-1)-subset.
        pruned_candidates = {
            candidate
            for candidate in candidate_itemsets
            if all(
                frozenset(subset) in previous_level
                for subset in combinations(candidate, k - 1)
            )
        }
        if not pruned_candidates:
            # Nothing left to count; no larger frequent itemsets can exist.
            break

        # Step 4: Count the support of each pruned candidate itemset
        candidate_support = dict.fromkeys(pruned_candidates, 0)
        for transaction in transactions:
            for candidate in pruned_candidates:
                if candidate.issubset(transaction):
                    candidate_support[candidate] += 1

        # Step 5: Identify the frequent k-itemsets
        frequent_itemsets = [
            set(itemset)
            for itemset, count in candidate_support.items()
            if count / total_transactions >= min_support
        ]

        # Step 6: Record this level (only when non-empty) and move on
        if frequent_itemsets:
            all_frequent_itemsets[k] = frequent_itemsets
        k += 1

    return all_frequent_itemsets

In [45]:
# Minimum support threshold: an itemset is kept only if it appears in at
# least this fraction of all transactions (0.01 = 1%).
min_support = 0.01

# Apply the Apriori algorithm to find frequent itemsets
# (the result is a dict mapping itemset size k to a list of itemsets)
frequent_itemsets = apriori(transactions, min_support)

# Display the frequent itemsets
for k, itemsets in frequent_itemsets.items():
    print(f"Frequent {k}-itemsets: {itemsets}\n")

Frequent 1-itemsets: [{'salty snack'}, {'whole milk'}, {'pastry'}, {'sausage'}, {'yogurt'}, {'soda'}, {'canned beer'}, {'misc. beverages'}, {'hygiene articles'}, {'rolls/buns'}, {'frankfurter'}, {'whipped/sour cream'}, {'curd'}, {'beef'}, {'white bread'}, {'butter'}, {'frozen vegetables'}, {'other vegetables'}, {'sugar'}, {'tropical fruit'}, {'specialty chocolate'}, {'butter milk'}, {'frozen meals'}, {'root vegetables'}, {'pip fruit'}, {'chocolate'}, {'red/blush wine'}, {'shopping bags'}, {'margarine'}, {'bottled water'}, {'chicken'}, {'bottled beer'}, {'dessert'}, {'hamburger meat'}, {'domestic eggs'}, {'white wine'}, {'newspapers'}, {'herbs'}, {'coffee'}, {'UHT-milk'}, {'specialty bar'}, {'sliced cheese'}, {'candy'}, {'citrus fruit'}, {'grapes'}, {'brown bread'}, {'processed cheese'}, {'onions'}, {'hard cheese'}, {'napkins'}, {'meat'}, {'fruit/vegetable juice'}, {'soft cheese'}, {'oil'}, {'long life bakery product'}, {'beverages'}, {'berries'}, {'cream cheese '}, {'ham'}, {'pork'}, {