In [1]:
from collections import defaultdict
import itertools

def hash_level1(i, j, num_buckets):
    """Hash function for level 1: (i * j) % num_buckets."""
    return (i * j) % num_buckets

def hash_level2(i, j, num_buckets):
    """Hash function for level 2: (i + j) % num_buckets."""
    return (i + j) % num_buckets

def apriori_with_hashing(transactions, min_support, num_buckets):
    """Find frequent itemsets using Apriori with hashing."""
    # Count single item frequencies
    item_count = defaultdict(int)
    for transaction in transactions:
        for item in transaction:
            item_count[item] += 1
    frequent_items = {item for item, count in item_count.items() if count >= min_support}

    # Filter transactions to include only frequent items
    filtered_transactions = [[item for item in transaction if item in frequent_items] for transaction in transactions]

    # Generate candidate pairs and count using hashing
    bucket_count_level1 = [0] * num_buckets
    bucket_count_level2 = [0] * num_buckets
    for transaction in filtered_transactions:
        for item1, item2 in itertools.combinations(transaction, 2):
            bucket_count_level1[hash_level1(item1, item2, num_buckets)] += 1
            bucket_count_level2[hash_level2(item1, item2, num_buckets)] += 1

    # Count frequent pairs
    pair_count = defaultdict(int)
    for transaction in filtered_transactions:
        for item1, item2 in itertools.combinations(transaction, 2):
            if (bucket_count_level1[hash_level1(item1, item2, num_buckets)] >= min_support and
                bucket_count_level2[hash_level2(item1, item2, num_buckets)] >= min_support):
                pair_count[(item1, item2)] += 1

    return frequent_items, {pair for pair, count in pair_count.items() if count >= min_support}

def bucket_frequent_pairs(frequent_pairs, num_buckets):
    """Place frequent pairs into buckets based on hash function values."""
    buckets_level1 = defaultdict(list)
    buckets_level2 = defaultdict(list)
    for item1, item2 in frequent_pairs:
        buckets_level1[hash_level1(item1, item2, num_buckets)].append((item1, item2))
        buckets_level2[hash_level2(item1, item2, num_buckets)].append((item1, item2))
    return buckets_level1, buckets_level2

def print_bucketed_pairs(buckets_level1, buckets_level2):
    """Print the bucketed pairs."""
    print("\nFrequent Pairs Bucketed by Hash Level 1:")
    for bucket_index, pairs in buckets_level1.items():
        print(f"Bucket {bucket_index}: {pairs}")

    print("\nFrequent Pairs Bucketed by Hash Level 2:")
    for bucket_index, pairs in buckets_level2.items():
        print(f"Bucket {bucket_index}: {pairs}")

def calculate_and_print_buckets(frequent_pairs, num_buckets):
    """Calculate and print the bucket numbers for given frequent pairs."""
    print("\nCalculating Bucket Numbers for Frequent Pairs:")
    for pair in frequent_pairs:
        item1, item2 = pair
        print(f"Pair {pair}:")
        print(f"    Level 1 Bucket: {hash_level1(item1, item2, num_buckets)}")
        print(f"    Level 2 Bucket: {hash_level2(item1, item2, num_buckets)}")

def main():
    transactions = [
        [1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], [1, 3, 5],
        [2, 4, 6], [1, 3, 4], [2, 4, 5], [3, 5, 6], [1, 2, 4],
        [2, 3, 5], [3, 4, 6]
    ]
    min_support = 4  # Minimum support threshold
    num_buckets = 11  # Number of buckets for hashing

    frequent_items, frequent_pairs = apriori_with_hashing(transactions, min_support, num_buckets)

    print("Frequent Items:", frequent_items)
    print("Frequent Pairs:", frequent_pairs)

    # Bucket and print the frequent pairs
    buckets_level1, buckets_level2 = bucket_frequent_pairs(frequent_pairs, num_buckets)
    print_bucketed_pairs(buckets_level1, buckets_level2)

    # Calculate and print the bucket numbers for the frequent pairs
    calculate_and_print_buckets(frequent_pairs, num_buckets)

if __name__ == '__main__':
    main()

Frequent Items: {1, 2, 3, 4, 5, 6}
Frequent Pairs: {(2, 4), (3, 4), (3, 5)}

Frequent Pairs Bucketed by Hash Level 1:
Bucket 8: [(2, 4)]
Bucket 1: [(3, 4)]
Bucket 4: [(3, 5)]

Frequent Pairs Bucketed by Hash Level 2:
Bucket 6: [(2, 4)]
Bucket 7: [(3, 4)]
Bucket 8: [(3, 5)]

Calculating Bucket Numbers for Frequent Pairs:
Pair (2, 4):
    Level 1 Bucket: 8
    Level 2 Bucket: 6
Pair (3, 4):
    Level 1 Bucket: 1
    Level 2 Bucket: 7
Pair (3, 5):
    Level 1 Bucket: 4
    Level 2 Bucket: 8


#Second Example

In [3]:
from collections import defaultdict
import itertools

def load_transactions(file_path):
    """Loads transactions from a file and splits items by spaces."""
    with open(file_path, 'r') as f:
        transactions = [line.strip().split() for line in f]
    return transactions

def convert_to_numeric(transactions):
    """Converts item names to numeric IDs."""
    item_to_id = {}
    id_to_item = {}
    current_id = 0

    for transaction in transactions:
        for item in transaction:
            if item not in item_to_id:
                item_to_id[item] = current_id
                id_to_item[current_id] = item
                current_id += 1

    numeric_transactions = [
        [item_to_id[item] for item in transaction]
        for transaction in transactions
    ]
    return numeric_transactions, item_to_id, id_to_item

def hash_level1(i, j):
    """Hash function for level 1: (i * j) % 11."""
    return (i * j) % 11

def hash_level2(i, j):
    """Hash function for level 2: (i + j) % 11."""
    return (i + j) % 11

def apriori_with_hashing(transactions, min_support, num_buckets):
    """Find frequent itemsets using Apriori with hashing."""

    def count_single_items(transactions):
        """Count occurrences of single items."""
        count = defaultdict(int)
        for transaction in transactions:
            for item in transaction:
                count[item] += 1
        return count

    def generate_candidate_pairs(itemsets, transactions):
        """Generate candidate pairs and count their occurrences using hashing."""
        bucket_count_level1 = [0] * num_buckets
        bucket_count_level2 = [0] * num_buckets

        for transaction in transactions:
            for item1, item2 in itertools.combinations(transaction, 2):
                bucket_index_level1 = hash_level1(item1, item2)
                bucket_index_level2 = hash_level2(item1, item2)

                bucket_count_level1[bucket_index_level1] += 1
                bucket_count_level2[bucket_index_level2] += 1

        return bucket_count_level1, bucket_count_level2

    def count_frequent_pairs(transactions, bucket_count_level1, bucket_count_level2):
        """Count the frequency of pairs that passed the hash table filter."""
        pair_count = defaultdict(int)

        for transaction in transactions:
            for item1, item2 in itertools.combinations(transaction, 2):
                bucket_index_level1 = hash_level1(item1, item2)
                bucket_index_level2 = hash_level2(item1, item2)

                if bucket_count_level1[bucket_index_level1] >= min_support:
                    if bucket_count_level2[bucket_index_level2] >= min_support:
                        pair_count[(item1, item2)] += 1

        return pair_count

    # Step 1: Find frequent 1-itemsets
    single_item_count = count_single_items(transactions)
    frequent_items = {item for item, count in single_item_count.items() if count >= min_support}

    # Step 2: Generate candidate pairs
    filtered_transactions = [
        [item for item in transaction if item in frequent_items]
        for transaction in transactions
    ]

    bucket_count_level1, bucket_count_level2 = generate_candidate_pairs(frequent_items, filtered_transactions)

    # Step 3: Count frequent pairs
    frequent_pairs_count = count_frequent_pairs(filtered_transactions, bucket_count_level1, bucket_count_level2)
    frequent_pairs = {
        pair for pair, count in frequent_pairs_count.items() if count >= min_support
    }

    return frequent_items, frequent_pairs

def bucket_frequent_pairs(frequent_pairs):
    """Place frequent pairs into buckets based on hash function values."""
    buckets_level1 = defaultdict(list)
    buckets_level2 = defaultdict(list)

    for (item1, item2) in frequent_pairs:
        bucket_index_level1 = hash_level1(item1, item2)
        bucket_index_level2 = hash_level2(item1, item2)

        buckets_level1[bucket_index_level1].append((item1, item2))
        buckets_level2[bucket_index_level2].append((item1, item2))

    return buckets_level1, buckets_level2

def print_bucketed_pairs(buckets_level1, buckets_level2):
    """Print the bucketed pairs."""
    print("\nFrequent Pairs Bucketed by Hash Level 1:")
    for bucket_index, pairs in buckets_level1.items():
        print(f"Bucket {bucket_index}: {pairs}")

    print("\nFrequent Pairs Bucketed by Hash Level 2:")
    for bucket_index, pairs in buckets_level2.items():
        print(f"Bucket {bucket_index}: {pairs}")

def calculate_and_print_buckets(frequent_pairs):
    """Calculate and print the bucket numbers for given frequent pairs."""
    print("\nCalculating Bucket Numbers for Frequent Pairs:")

    for pair in frequent_pairs:
        item1, item2 = pair
        bucket_index_level1 = hash_level1(item1, item2)
        bucket_index_level2 = hash_level2(item1, item2)

        print(f"Pair {pair}:")
        print(f"    Level 1 Bucket: {bucket_index_level1}")
        print(f"    Level 2 Bucket: {bucket_index_level2}")

def main():
    file_path = '/content/data.txt'
    min_support = 4  # Minimum support threshold
    num_buckets = 11  # Number of buckets for hashing

    transactions = load_transactions(file_path)
    numeric_transactions, item_to_id, id_to_item = convert_to_numeric(transactions)

    frequent_items, frequent_pairs = apriori_with_hashing(numeric_transactions, min_support, num_buckets)

    print("Frequent Items (Numeric):", frequent_items)
    print("Frequent Pairs (Numeric):", frequent_pairs)

    # Convert numeric results back to item names
    item_names = {v: k for k, v in item_to_id.items()}
    frequent_items_names = {item_names[item] for item in frequent_items}
    frequent_pairs_names = {(item_names[pair[0]], item_names[pair[1]]) for pair in frequent_pairs}

    print("Frequent Items:", frequent_items_names)
    print("Frequent Pairs:", frequent_pairs_names)

    # Bucket and print the frequent pairs
    buckets_level1, buckets_level2 = bucket_frequent_pairs(frequent_pairs)
    print_bucketed_pairs(buckets_level1, buckets_level2)

    # Calculate and print the bucket numbers for the frequent pairs
    calculate_and_print_buckets(frequent_pairs)

if __name__ == '__main__':
    main()

Frequent Items (Numeric): {0, 1, 2, 3}
Frequent Pairs (Numeric): {(0, 1), (1, 2), (2, 1), (3, 1), (0, 3), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2), (1, 3)}
Frequent Items: {'bread', 'milk', 'butter', 'eggs'}
Frequent Pairs: {('milk', 'butter'), ('butter', 'milk'), ('bread', 'eggs'), ('eggs', 'bread'), ('bread', 'butter'), ('milk', 'bread'), ('butter', 'eggs'), ('butter', 'bread'), ('bread', 'milk'), ('eggs', 'butter'), ('milk', 'eggs'), ('eggs', 'milk')}

Frequent Pairs Bucketed by Hash Level 1:
Bucket 0: [(0, 1), (0, 3), (2, 0), (3, 0), (0, 2), (1, 0)]
Bucket 2: [(1, 2), (2, 1)]
Bucket 3: [(3, 1), (1, 3)]
Bucket 6: [(2, 3), (3, 2)]

Frequent Pairs Bucketed by Hash Level 2:
Bucket 1: [(0, 1), (1, 0)]
Bucket 3: [(1, 2), (2, 1), (0, 3), (3, 0)]
Bucket 4: [(3, 1), (1, 3)]
Bucket 2: [(2, 0), (0, 2)]
Bucket 5: [(2, 3), (3, 2)]

Calculating Bucket Numbers for Frequent Pairs:
Pair (0, 1):
    Level 1 Bucket: 0
    Level 2 Bucket: 1
Pair (1, 2):
    Level 1 Bucket: 2
    Level 2 Bucket: 