<a href="https://colab.research.google.com/github/swapnil14g/dmdw/blob/main/Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import itertools
from collections import Counter

def openfile(filename):
    with open(filename, 'rb') as file:
        data = file.readlines()
    l=[]
    for row in data:
        l.append(row.strip().split())
    return l

def apriori(dataset, min_support):
    """
    Implements the Apriori algorithm to find frequent itemsets.

    Args:
        dataset: A list of transactions (lists of items).
        min_support: Minimum support threshold for an itemset to be considered frequent.

    Returns:
        A list of frequent itemsets.
    """

    frequent_itemsets = []
    C1 = Counter(item for transaction in dataset for item in transaction)
    L1 = [item for item, count in C1.items() if count >= min_support]
    frequent_itemsets.append(L1)

    k = 2
    while L1:
        Ck = []
        for i in range(len(L1) - 1):
            for j in range(i + 1, len(L1)):
                candidate = L1[i] + L1[j]
                subsets = [candidate[i:j] for i in range(k - 1)]
                if all(subset in frequent_itemsets[k - 2] for subset in subsets):
                    Ck.append(candidate)

        Lk = [item for item in Ck if count_support(item, dataset) >= min_support]
        frequent_itemsets.append(Lk)
        L1 = Lk
        k += 1

    return frequent_itemsets

def count_support(itemset, dataset):
    """
    Calculates the support of an itemset in the dataset.

    Args:
        itemset: A list of items.
        dataset: A list of transactions (lists of items).

    Returns:
        The support of the itemset.
    """

    count = 0
    for transaction in dataset:
        if set(itemset).issubset(set(transaction)):
            count += 1
    return count / len(dataset)

def create_mapping(frequent_itemsets):
    """
    Creates a mapping from frequent itemsets to unique identifiers.

    Args:
        frequent_itemsets: A list of frequent itemsets.

    Returns:
        A dictionary mapping frequent itemsets to unique identifiers.
    """

    mapping = {}
    for i, itemset in enumerate(frequent_itemsets):
        # Convert the itemset (list) to a tuple to make it hashable
        mapping[tuple(itemset)] = chr(65 + i)
    return mapping

def compress_dataset(dataset, frequent_itemsets, mapping):
    """
    Compresses the dataset using the given frequent itemsets and mapping.

    Args:
        dataset: A list of transactions (lists of items).
        frequent_itemsets: A list of frequent itemsets.
        mapping: A dictionary mapping frequent itemsets to unique identifiers.

    Returns:
        A list of compressed transactions.
    """

    compressed_dataset = []
    for transaction in dataset:
        compressed_transaction = []
        for itemset in frequent_itemsets:
            if set(itemset).issubset(set(transaction)):
                # Convert the itemset to a tuple before looking it up in the mapping
                compressed_transaction.append(mapping[tuple(itemset)])
        compressed_dataset.append(compressed_transaction)
    return compressed_dataset

def decompress_dataset(compressed_dataset, mapping):
    """
    Decompresses the compressed dataset using the given mapping.

    Args:
        compressed_dataset: A list of compressed transactions.
        mapping: A dictionary mapping frequent itemsets to unique identifiers.

    Returns:
        A list of original transactions.
    """

    original_dataset = []
    for compressed_transaction in compressed_dataset:
        original_transaction = []
        for identifier in compressed_transaction:
            if identifier in mapping:  # Check if identifier exists in mapping
                original_transaction.extend(mapping[identifier])
            else:
                # Handle missing identifier (e.g., log a warning or raise an exception)
                print(f"Warning: Identifier '{identifier}' not found in mapping.")
        original_dataset.append(original_transaction)
    return original_dataset

def calculate_compression_metrics(original_dataset, compressed_dataset, mapping):
    """
    Calculates compression ratio, storage savings, and compression percentage.

    Args:
        original_dataset: The original dataset.
        compressed_dataset: The compressed dataset.
        mapping: The mapping used for compression.

    Returns:
        A tuple containing compression ratio, storage savings, and compression percentage.
    """

    original_size = sum(len(transaction) for transaction in original_dataset)
    compressed_size = sum(len(transaction) for transaction in compressed_dataset) + len(mapping)

    compression_ratio = original_size / compressed_size
    storage_savings = original_size - compressed_size
    compression_percentage = (storage_savings / original_size) * 100

    return compression_ratio, storage_savings, compression_percentage


dataset = openfile('/content/drive/MyDrive/Colab Notebooks/D_small.dat')

min_support = 0.5
frequent_itemsets = apriori(dataset, min_support)
mapping = create_mapping(frequent_itemsets)
compressed_dataset = compress_dataset(dataset, frequent_itemsets, mapping)
decompressed_dataset = decompress_dataset(compressed_dataset, mapping)
compression_ratio, storage_savings, compression_percentage = calculate_compression_metrics(dataset, compressed_dataset, mapping)

print("Compression ratio:", compression_ratio)
print("Storage savings:", storage_savings)
print("Compression percentage:", compression_percentage)
print("Frequent itemsets:", frequent_itemsets)
print("Mapping:", mapping)
print("Compressed dataset:", compressed_dataset)
print("Decompressed dataset:", decompressed_dataset)

Compression ratio: 36.976860537836146
Storage savings: 115054
Compression percentage: 97.29560599397897
Frequent itemsets: [[b'1', b'3', b'5', b'7', b'9', b'11', b'13', b'15', b'17', b'19', b'21', b'23', b'25', b'27', b'29', b'31', b'34', b'36', b'38', b'40', b'42', b'44', b'46', b'48', b'50', b'52', b'54', b'56', b'58', b'60', b'62', b'64', b'66', b'68', b'70', b'72', b'74', b'12', b'16', b'20', b'47', b'51', b'63', b'24', b'65', b'43', b'32', b'73', b'4', b'33', b'39', b'71', b'69', b'10', b'18', b'14', b'8', b'49', b'55', b'6', b'37', b'28', b'26', b'75', b'57', b'45', b'22', b'2', b'67', b'35', b'53', b'41', b'61', b'30', b'59'], []]
Mapping: {(b'1', b'3', b'5', b'7', b'9', b'11', b'13', b'15', b'17', b'19', b'21', b'23', b'25', b'27', b'29', b'31', b'34', b'36', b'38', b'40', b'42', b'44', b'46', b'48', b'50', b'52', b'54', b'56', b'58', b'60', b'62', b'64', b'66', b'68', b'70', b'72', b'74', b'12', b'16', b'20', b'47', b'51', b'63', b'24', b'65', b'43', b'32', b'73', b'4', b'33',