In [None]:
import pandas as pd

# Read the cleaned product-categorization CSV into a DataFrame.
# NOTE(review): the path begins with "..dataset/" rather than "../dataset/" --
# confirm this is the intended directory name.
file_path = "..dataset/Product Categorization - DATA602ProjectCleanedNew 2.csv"
df = pd.read_csv(file_path)

# Keep only the three columns used by the grouping steps below.
df = df[
    [
        'InvoiceNo',
        'Description',
        'Country',
    ]
]

# One row per (Country, InvoiceNo) pair; the Description column holds the
# list of product descriptions belonging to that pair.
transactions_by_country = (
    df.groupby(['Country', 'InvoiceNo'])['Description']
    .apply(list)
    .reset_index()
)

# One entry per InvoiceNo. Because the Description values are already lists,
# each entry is a list of lists; the vertical-format conversion flattens them.
transactions = (
    transactions_by_country.groupby('InvoiceNo')['Description']
    .apply(list)
    .values
    .tolist()
)

# Convert transactions into vertical format
def transactions_to_vertical_format(transactions):
    """Build an item -> transaction-index mapping (vertical format).

    Each transaction is iterated element by element. An element that is
    itself a ``list`` is expanded one level and each of its members is
    treated as an item; any other element is treated as a single item.
    Every item is converted with ``str`` before being used as a key.

    Args:
        transactions: Sequence of transactions, each an iterable of items
            or of lists of items.

    Returns:
        A dict mapping each item string to the set of positional indices
        of the transactions in which it appears. Keys are in order of
        first appearance.
    """
    tid_sets = {}
    for tid, basket in enumerate(transactions):
        for entry in basket:
            # A list entry contributes each of its members; anything else
            # counts as one item.
            members = entry if isinstance(entry, list) else (entry,)
            for member in members:
                tid_sets.setdefault(str(member), set()).add(tid)
    return tid_sets

# Function to perform Eclat algorithm and find frequent itemsets
def eclat(vertical_format, min_support_count, num_transactions):
    """Find frequent itemsets with the Eclat (tidset-intersection) algorithm.

    Single items whose tidset size reaches ``min_support_count`` are reported
    first, in the iteration order of ``vertical_format``. Larger itemsets are
    then grown depth-first: two itemsets sharing a prefix are combined by
    intersecting their tidsets, and the combination is kept when the
    intersection is non-empty and still reaches ``min_support_count``.

    Args:
        vertical_format: Mapping of item -> set of transaction indices
            containing that item.
        min_support_count: Minimum number of transactions an itemset must
            appear in to be reported (may be fractional).
        num_transactions: Total number of transactions; the denominator used
            to turn a count into a relative support.

    Returns:
        A list of ``(itemset, support)`` pairs, where ``itemset`` is a tuple
        of items and ``support`` is ``count / num_transactions``. Returns an
        empty list when ``num_transactions`` is not positive.
    """
    # Without transactions nothing can be frequent; this also avoids a
    # division by zero when computing relative support.
    if num_transactions <= 0:
        return []

    # Frequent 1-itemsets paired with their tidsets, in input order.
    frequent_items = [
        ((item,), tids)
        for item, tids in vertical_format.items()
        if len(tids) >= min_support_count
    ]
    frequent_itemsets = [
        (itemset, len(tids) / num_transactions)
        for itemset, tids in frequent_items
    ]

    def _extend(equivalence_class):
        """Grow every itemset in a shared-prefix class by one item."""
        for position, (prefix, prefix_tids) in enumerate(equivalence_class):
            next_class = []
            for other, other_tids in equivalence_class[position + 1:]:
                shared = prefix_tids & other_tids
                # Require at least one common transaction so that a zero or
                # negative threshold cannot yield itemsets that never occur.
                if shared and len(shared) >= min_support_count:
                    next_class.append((prefix + other[-1:], shared))
            for itemset, tids in next_class:
                frequent_itemsets.append((itemset, len(tids) / num_transactions))
            if next_class:
                _extend(next_class)

    _extend(frequent_items)
    return frequent_itemsets

# Total number of transactions; used as the denominator for relative support
num_transactions = len(transactions)

# Minimum support threshold: 2% of transactions, expressed as a count
min_support = 0.02
min_support_count = min_support * num_transactions

# Item -> set of transaction indices (vertical format)
vertical_format = transactions_to_vertical_format(transactions)

# Mine the frequent itemsets
frequent_itemsets = eclat(
    vertical_format,
    min_support_count,
    num_transactions,
)

# Report each frequent itemset together with its relative support
for itemset, support in frequent_itemsets:
    print(f"Itemset: {itemset}, Support: {support}")


Itemset: ('HAND WARMER UNION JACK',), Support: 0.02534533012292485
Itemset: ('FELTCRAFT PRINCESS CHARLOTTE DOLL',), Support: 0.02382461031554936
Itemset: ('HOME BUILDING BLOCK WORD',), Support: 0.0423267013052845
Itemset: ('LOVE BUILDING BLOCK WORD',), Support: 0.03402610569002661
Itemset: ('DOORMAT NEW ENGLAND',), Support: 0.03199847928019262
Itemset: ('JAM MAKING SET WITH JARS',), Support: 0.04821949055886453
Itemset: ('ALARM CLOCK BAKELIKE GREEN',), Support: 0.04543150424534279
Itemset: ('WOOD 2 DRAWER CABINET WHITE FINISH',), Support: 0.031111392725890256
Itemset: ('WOODEN PICTURE FRAME WHITE FINISH',), Support: 0.0506906602458497
Itemset: ('WOODEN FRAME ANTIQUE WHITE',), Support: 0.04669877075148904
Itemset: ('JUMBO BAG PINK POLKADOT',), Support: 0.04695222405271829
Itemset: ('JUMBO  BAG BAROQUE BLACK WHITE',), Support: 0.035356735521480165
Itemset: ('STRAWBERRY CHARLOTTE BAG',), Support: 0.027182866556836903
Itemset: ('LUNCH BAG RED RETROSPOT',), Support: 0.0710936509948042
Items