In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### 1)

In [65]:
df = pd.read_csv('basket.csv')
df.head()

Unnamed: 0,Basket_ID,Items
0,1,"bread, eggs, milk, cheese"
1,2,"beer, cheese, milk, diapers"
2,3,"milk, cheese, beer, bread, diapers"
3,4,"eggs, butter, bread"
4,5,"bread, diapers"


In [66]:
from itertools import combinations
from collections import defaultdict

df['Items'] = df['Items'].apply(lambda x: set(x.split(', ')))
transactions = df['Items'].tolist()

# Calculating frequency and support for single-item itemsets
item_counts = defaultdict(int)
num_transactions = len(transactions)

In [67]:
transactions

[{'bread', 'cheese', 'eggs', 'milk'},
 {'beer', 'cheese', 'diapers', 'milk'},
 {'beer', 'bread', 'cheese', 'diapers', 'milk'},
 {'bread', 'butter', 'eggs'},
 {'bread', 'diapers'},
 {'bread', 'diapers', 'milk'},
 {'beer', 'bread', 'cheese', 'diapers', 'eggs'},
 {'butter', 'diapers'},
 {'beer', 'butter', 'cheese'},
 {'bread', 'cheese', 'eggs', 'milk'},
 {'butter', 'diapers', 'milk'},
 {'butter', 'diapers', 'eggs', 'milk'},
 {'beer', 'bread', 'butter', 'diapers', 'milk'},
 {'butter', 'milk'},
 {'bread', 'butter', 'diapers', 'eggs', 'milk'},
 {'bread', 'butter'},
 {'beer', 'bread', 'diapers', 'eggs', 'milk'},
 {'beer', 'milk'},
 {'beer', 'bread', 'butter', 'diapers', 'eggs'},
 {'beer', 'bread', 'butter', 'milk'},
 {'beer', 'bread', 'cheese', 'eggs'},
 {'beer', 'eggs'},
 {'cheese', 'eggs', 'milk'},
 {'bread', 'butter', 'cheese', 'eggs'},
 {'beer', 'bread', 'eggs', 'milk'},
 {'bread', 'diapers', 'eggs', 'milk'},
 {'bread', 'butter', 'cheese'},
 {'butter', 'cheese', 'diapers', 'eggs'},
 {'bre

#### Frequency of each itemset

In [68]:
for transaction in transactions:
    for item in transaction:
        item_counts[item] += 1
item_counts

defaultdict(int,
            {'eggs': 16,
             'cheese': 11,
             'milk': 17,
             'bread': 20,
             'diapers': 15,
             'beer': 13,
             'butter': 14})

#### Support of each itemset

In [69]:
item_support = {item: count / num_transactions for item, count in item_counts.items()}
item_support

{'eggs': 0.5333333333333333,
 'cheese': 0.36666666666666664,
 'milk': 0.5666666666666667,
 'bread': 0.6666666666666666,
 'diapers': 0.5,
 'beer': 0.43333333333333335,
 'butter': 0.4666666666666667}

In [60]:
rules_association = []
for item1, item2 in combinations(item_counts.keys(), 2):
    
    supportitem_1 = item_support[item1]      # the support of individual items and pair
    supportitem_2 = item_support[item2]
    
    pair_count = sum(1 for transaction in transactions if {item1, item2}.issubset(transaction))  # the frequency and support of the chosen pair
    support_pair = pair_count / num_transactions

    if pair_count > 0:
       
        confidence1_to_2 = support_pair / supportitem_1  # Confidence is gotten from pairsupport(item2 | item1) and pairsupport(item1 | item2)
        confidence2_to_1 = support_pair / supportitem_2

        lift1_to_2 = confidence1_to_2 / supportitem_2   
        lift2_to_1 = confidence2_to_1 / supportitem_1


        rules_association.append((item1, item2, pair_count, support_pair, confidence1_to_2, lift1_to_2))
        rules_association.append((item2, item1, pair_count, support_pair, confidence2_to_1, lift2_to_1))

In [61]:
# Prepare data for reporting
frequencies = pd.DataFrame({'Item': item_counts.keys(), 'Frequency': item_counts.values()})
frequencies['Support'] = frequencies['Item'].map(item_support)
rules_df = pd.DataFrame(rules_association, columns=[
    'Item1', 'Item2', 'Pair_Count', 'Support Pair', 'Confidence', 'Lift'
])
frequencies

Unnamed: 0,Item,Frequency,Support
0,eggs,16,0.533333
1,cheese,11,0.366667
2,milk,17,0.566667
3,bread,20,0.666667
4,diapers,15,0.5
5,beer,13,0.433333
6,butter,14,0.466667


In [62]:
rules_df.head()

Unnamed: 0,Item1,Item2,Pair_Count,Support Pair,Confidence,Lift
0,eggs,cheese,7,0.233333,0.4375,1.193182
1,cheese,eggs,7,0.233333,0.636364,1.193182
2,eggs,milk,8,0.266667,0.5,0.882353
3,milk,eggs,8,0.266667,0.470588,0.882353
4,eggs,bread,12,0.4,0.75,1.125


In [None]:
### MBA analysis, not apriori

import pandas as pd
from itertools import combinations

# Load the dataset
file_path = "/mnt/data/basket.csv"
df = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
df.head()


from collections import defaultdict

# Convert the 'Items' column into a list of sets
df['Items'] = df['Items'].apply(lambda x: set(x.split(', ')))

# Create a list of all unique items
all_items = set(item for sublist in df['Items'] for item in sublist)

# Compute item frequencies
item_frequencies = defaultdict(int)
for items in df['Items']:
    for item in items:
        item_frequencies[item] += 1

# Convert to DataFrame for better visualization
item_freq_df = pd.DataFrame(item_frequencies.items(), columns=['Item', 'Frequency'])
item_freq_df = item_freq_df.sort_values(by='Frequency', ascending=False)

# Display the frequency of each itemset
import ace_tools as tools
tools.display_dataframe_to_user(name="Item Frequency", dataframe=item_freq_df)


# Compute Support of each item
total_transactions = len(df)

# Calculate support for each item
item_support = {item: freq / total_transactions for item, freq in item_frequencies.items()}

# Convert to DataFrame for better visualization
item_support_df = pd.DataFrame(item_support.items(), columns=['Item', 'Support'])
item_support_df = item_support_df.sort_values(by='Support', ascending=False)

# Display the support of each itemset
tools.display_dataframe_to_user(name="Item Support", dataframe=item_support_df)


# Generate all possible item pairs for association rules
pair_frequencies = defaultdict(int)
for items in df['Items']:
    for pair in combinations(items, 2):
        pair_frequencies[pair] += 1

# Compute confidence and lift for association rules
association_rules = []
for (A, B), freq_AB in pair_frequencies.items():
    support_AB = freq_AB / total_transactions
    confidence_A_to_B = support_AB / item_support[A]
    confidence_B_to_A = support_AB / item_support[B]
    lift_A_to_B = confidence_A_to_B / item_support[B]
    lift_B_to_A = confidence_B_to_A / item_support[A]

    association_rules.append([A, B, support_AB, confidence_A_to_B, lift_A_to_B])
    association_rules.append([B, A, support_AB, confidence_B_to_A, lift_B_to_A])

# Convert to DataFrame for better visualization
association_rules_df = pd.DataFrame(association_rules, columns=['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift'])
association_rules_df = association_rules_df.sort_values(by='Lift', ascending=False)

# Display the association rules with confidence and lift
tools.display_dataframe_to_user(name="Association Rules", dataframe=association_rules_df)



########## with libraries

# Convert the dataset into a binary transaction matrix
encoded_items = df['Items'].apply(lambda x: {item: 1 for item in x}).tolist()
encoded_df = pd.DataFrame(encoded_items).fillna(0).astype(int)

# Compute frequent itemsets using a minimum support threshold of 0.1
min_support = 0.1
total_transactions = len(encoded_df)
item_supports = encoded_df.mean()

frequent_items = item_supports[item_supports >= min_support].reset_index()
frequent_items.columns = ['Item', 'Support']

# Display the frequent itemsets
tools.display_dataframe_to_user(name="Frequent Itemsets", dataframe=frequent_items)

# Compute association rules
association_rules_list = []
for (A, B) in combinations(frequent_items['Item'], 2):
    support_A = item_supports[A]
    support_B = item_supports[B]
    support_AB = (encoded_df[A] & encoded_df[B]).mean()
    
    if support_AB >= min_support:
        confidence_A_to_B = support_AB / support_A
        confidence_B_to_A = support_AB / support_B
        lift_A_to_B = confidence_A_to_B / support_B
        lift_B_to_A = confidence_B_to_A / support_A

        association_rules_list.append([A, B, support_AB, confidence_A_to_B, lift_A_to_B])
        association_rules_list.append([B, A, support_AB, confidence_B_to_A, lift_B_to_A])

# Convert to DataFrame
association_rules_df = pd.DataFrame(association_rules_list, columns=['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift'])
association_rules_df = association_rules_df.sort_values(by='Lift', ascending=False)

# Display the association rules
tools.display_dataframe_to_user(name="Association Rules", dataframe=association_rules_df)



### 2)

#### Apriori algorithm

In [70]:
df1 = pd.read_csv('Market_Basket_Optimisation.csv')
df1.head()

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,


In [80]:
transactions = df1.apply(lambda row: row.dropna().tolist(), axis=1).tolist()

def apriori(transactions, min_support):
    def get_support_count(itemsets, transactions):
        support_counts = defaultdict(int)  #Calculating support count for each itemset
        for transaction in transactions:
            for item in itemsets:
                if item.issubset(transaction):
                    support_counts[item] += 1
        return support_counts

    single_items = {frozenset([item]) for transaction in transactions for item in transaction}
    #print(single_items)
    #single_items = {item for transaction in transactions for item in transaction}
    support_counts = get_support_count(single_items, transactions)
    #print(support_counts)
    
    frequent_items = {itemset: count for itemset, count in support_counts.items() 
                         if count / len(transactions) >= min_support}   #using the minimum support to filter..
    
    k = 2
    all_frequent_itemsets = [frequent_items]
    while frequent_items:
        candidates = set(
            [frozenset(item1.union(item2)) for item1 in frequent_items
             for item2 in frequent_items if len(item1.union(item2)) == k]
        )
        support_counts = get_support_count(candidates, transactions)
        frequent_items = {itemset: count for itemset, count in support_counts.items() 
                             if count / len(transactions) >= min_support}
        
        if frequent_itemsets:
            all_frequent_itemsets.append(frequent_items)
        
        k += 1
    
    return all_frequent_itemsets


In [96]:
transactions[:10]

[['burgers', 'meatballs', 'eggs'],
 ['chutney'],
 ['turkey', 'avocado'],
 ['mineral water', 'milk', 'energy bar', 'whole wheat rice', 'green tea'],
 ['low fat yogurt'],
 ['whole wheat pasta', 'french fries'],
 ['soup', 'light cream', 'shallot'],
 ['frozen vegetables', 'spaghetti', 'green tea'],
 ['french fries'],
 ['eggs', 'pet food']]

In [95]:
min_support = 0.02 
frequent_itemsets = apriori(transactions, min_support)
frequent_itemsets_cleaned[:10]

# frequent_itemsets_cleaned = [
#     {tuple(sorted(itemset)): count for itemset, count in level.items()}
#     for level in frequent_itemsets
# ]


[{'burgers'},
 {'burgers', 'eggs'},
 {'burgers', 'mineral water'},
 {'burgers', 'french fries'},
 {'burgers', 'spaghetti'},
 {'meatballs'},
 {'eggs'},
 {'eggs', 'mineral water'},
 {'eggs', 'milk'},
 {'eggs', 'green tea'}]

#### Improved Apriori algorithm

In [32]:
def improved_apriori(transactions, min_support):

    item_support = defaultdict(int)
    total_transactions = len(transactions)
    min_support_count = min_support * total_transactions

    for transaction in transactions:
        for item in transaction:
            item_support[frozenset([item])] += 1

    current_itemsets = {itemset: count for itemset, count in item_support.items() if count >= min_support_count} #removing 1 itemsets
    frequent_itemsets = list(current_itemsets.items())

    k = 2
    while current_itemsets:
        candidate_support = defaultdict(int)
        current_items = list(current_itemsets.keys())

        for i in range(len(current_items)):
            for j in range(i + 1, len(current_items)):
                candidate = current_items[i] | current_items[j]
                if len(candidate) == k:
                    for transaction in transactions:
                        if candidate.issubset(transaction):
                            candidate_support[candidate] += 1

        # remove the non-frequent candidates
        current_itemsets = {itemset: count for itemset, count in candidate_support.items() if count >= min_support_count}
        frequent_itemsets.extend(current_itemsets.items())

        k += 1

    return frequent_itemsets


frequent_itemsets_improved = improved_apriori(transactions, min_support)
frequent_itemsets_improved[:10]


[(frozenset({'burgers'}), 654),
 (frozenset({'meatballs'}), 157),
 (frozenset({'eggs'}), 1348),
 (frozenset({'turkey'}), 469),
 (frozenset({'avocado'}), 249),
 (frozenset({'mineral water'}), 1787),
 (frozenset({'milk'}), 972),
 (frozenset({'energy bar'}), 203),
 (frozenset({'whole wheat rice'}), 439),
 (frozenset({'green tea'}), 990)]

In [33]:
def generate_association_rules(frequent_itemsets, min_confidence):
    rules = []
    itemset_support = {itemset: support for itemset, support in frequent_itemsets}

    for itemset, support in frequent_itemsets:
        if len(itemset) > 1:
            for item in map(frozenset, combinations(itemset, len(itemset) - 1)):
                consequent = itemset - item
                if consequent and item in itemset_support:
                    confidence = support / itemset_support[item]
                    if confidence >= min_confidence:
                        rules.append((item, consequent, confidence))
    return rules


min_conf = 0.3
association_rules = generate_association_rules(frequent_itemsets_improved, min_conf)
association_rules[:10]


[(frozenset({'burgers'}), frozenset({'eggs'}), 0.3302752293577982),
 (frozenset({'milk'}), frozenset({'mineral water'}), 0.37037037037037035),
 (frozenset({'whole wheat rice'}),
  frozenset({'mineral water'}),
  0.3439635535307517),
 (frozenset({'low fat yogurt'}),
  frozenset({'mineral water'}),
  0.31239092495637),
 (frozenset({'soup'}), frozenset({'mineral water'}), 0.45646437994722955),
 (frozenset({'frozen vegetables'}),
  frozenset({'mineral water'}),
  0.3748251748251748),
 (frozenset({'spaghetti'}), frozenset({'mineral water'}), 0.3430321592649311),
 (frozenset({'cooking oil'}),
  frozenset({'mineral water'}),
  0.39425587467362927),
 (frozenset({'shrimp'}), frozenset({'mineral water'}), 0.32897196261682243),
 (frozenset({'chocolate'}), frozenset({'mineral water'}), 0.32113821138211385)]

In [None]:
######## apriori manualy
from collections import defaultdict

# Convert dataset into a list of sets for easier processing
transactions_new = df_new.apply(lambda row: set(row.dropna()), axis=1).tolist()

# Compute individual item frequencies
item_counts = defaultdict(int)
for transaction in transactions_new:
    for item in transaction:
        item_counts[item] += 1

# Calculate support for individual items
total_transactions_new = len(transactions_new)
min_support = 0.05  # Minimum support threshold

item_supports = {item: count / total_transactions_new for item, count in item_counts.items() if count / total_transactions_new >= min_support}

# Convert to DataFrame
frequent_items_df = pd.DataFrame(item_supports.items(), columns=['Item', 'Support'])
frequent_items_df = frequent_items_df.sort_values(by='Support', ascending=False)

# Display the frequent itemsets
tools.display_dataframe_to_user(name="Frequent Itemsets (Manual Apriori)", dataframe=frequent_items_df)

# Compute association rules
association_rules_list = []
for (A, B) in combinations(item_supports.keys(), 2):
    support_A = item_supports[A]
    support_B = item_supports[B]
    support_AB = sum(1 for t in transactions_new if A in t and B in t) / total_transactions_new

    if support_AB >= min_support:
        confidence_A_to_B = support_AB / support_A
        confidence_B_to_A = support_AB / support_B
        lift_A_to_B = confidence_A_to_B / support_B
        lift_B_to_A = confidence_B_to_A / support_A

        association_rules_list.append([A, B, support_AB, confidence_A_to_B, lift_A_to_B])
        association_rules_list.append([B, A, support_AB, confidence_B_to_A, lift_B_to_A])

# Convert to DataFrame
association_rules_df_new = pd.DataFrame(association_rules_list, columns=['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift'])
association_rules_df_new = association_rules_df_new.sort_values(by='Lift', ascending=False)

# Display the association rules
tools.display_dataframe_to_user(name="Association Rules (Manual Apriori)", dataframe=association_rules_df_new)


####### apriri with libraries

pip install mlxtend


# Attempting to use the `mlxtend` library again for Apriori implementation
try:
    from mlxtend.frequent_patterns import apriori, association_rules
    from mlxtend.preprocessing import TransactionEncoder

    # Convert the dataset into a list of transactions
    transactions_new = df_new.apply(lambda row: [item for item in row if isinstance(item, str)], axis=1).tolist()

    # Transform the dataset using TransactionEncoder
    te = TransactionEncoder()
    te_ary = te.fit(transactions_new).transform(transactions_new)
    df_encoded_new = pd.DataFrame(te_ary, columns=te.columns_)

    # Apply Apriori algorithm to find frequent itemsets with a minimum support of 0.05
    frequent_itemsets_new = apriori(df_encoded_new, min_support=0.05, use_colnames=True)

    # Generate association rules based on lift
    rules_new = association_rules(frequent_itemsets_new, metric="lift", min_threshold=1.0)

    # Display the frequent itemsets
    tools.display_dataframe_to_user(name="Frequent Itemsets (Apriori)", dataframe=frequent_itemsets_new)

    # Display the association rules
    tools.display_dataframe_to_user(name="Association Rules (Apriori)", dataframe=rules_new)

except ModuleNotFoundError:
    print("The `mlxtend` library is not available in this environment. Please install it using `pip install mlxtend`.")




### 3) Eclat

In [52]:
def build_vert_data(transactions):
    vertical_data = defaultdict(set)
    for tid, transaction in enumerate(transactions):
        for item in transaction:
            vertical_data[item].add(tid)
    return vertical_data


def eclat(prefix, items, vertical_data, min_support_count, frequent_itemsets):
    while items:
        item = items.pop()
        new_prefix = prefix | {item}
        tid_set = vertical_data[item]
        
        support = len(tid_set)
        if support >= min_support_count:
            frequent_itemsets.append((new_prefix, support))
            new_items = [i for i in items if len(tid_set & vertical_data[i]) >= min_support_count]  # Generating new itemsets
            new_vertical_data = {i: tid_set & vertical_data[i] for i in new_items}
            eclat(new_prefix, new_items, new_vertical_data, min_support_count, frequent_itemsets)

In [53]:
def eclat_run(transactions, min_support):
    total_transactions = len(transactions)
    min_support_count = min_support * total_transactions
    vertical_data = build_vert_data(transactions)
    frequent_itemsets = []
    J = set(vertical_data.keys())
    eclat(set(), J, vertical_data, min_support_count, frequent_itemsets)
    return frequent_itemsets

frequent_itemsets = eclat_run(transactions, min_support)
frequent_itemsets[:10]


[({'ham'}, 199),
 ({'champagne'}, 351),
 ({'milk'}, 972),
 ({'ground beef', 'milk'}, 165),
 ({'milk', 'mineral water'}, 360),
 ({'milk', 'spaghetti'}, 266),
 ({'french fries', 'milk'}, 178),
 ({'eggs', 'milk'}, 231),
 ({'chocolate', 'milk'}, 241),
 ({'frozen vegetables', 'milk'}, 177)]

In [None]:
association_rules_eclat = generate_association_rules(frequent_itemsets_eclat, min_confidence)
association_rules_eclat[:10]

In [90]:
from collections import defaultdict

# Convert transactions into vertical format
def create_vertical_format(transactions):
    item_tids = defaultdict(set)
    for tid, transaction in enumerate(transactions):
        for item in transaction:
            item_tids[item].add(tid)
    return item_tids

# Eclat algorithm to find frequent itemsets
def eclat(transactions, min_support):
    # Convert transactions to vertical format
    item_tids = create_vertical_format(transactions)
   # print(item_tids)
    n_transactions = len(transactions)
    frequent_itemsets = []
    
    # Function to calculate support based on TID sets
    def get_support(tid_set):
        return len(tid_set) / n_transactions
    
    # Depth-first search (DFS) style function to mine itemsets
    def dfs(prefix, items, k):
        for item, tids in items.items():
            # Create a new itemset by combining the current prefix with the new item
            new_itemset = prefix | {item}
            support = get_support(tids)
            
            # If the support of the new itemset meets the threshold, add it to the frequent itemsets
            if support >= min_support:
                frequent_itemsets.append((new_itemset, support))
                
                # Recursively mine larger itemsets by intersecting TID sets
                # Create a new dictionary of items with intersected TID sets
                new_items = {
                    other_item: other_tids & tids
                    for other_item, other_tids in items.items()
                    if other_item > item  # Avoid duplicates
                }
                
                dfs(new_itemset, new_items, k + 1)
    
    # Start DFS for each item in the vertical format
    dfs(set(), item_tids, 1)
    
    return frequent_itemsets

# Example Usage
transactions = df1.apply(lambda row: row.dropna().tolist(), axis=1).tolist()
min_support = 0.02
frequent_itemsets = eclat(transactions, min_support)

# Cleaned frequent itemsets
frequent_itemsets_cleaned = [itemset for itemset, support in frequent_itemsets]


In [94]:
frequent_itemsets[:10]

[({'burgers'}, 0.0872),
 ({'burgers', 'eggs'}, 0.0288),
 ({'burgers', 'mineral water'}, 0.0244),
 ({'burgers', 'french fries'}, 0.022),
 ({'burgers', 'spaghetti'}, 0.021466666666666665),
 ({'meatballs'}, 0.020933333333333335),
 ({'eggs'}, 0.17973333333333333),
 ({'eggs', 'mineral water'}, 0.05093333333333333),
 ({'eggs', 'milk'}, 0.0308),
 ({'eggs', 'green tea'}, 0.025466666666666665)]

In [99]:
#### Improvement
from collections import defaultdict

# Convert transactions into vertical format
def create_vertical_format(transactions):
    item_tids = defaultdict(set)
    for tid, transaction in enumerate(transactions):
        for item in transaction:
            item_tids[item].add(tid)
    return item_tids

# Eclat algorithm to find frequent itemsets
def eclat(transactions, min_support):
    # Convert transactions to vertical format
    item_tids = create_vertical_format(transactions)
    n_transactions = len(transactions)
    frequent_itemsets = []

    # Function to calculate support based on TID sets
    def get_support(tid_set):
        return len(tid_set) / n_transactions

    # Depth-first search (DFS) function for mining itemsets
    def extend_itemset(prefix, prefix_tid_set, candidates):
        # Iterate through candidates in decreasing order of support
        for item, tids in sorted(candidates.items(), key=lambda x: -len(x[1])):
            # Create the new candidate itemset
            new_itemset = prefix | {item}
            new_tid_set = prefix_tid_set & tids  # Transaction cover (intersection)
            support = get_support(new_tid_set)

            # Retain the itemset if it meets the minimum support threshold
            if support >= min_support:
                frequent_itemsets.append((new_itemset, support))

                # Prune and prepare new candidates for recursive extension
                new_candidates = {
                    other_item: other_tids & new_tid_set
                    for other_item, other_tids in candidates.items()
                    if other_item > item and len(other_tids & new_tid_set) >= min_support * n_transactions
                }

                # Recursively extend the new itemset
                extend_itemset(new_itemset, new_tid_set, new_candidates)

    # Start DFS with the empty itemset
    initial_candidates = {
        item: tids for item, tids in item_tids.items()
        if len(tids) >= min_support * n_transactions
    }
    extend_itemset(set(), set(range(n_transactions)), initial_candidates)

    return frequent_itemsets

# Example Usage
transactions = df1.apply(lambda row: row.dropna().tolist(), axis=1).tolist()
min_support = 0.02
frequent_itemsets = eclat(transactions, min_support)

In [None]:



# Re-import necessary libraries after execution state reset
import itertools
import pandas as pd
import ace_tools as tools

# Given dataset
data = {
    'beer':     [1, 1, 1, 0, 0, 1],
    'bread':    [0, 1, 0, 1, 1, 1],
    'icecream': [0, 0, 1, 0, 1, 0],
    'milk':     [1, 0, 0, 1, 1, 1],
    'pampers':  [1, 1, 1, 0, 0, 1],
    'pizza':    [1, 1, 1, 0, 0, 0]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Define minimum support threshold
min_support = 3

# Step 1: Calculate support for individual items (1-itemsets)
support_count = df.sum(axis=0)

# Filter frequent 1-itemsets
frequent_1_itemsets = {item: count for item, count in support_count.items() if count >= min_support}

# Step 2: Generate candidate 2-itemsets from frequent 1-itemsets
frequent_1_items = list(frequent_1_itemsets.keys())
candidate_2_itemsets = list(itertools.combinations(frequent_1_items, 2))

# Calculate support for 2-itemsets
support_2_itemsets = {}
for itemset in candidate_2_itemsets:
    support_2_itemsets[itemset] = df[list(itemset)].all(axis=1).sum()

# Filter frequent 2-itemsets
frequent_2_itemsets = {itemset: count for itemset, count in support_2_itemsets.items() if count >= min_support}

# Step 3: Generate candidate 3-itemsets from frequent 2-itemsets
frequent_2_items = list(frequent_2_itemsets.keys())
candidate_3_itemsets = list(itertools.combinations(set(itertools.chain(*frequent_2_items)), 3))

# Calculate support for 3-itemsets
support_3_itemsets = {}
for itemset in candidate_3_itemsets:
    support_3_itemsets[itemset] = df[list(itemset)].all(axis=1).sum()

# Filter frequent 3-itemsets
frequent_3_itemsets = {itemset: count for itemset, count in support_3_itemsets.items() if count >= min_support}

# Combine frequent itemsets
frequent_itemsets = {**frequent_1_itemsets, **frequent_2_itemsets, **frequent_3_itemsets}

# Display results
df_frequent_itemsets = pd.DataFrame.from_dict(frequent_itemsets, orient='index', columns=['Support'])
tools.display_dataframe_to_user(name="Frequent Itemsets", dataframe=df_frequent_itemsets)


In [None]:
# Implementing Eclat algorithm for frequent itemset mining

# Convert dataset to transaction ID (TID) format for Eclat
tid_lists = {item: set(df[df[item] == 1].index) for item in df.columns}

# Define a function for Eclat recursion
def eclat(prefix, tid_list, items, min_support):
    """
    Recursive function to generate frequent itemsets using Eclat.
    """
    frequent_itemsets = {}
    
    for i in range(len(items)):
        item = items[i]
        new_tid_list = tid_list & tid_lists[item]  # Compute transaction intersection
        support = len(new_tid_list)
        
        if support >= min_support:
            new_itemset = prefix + (item,)
            frequent_itemsets[new_itemset] = support
            frequent_itemsets.update(eclat(new_itemset, new_tid_list, items[i+1:], min_support))
    
    return frequent_itemsets

# Run Eclat algorithm
eclat_frequent_itemsets = eclat((), set(range(len(df))), list(tid_lists.keys()), min_support)

# Convert to DataFrame for display
df_eclat_itemsets = pd.DataFrame.from_dict(eclat_frequent_itemsets, orient='index', columns=['Support'])
tools.display_dataframe_to_user(name="Eclat Frequent Itemsets", dataframe=df_eclat_itemsets)


In [None]:
######## eclat manually

# Implementing the Eclat algorithm manually

# Function to find frequent itemsets using Eclat
def eclat(transactions, min_support=0.05):
    item_sets = defaultdict(set)
    total_transactions = len(transactions)
    
    # Create item transaction sets
    for tid, transaction in enumerate(transactions):
        for item in transaction:
            item_sets[item].add(tid)

    # Calculate support for each itemset
    frequent_itemsets = {item: len(tids) / total_transactions for item, tids in item_sets.items() if len(tids) / total_transactions >= min_support}

    return frequent_itemsets

# Run Eclat on the dataset
min_support = 0.05  # Minimum support threshold
frequent_itemsets_eclat = eclat(transactions_new, min_support)

# Convert to DataFrame
frequent_items_eclat_df = pd.DataFrame(frequent_itemsets_eclat.items(), columns=['Item', 'Support'])
frequent_items_eclat_df = frequent_items_eclat_df.sort_values(by='Support', ascending=False)

# Display the frequent itemsets found using Eclat
tools.display_dataframe_to_user(name="Frequent Itemsets (Eclat)", dataframe=frequent_items_eclat_df)


##### with libraries

from itertools import combinations

# Function to find frequent itemsets using Eclat with pairwise transactions
def eclat_library_based(transactions, min_support=0.05):
    item_sets = defaultdict(set)
    total_transactions = len(transactions)
    
    # Create item transaction sets
    for tid, transaction in enumerate(transactions):
        for item in transaction:
            item_sets[item].add(tid)

    # Generate itemsets of size 2 (pairwise combinations)
    frequent_itemsets = {item: len(tids) / total_transactions for item, tids in item_sets.items() if len(tids) / total_transactions >= min_support}
    
    # Generate larger itemsets (Eclat works recursively)
    pair_sets = defaultdict(set)
    for (item1, tids1), (item2, tids2) in combinations(item_sets.items(), 2):
        common_tids = tids1 & tids2  # Intersection of transactions containing both items
        if len(common_tids) / total_transactions >= min_support:
            pair_sets[(item1, item2)] = common_tids

    # Convert to DataFrame for display
    frequent_pairsets = {pair: len(tids) / total_transactions for pair, tids in pair_sets.items()}
    
    return frequent_itemsets, frequent_pairsets

# Run Eclat on the dataset
min_support = 0.05  # Minimum support threshold
frequent_itemsets_eclat, frequent_pairsets_eclat = eclat_library_based(transactions_new, min_support)

# Convert single item frequent sets to DataFrame
frequent_items_eclat_df = pd.DataFrame(frequent_itemsets_eclat.items(), columns=['Item', 'Support'])
frequent_items_eclat_df = frequent_items_eclat_df.sort_values(by='Support', ascending=False)

# Convert frequent pairs to DataFrame
frequent_pairsets_eclat_df = pd.DataFrame(frequent_pairsets_eclat.items(), columns=['Item Pair', 'Support'])
frequent_pairsets_eclat_df = frequent_pairsets_eclat_df.sort_values(by='Support', ascending=False)

# Display the frequent itemsets found using Eclat
tools.display_dataframe_to_user(name="Frequent Itemsets (Eclat - Single Items)", dataframe=frequent_items_eclat_df)

# Display the frequent item pairs found using Eclat
tools.display_dataframe_to_user(name="Frequent Itemsets (Eclat - Pairs)", dataframe=frequent_pairsets_eclat_df)


