In [5]:
import pandas as pd
from itertools import chain, combinations
from collections import defaultdict
import os
import csv
import ast
import sys

In [6]:
# Step 1: Reading and Processing CSV Files
def read_transactions(csv_folder):
    """Load every transaction stored in the ``.csv`` files of a folder.

    Each CSV row is expected to carry, in its first column, the Python-literal
    text of one transaction (for example ``"['milk', 'bread']"``); that text
    is parsed with ``ast.literal_eval``.

    Args:
        csv_folder: Path of the directory to scan. Only entries whose name
            ends with ``.csv`` are read; everything else is ignored.

    Returns:
        list: One parsed transaction per non-empty CSV row. Files are visited
        in sorted-name order so the result is reproducible across runs.

    Raises:
        OSError: If the folder or one of its CSV files cannot be read.
        ValueError, SyntaxError: If a first column is not a valid Python
            literal (propagated from ``ast.literal_eval``).
    """
    # The field-size cap is a module-wide csv setting, so set it once up front
    # instead of once per file. Rows holding a whole basket can be very long.
    csv.field_size_limit(1000000)

    transactions = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # transaction order (and everything derived from it) deterministic.
    for filename in sorted(os.listdir(csv_folder)):
        if not filename.endswith('.csv'):
            continue

        # `with` guarantees the handle is closed even if a row fails to parse;
        # newline='' is what the csv module expects for files it reads.
        with open(os.path.join(csv_folder, filename), newline='') as file:
            for row in csv.reader(file):
                # Blank lines come back as empty lists; skip them instead of
                # failing on row[0].
                if not row:
                    continue
                # Evaluate the string in the first column as a Python literal
                transactions.append(ast.literal_eval(row[0]))
    return transactions

In [7]:
# Step 2: Candidate Generation
def generate_candidate_itemsets(transactions, size=1):
    """Build every ``size``-item combination of the items seen in ``transactions``.

    Args:
        transactions: Iterable of transactions, each an iterable of hashable
            items.
        size: Number of items in each candidate itemset (default 1).

    Returns:
        set[tuple]: The candidate itemsets. Items inside each tuple follow a
        fixed order (ascending when the items are mutually comparable,
        otherwise ordered by ``repr``), so the same data always yields the
        same tuples.
    """
    all_items = {item for transaction in transactions for item in transaction}

    # Iterating a raw set gives an order that can change between interpreter
    # runs (string hashing is randomised), which would make the orientation
    # of each tuple -- ('a', 'b') vs ('b', 'a') -- unpredictable. Fix the
    # order before combining.
    try:
        ordered_items = sorted(all_items)
    except TypeError:
        # Mixed item types that cannot be compared with each other.
        ordered_items = sorted(all_items, key=repr)

    return set(combinations(ordered_items, size))

def generate_candidate_itemsets_from_previous(frequent_itemsets, size):
    """Derive ``size``-item candidates from already-known frequent itemsets.

    Candidates are combinations of the items that occur in
    ``frequent_itemsets``. When the input contains itemsets of length
    ``size - 1``, a candidate is kept only if every one of its
    ``(size - 1)``-item subsets is among them (the Apriori prune step: an
    itemset cannot be frequent if one of its subsets is not). When no such
    itemsets are supplied there is nothing to prune against and every
    combination is returned.

    Args:
        frequent_itemsets: Iterable of itemsets (tuples of items); a dict
            keyed by itemset works as well. Item order inside an itemset
            does not matter.
        size: Number of items in each candidate to generate.

    Returns:
        set[tuple]: Candidate itemsets with their items in a fixed order
        (ascending when comparable, otherwise ordered by ``repr``).
    """
    all_items = set()
    previous_level = set()
    for itemset in frequent_itemsets:
        all_items.update(itemset)
        if len(itemset) == size - 1:
            # frozenset makes the subset lookup independent of tuple order
            previous_level.add(frozenset(itemset))

    # Fix the item order so tuple orientation does not depend on the
    # (run-dependent) iteration order of a set.
    try:
        ordered_items = sorted(all_items)
    except TypeError:
        # Mixed item types that cannot be compared with each other.
        ordered_items = sorted(all_items, key=repr)

    all_combinations = combinations(ordered_items, size)
    if not previous_level:
        return set(all_combinations)

    return {
        candidate
        for candidate in all_combinations
        if all(
            frozenset(subset) in previous_level
            for subset in combinations(candidate, size - 1)
        )
    }

# Step 3: Support Calculation and Pruning
def calculate_support(transactions, candidates, min_support):
    """Compute the support of each candidate and keep the frequent ones.

    The support of a candidate is the fraction of transactions that contain
    all of its items.

    Args:
        transactions: Sized collection of transactions, each an iterable of
            hashable items.
        candidates: Iterable of candidate itemsets (hashable, e.g. tuples).
        min_support: Minimum fraction of transactions (0-1) a candidate must
            appear in to be kept.

    Returns:
        dict: Maps each candidate whose support is at least ``min_support``
        to that support value. Candidates matching no transaction are never
        included. An empty ``transactions`` yields an empty dict.
    """
    total_transactions = len(transactions)

    # Build each candidate's set once instead of once per transaction.
    candidate_sets = [(candidate, frozenset(candidate)) for candidate in candidates]

    candidate_counts = defaultdict(int)
    for transaction in transactions:
        # One set per transaction turns every subset test into hash lookups.
        basket = frozenset(transaction)
        for candidate, items in candidate_sets:
            if items <= basket:
                candidate_counts[candidate] += 1

    frequent = {}
    for candidate, count in candidate_counts.items():
        support = count / total_transactions
        if support >= min_support:
            frequent[candidate] = support
    return frequent

# Step 5: Saving the Frequent Itemsets
def save_frequent_itemsets_csv(frequent_itemsets, output_file):
    """Write the frequent itemsets and their supports to a CSV file.

    The file has two columns, ``Itemset`` (the itemset rendered as a list)
    and ``Support``, with one row per entry of ``frequent_itemsets`` and no
    index column.

    Args:
        frequent_itemsets: Dict mapping each itemset (an iterable of items)
            to its support value.
        output_file: Destination passed to ``DataFrame.to_csv``. When it is
            a path, any missing parent folder is created first.
    """
    # to_csv does not create folders; without this a missing output folder
    # would throw away the result of a long mining run. Only path-like
    # targets are touched so file-like objects keep working.
    if isinstance(output_file, (str, os.PathLike)):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Convert the frequent itemsets dictionary into (itemset-as-list, support) rows
    rows = [(list(itemset), support) for itemset, support in frequent_itemsets.items()]

    # Build the table and save it without the positional index
    itemsets_df = pd.DataFrame(rows, columns=['Itemset', 'Support'])
    itemsets_df.to_csv(output_file, index=False)

In [8]:
# Main Apriori Function
def apriori(csv_folder, max_length, min_support, output_file):
    """Mine frequent itemsets level by level and save them to a CSV file.

    Transactions are read from ``csv_folder``. For k = 1 .. ``max_length``
    the function generates k-item candidates, keeps those whose support is
    at least ``min_support``, and stops early as soon as a level produces no
    frequent itemset. Progress messages are printed for each level.

    Args:
        csv_folder: Folder holding the transaction CSV files.
        max_length: Largest itemset size to look for.
        min_support: Minimum fraction of transactions (0-1) an itemset must
            appear in to be kept.
        output_file: Destination of the resulting CSV.

    Returns:
        dict: Every frequent itemset found (all sizes) mapped to its support.
        The same content is written to ``output_file``.
    """
    transactions = read_transactions(csv_folder)
    all_frequent_itemsets = {}
    # Frequent itemsets of the level just completed (size k-1)
    previous_frequent_itemsets = {}

    for k in range(1, max_length + 1):

        print(f"== Generating {k} Itemsets ==")
        if k == 1:
            candidate_itemsets = generate_candidate_itemsets(transactions, size=k)
        else:
            # Every item of a frequent k-itemset also occurs in a frequent
            # (k-1)-itemset, so the previous level alone is enough. Passing
            # the cumulative dict would keep the item pool as large as the
            # frequent 1-items at every level.
            candidate_itemsets = generate_candidate_itemsets_from_previous(previous_frequent_itemsets, k)
        print(f"== {k} Frequent Itemsets generated ==")

        print(f"== Calculating support for {k} ==")
        frequent_itemsets = calculate_support(transactions, candidate_itemsets, min_support)

        print("== Calculated supported ==")
        if not frequent_itemsets:
            # No frequent itemset of size k means none of any larger size
            break

        all_frequent_itemsets.update(frequent_itemsets)
        previous_frequent_itemsets = frequent_itemsets
        print(f"== Finished {k} Frequent Itemsets ==")

    save_frequent_itemsets_csv(all_frequent_itemsets, output_file)
    return all_frequent_itemsets

# Run the Apriori algorithm
# max_length: largest itemset size apriori() will try to mine.
# min_support: minimum fraction of transactions an itemset must appear in.
max_length = 5
min_support = 0.05

# Reads every CSV in the chunk folder, mines the itemsets and writes them to
# a CSV whose file name embeds both parameters; the result dict is also kept
# in final_frequent_itemsets.
final_frequent_itemsets = apriori(
    csv_folder='../Dataset/basket_chunks/', # EDIT: point this at your folder of transaction-chunk CSV files
    max_length=max_length, 
    min_support=min_support, 
    output_file=f'../Frequent Itemsets/basket_{min_support}_{max_length}.csv'
)
print("Done")

== Generating 1 Itemsets ==
== 1 Frequent Itemsets generated ==
== Calculating support for 1 ==
== Calculated supported ==
== Finished 1 Frequent Itemsets ==
== Generating 2 Itemsets ==
== 2 Frequent Itemsets generated ==
== Calculating support for 2 ==
== Calculated supported ==
Done
