In [1]:
import pandas as pd
from itertools import chain, combinations
from collections import defaultdict
import os
import csv
import ast
import sys

In [2]:
# Step 1: Reading and Processing CSV Files one at a time
def read_transactions(csv_folder):
    """Yield the transactions of each CSV file in ``csv_folder``, one file at a time.

    For every file whose name ends in ``.csv``, the first cell of each
    non-blank row is parsed with ``ast.literal_eval`` and the parsed values
    are collected into one list per file.

    Args:
        csv_folder: Path of the directory to scan for ``.csv`` files.

    Yields:
        list: The parsed first-column values of one CSV file, in row order.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            cell is not a valid Python literal.
    """
    # Process-wide csv setting; it does not depend on the file, so set it once
    # instead of once per file.
    csv.field_size_limit(1000000)

    # os.listdir returns names in arbitrary order; sorting makes runs reproducible.
    for filename in sorted(os.listdir(csv_folder)):
        if not filename.endswith('.csv'):
            continue
        # newline='' lets the csv module do its own newline handling, so quoted
        # fields containing line breaks are parsed correctly.
        with open(os.path.join(csv_folder, filename), newline='') as file:
            # csv.reader returns [] for a blank line; skip those rather than
            # failing on row[0].
            transactions = [ast.literal_eval(row[0]) for row in csv.reader(file) if row]
        # Yield after the file is closed so no handle stays open while the
        # consumer works on this chunk.
        yield transactions

In [3]:
def collect_all_unique_items(csv_folder):
    """Return the set of distinct items seen in any transaction under ``csv_folder``.

    Transactions are pulled chunk by chunk from ``read_transactions`` and every
    element of every transaction is added to the result.

    Args:
        csv_folder: Directory handed to ``read_transactions``.

    Returns:
        set: All distinct items found.
    """
    return {
        item
        for chunk in read_transactions(csv_folder)
        for transaction in chunk
        for item in transaction
    }

In [4]:
# Step 2: Candidate Generation
def generate_candidate_itemsets(unique_items):
    """Build the size-1 candidates by wrapping each item in a 1-tuple.

    Args:
        unique_items: Iterable of hashable items.

    Returns:
        set: One ``(item,)`` tuple per distinct item.
    """
    return {(single_item,) for single_item in unique_items}

def generate_candidate_itemsets_from_previous(frequent_itemsets, size):
    """Generate the candidate itemsets of length ``size`` from known frequent itemsets.

    Candidates are combinations of the items that occur in ``frequent_itemsets``.
    Items are sorted first so each candidate tuple has one canonical ordering
    that does not depend on set iteration order. For ``size >= 2`` a candidate
    is kept only if every one of its ``size - 1`` sub-itemsets is itself in
    ``frequent_itemsets`` (compared regardless of item order). An itemset with
    an infrequent subset cannot be frequent, so dropping it does not change
    which itemsets are eventually found frequent.

    Args:
        frequent_itemsets: Iterable of itemsets (e.g. a dict keyed by tuples);
            itemsets of any length may be mixed.
        size: Length of the candidates to build.

    Returns:
        set: Candidate tuples of length ``size``.

    Raises:
        ValueError: If ``size`` is negative (raised by ``itertools.combinations``).
    """
    # Order-insensitive view of what is already known to be frequent.
    frequent_sets = {frozenset(itemset) for itemset in frequent_itemsets}
    all_items = set().union(*frequent_sets)

    try:
        ordered_items = sorted(all_items)
    except TypeError:
        # Items of mixed, mutually unorderable types: fall back to a stable textual order.
        ordered_items = sorted(all_items, key=repr)

    candidates = combinations(ordered_items, size)
    if size < 2:
        # No non-empty sub-itemsets to check against.
        return set(candidates)

    return {
        candidate
        for candidate in candidates
        if all(
            frozenset(subset) in frequent_sets
            for subset in combinations(candidate, size - 1)
        )
    }

# Step 3: Support Calculation
def calculate_support(transactions, candidates):
    """Count, for each candidate, the transactions that contain all of its items.

    Args:
        transactions: Iterable of transactions, each an iterable of hashable items.
        candidates: Iterable of candidate itemsets (hashable, e.g. tuples).

    Returns:
        defaultdict(int): Maps each candidate contained in at least one
        transaction to the number of transactions containing it. Candidates
        that never match are not stored as keys.
    """
    candidate_counts = defaultdict(int)

    # Build each candidate's item set once instead of once per transaction.
    # Materialising the list also means a one-shot iterator of candidates is
    # checked against every transaction, not just the first.
    candidate_sets = [(candidate, frozenset(candidate)) for candidate in candidates]
    if not candidate_sets:
        return candidate_counts

    for transaction in transactions:
        # One set conversion per transaction, shared by all candidates.
        transaction_items = set(transaction)
        for candidate, candidate_items in candidate_sets:
            if candidate_items <= transaction_items:
                candidate_counts[candidate] += 1
    return candidate_counts

# Step 4: Pruning Candidates
def prune_candidates(candidate_counts, total_transactions, min_support):
    """Keep the candidates whose relative support reaches ``min_support``.

    Args:
        candidate_counts: Mapping of candidate -> number of transactions containing it.
        total_transactions: Divisor used to turn each count into a support value.
        min_support: Inclusive lower bound on support for a candidate to be kept.

    Returns:
        dict: Maps each surviving candidate to ``count / total_transactions``.
    """
    # Lazily pair every candidate with its support, then filter in one pass.
    with_support = (
        (itemset, occurrences / total_transactions)
        for itemset, occurrences in candidate_counts.items()
    )
    return {
        itemset: ratio
        for itemset, ratio in with_support
        if not ratio < min_support
    } if False else {
        itemset: ratio
        for itemset, ratio in with_support
        if ratio >= min_support
    }

# Step 5: Saving the Frequent Itemsets
def save_frequent_itemsets_csv(frequent_itemsets, output_file):
    """Write the frequent itemsets and their supports to a CSV file.

    The CSV has two columns, ``Itemset`` (the itemset converted to a list) and
    ``Support``, with one row per entry of ``frequent_itemsets`` and no index
    column. When ``output_file`` is a filesystem path, its parent directory is
    created if it does not exist yet.

    Args:
        frequent_itemsets: Mapping of itemset -> support.
        output_file: Destination passed to ``DataFrame.to_csv``.
    """
    # One (itemset-as-list, support) row per frequent itemset.
    itemsets_list = [(list(itemset), support) for itemset, support in frequent_itemsets.items()]
    itemsets_df = pd.DataFrame(itemsets_list, columns=['Itemset', 'Support'])

    # to_csv does not create missing directories. Only do this for real paths,
    # since to_csv is also handed non-path destinations by some callers.
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_file))
        if parent_dir:  # empty for a bare filename in the current directory
            os.makedirs(parent_dir, exist_ok=True)

    itemsets_df.to_csv(output_file, index=False)

In [5]:
# Main Apriori Function
def apriori(csv_folder, max_length, min_support, output_file):
    """Mine frequent itemsets level by level from the CSV chunks in ``csv_folder``.

    For each length ``k`` from 1 to ``max_length``: candidates are generated,
    their occurrences are counted across every chunk yielded by
    ``read_transactions``, and candidates whose support is below
    ``min_support`` are dropped. The loop stops early once a level has no
    candidates or no frequent itemsets. The accumulated result is written with
    ``save_frequent_itemsets_csv`` and also returned.

    Args:
        csv_folder: Directory containing the transaction CSV chunks.
        max_length: Largest itemset length to consider.
        min_support: Minimum fraction of transactions an itemset must appear in.
        output_file: Destination passed to ``save_frequent_itemsets_csv``.

    Returns:
        dict: Maps every frequent itemset found (all lengths) to its support.
    """
    all_unique_items = collect_all_unique_items(csv_folder)
    print(f'Unique Items: {all_unique_items}')
    all_frequent_itemsets = {}

    for k in range(1, max_length + 1):

        print(f"== Generating {k} Itemsets ==")
        if k == 1:
            candidate_itemsets = generate_candidate_itemsets(all_unique_items)
        else:
            candidate_itemsets = generate_candidate_itemsets_from_previous(all_frequent_itemsets, k)
        print(f'Candidate Itemsets for k={k}: {candidate_itemsets}')
        # Only candidates exist at this point; frequent itemsets are known
        # after the support pass below.
        print(f"== {k} Candidate Itemsets generated ==")

        if not candidate_itemsets:
            # Nothing to count: skip the full re-read of every chunk. No longer
            # itemset can be frequent either.
            break

        candidate_counts = defaultdict(int)
        total_transactions = 0

        print(f"== Calculating support for k={k} ==")
        for transactions in read_transactions(csv_folder):
            # Count per chunk, then merge into the global tally.
            local_counts = calculate_support(transactions, candidate_itemsets)
            for candidate, count in local_counts.items():
                candidate_counts[candidate] += count
            total_transactions += len(transactions)

        # Calculate global support and prune
        frequent_itemsets = prune_candidates(candidate_counts, total_transactions, min_support)

        print("== Calculated support ==")

        if not frequent_itemsets:
            break

        all_frequent_itemsets.update(frequent_itemsets)
        print(f"== Finished {k} Frequent Itemsets ==")

    save_frequent_itemsets_csv(all_frequent_itemsets, output_file)
    return all_frequent_itemsets

# Run the Apriori algorithm
# Tunable parameters for this run; both are also embedded in the output file name.
max_length = 2
min_support = 0.01

chunk_folder = '../Dataset/transactions_split_chunks/'  #EDIT THIS TO YOUR CHUNK FOLDER
itemsets_csv_path = f'../Frequent Itemsets/transactions_{min_support}_{max_length}.csv'

final_frequent_itemsets = apriori(
    csv_folder=chunk_folder,
    max_length=max_length,
    min_support=min_support,
    output_file=itemsets_csv_path,
)
print("Done")

Unique Items: {'Bills and Utilities', 'Tax', 'Entertainment', 'Health', 'Clothing', 'Education', 'Fines', 'Gambling', 'Motor/Travel', 'Housing', 'Savings', 'Groceries'}
== Generating 1 Itemsets ==
Candidate Itemsets for k=1: {('Housing',), ('Education',), ('Fines',), ('Bills and Utilities',), ('Clothing',), ('Groceries',), ('Gambling',), ('Entertainment',), ('Health',), ('Motor/Travel',), ('Savings',), ('Tax',)}
== 1 Frequent Itemsets generated ==
== Calculating support for k=1 ==
== Calculated support ==
== Finished 1 Frequent Itemsets ==
== Generating 2 Itemsets ==
Candidate Itemsets for k=2: {('Bills and Utilities', 'Clothing'), ('Clothing', 'Gambling'), ('Housing', 'Savings'), ('Bills and Utilities', 'Fines'), ('Bills and Utilities', 'Health'), ('Fines', 'Savings'), ('Bills and Utilities', 'Entertainment'), ('Entertainment', 'Groceries'), ('Education', 'Gambling'), ('Bills and Utilities', 'Savings'), ('Motor/Travel', 'Savings'), ('Health', 'Gambling'), ('Gambling', 'Motor/Travel'),