In [1]:
import pandas as pd
import numpy as np
from itertools import combinations

# Minimum fraction of transactions an itemset must appear in to be kept as
# frequent (compared with `>=` against count / total_transactions).
MIN_SUPPORT = 0.018
# Minimum confidence, support(itemset) / support(antecedent), a rule needs
# to be written to the results.
MIN_CONFIDENT = 0.6

In [2]:
# Load the transaction table. preprocess_data() later treats every cell equal
# to 1 as "this row contains the product at this column position".
raw_data = pd.read_csv('./data/apriori_data.csv')
# NOTE(review): the bare string below is an expression, so the notebook echoes
# it as this cell's output; the figures are the author's note and are not
# computed or checked here.
'''
Data Ingestion:
3898 samples, 167 products
'''

'\nData Ingestion:\n3898 samples, 167 products\n'

In [3]:
def preprocess_data(raw_data):
    """Turn an indicator table into a list of transactions.

    Every row of ``raw_data.values`` yields one transaction: the list of
    positional column indices whose cell equals 1. Rows without any 1 give
    an empty list.
    """
    return [list(np.where(row == 1)[0]) for row in raw_data.values]

def support_count(transactions, itemsets):
    """Compute the relative support of each candidate itemset.

    Args:
        transactions: sized collection of transactions, each an iterable of
            hashable items.
        itemsets: iterable of frozenset candidates.

    Returns:
        dict mapping every itemset to the fraction of transactions that
        contain all of its items. When there are no transactions at all,
        every itemset gets a support of 0.0 instead of a 0/0 division.
    """
    total_transactions = len(transactions)
    if total_transactions == 0:
        return {itemset: 0.0 for itemset in itemsets}

    # Convert each transaction to a set once, so the subset tests below run
    # against sets instead of re-processing the same lists per candidate.
    transaction_sets = [frozenset(transaction) for transaction in transactions]

    candidate_set = {}
    for itemset in itemsets:
        sup_count = sum(1 for items in transaction_sets if itemset.issubset(items))
        candidate_set[itemset] = sup_count / total_transactions
    return candidate_set

def filter_candidates(candidate_set, min_support):
    """Return the entries of ``candidate_set`` whose support is >= ``min_support``.

    ``candidate_set`` maps itemsets to support values; the result is a new
    dict with the surviving pairs in their original order.
    """
    return {
        itemset: support
        for itemset, support in candidate_set.items()
        if support >= min_support
    }

def generate_candidates_itemsets(filter_set, k_step):
    """Build the candidate itemsets of size ``k_step``.

    Args:
        filter_set: dict whose keys are the frequent itemsets (frozensets)
            found in the previous pass; it is expected to hold *all* of
            them, which is what makes the prune step below safe.
        k_step: size of the itemsets to generate.

    Returns:
        set of frozensets of size ``k_step``.

    Join step: the union of every pair of frequent itemsets is kept when it
    has exactly ``k_step`` items.
    Prune step: a candidate is dropped when any of its (k_step - 1)-item
    subsets is not frequent. A superset can never occur in more
    transactions than its subsets, so such a candidate cannot reach the
    support threshold and counting it would be wasted work.
    """
    freq_items = list(filter_set.keys())
    frequent_lookup = set(freq_items)
    candidate_itemsets = set()

    for left, right in combinations(freq_items, 2):
        itemset = left.union(right)
        if len(itemset) != k_step:
            continue
        if all(
            frozenset(subset) in frequent_lookup
            for subset in combinations(itemset, k_step - 1)
        ):
            candidate_itemsets.add(itemset)

    return candidate_itemsets

def apriori(transactions, min_support, preprocess=True):
    """Mine all frequent itemsets level by level.

    Args:
        transactions: the input data. With ``preprocess`` true it is first
            passed through ``preprocess_data``; otherwise it is used as a
            collection of item collections directly.
        min_support: threshold handed to ``filter_candidates``.
        preprocess: whether to run ``preprocess_data`` on the input.

    Returns:
        dict mapping every frequent itemset (frozenset) of any size to its
        support.
    """
    if preprocess:
        transactions = preprocess_data(transactions)

    # Level 1: every distinct item as a one-element itemset.
    singletons = {frozenset([item]) for basket in transactions for item in basket}
    current_level = filter_candidates(
        support_count(transactions, singletons), min_support=min_support
    )
    all_frequent = dict(current_level)

    # Levels 2, 3, ...: grow from the previous level until nothing survives.
    size = 2
    while current_level:
        candidates = generate_candidates_itemsets(current_level, size)
        current_level = filter_candidates(
            support_count(transactions, candidates), min_support=min_support
        )
        all_frequent.update(current_level)
        size += 1

    return all_frequent

In [4]:
# Mine every frequent itemset, then derive rules from the 3-item ones only.
final_association = apriori(transactions=raw_data, min_support=MIN_SUPPORT)

association = [itemset for itemset in final_association.keys() if len(itemset) == 3]

# Maps a confidence value to the itemsets that produced a rule with exactly
# that confidence (one entry per qualifying antecedent).
result_dict = {}
for itemset in association:
    itemset_support = final_association.get(itemset, 0)
    # Antecedents of size 1 .. len(itemset) - 1; the consequent is whatever
    # remains of the itemset once the antecedent is removed.
    for size in range(1, len(itemset)):
        for members in combinations(itemset, size):
            antecedent = frozenset(members)
            confidence = itemset_support / final_association.get(antecedent, 1)
            if confidence >= MIN_CONFIDENT:
                result_dict.setdefault(confidence, []).append(itemset)


In [5]:
result_data = []

# Generate result file: one row per confidence value. Itemsets sharing that
# confidence are separated by '||' and the products inside an itemset by '^'.
for conf, itemsets in result_dict.items():
    # join() places separators only between elements, so no itemset ends
    # with a stray '^' in front of the next '||'.
    row = '||'.join(
        '^'.join('(' + 'PRODUCT' + str(product) + ')' for product in itemset)
        for itemset in itemsets
    )
    result_data.append([row, conf])

result_df = pd.DataFrame(result_data)
result_df.to_csv('./results/result4.csv', index=False, header=False)