# In [None]:
from scipy.io import arff
from collections import Counter
from itertools import combinations
import numpy as np

# Load ARFF file
def loadarff(filename):
    """Read an ARFF file and return its records with the attribute names.

    Args:
        filename: Path (or file-like object) handed to
            ``scipy.io.arff.loadarff``.

    Returns:
        A ``(data, attribute_names)`` tuple: the record array produced by
        SciPy and the list of attribute names taken from the file metadata.
    """
    records, metadata = arff.loadarff(filename)
    return records, metadata.names()

# Convert ARFF data to list format
def convertarff(data, attribute_names):
    """Turn ARFF records into plain Python lists of decoded values.

    Args:
        data: Iterable of rows, each an iterable of field values.
        attribute_names: Accepted for interface compatibility; not used.

    Returns:
        A list with one list per row. ``bytes`` values are decoded as UTF-8;
        every other value is passed through unchanged.
    """
    def _as_text(value):
        # Only byte strings need decoding; other values stay as they are.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    return [[_as_text(value) for value in record] for record in data]


def findminsup(data_list, attribute_names, minsup=0.5):
    """Return the absolute minimum-support count for a dataset.

    The threshold is the relative support ``minsup`` scaled by the number of
    transactions and truncated to an integer. The value is also printed.

    Args:
        data_list: Sequence of transactions; only its length is used.
        attribute_names: Accepted for interface compatibility; not used.
        minsup: Relative minimum support. Defaults to 0.5.

    Returns:
        ``int(minsup * len(data_list))``.
    """
    mins = int(minsup * len(data_list))
    print(mins)
    return mins

def apriori(data_list, attribute_names, mins):
    """Find the longest frequent itemsets with a level-wise Apriori search.

    Every level of frequent itemsets is printed as it is found, followed by a
    final "Result" section showing the last non-empty level.

    Args:
        data_list: Transactions, each a sequence of hashable values aligned
            with ``attribute_names``.
        attribute_names: Column names, paired positionally with row values.
        mins: Minimum support as an absolute transaction count.

    Returns:
        A ``Counter`` mapping each frequent itemset (``frozenset`` of values)
        of the last non-empty level to its support count. If no itemset of
        length two or more is frequent, the length-1 itemsets are returned.
    """
    # Count each (attribute, value) pair across all transactions.
    pair_counts = Counter()
    for row in data_list:
        for attribute_name, item in zip(attribute_names, row):
            pair_counts[(attribute_name, item)] += 1

    # Keep the pairs that reach the minimum support; they form L1.
    # NOTE(review): the attribute name is dropped here, so identical values
    # from different columns end up in the same itemset (their supports are
    # summed) - confirm this is intended.
    l = Counter()
    for (attribute_name, item), support in pair_counts.items():
        if support >= mins:
            l[frozenset([item])] += support

    # Print the frequent itemsets of length 1 (L1)
    print("\nFrequent Itemsets of Length 1 (L1):")
    for itemset in l:
        item = next(iter(itemset))
        print(f"{item} = {l[itemset]}")
    print()

    # Build each transaction's value set once instead of once per candidate.
    transactions = [set(row) for row in data_list]

    itemsets = l   # frequent itemsets of the last non-empty level
    pos = 1  # length of the itemsets stored in `itemsets`
    for count in range(2, 1000):
        # Join every pair of frequent itemsets from the previous level and
        # keep the unions that have exactly `count` items.
        genCand = set()
        for left, right in combinations(list(l), 2):
            merged = left.union(right)
            if len(merged) == count:
                genCand.add(merged)

        # Count the support of each candidate and keep the frequent ones.
        l = Counter()
        for candidate in genCand:
            support = sum(1 for items in transactions if candidate.issubset(items))
            if support >= mins:
                l[candidate] += support

        print("L" + str(count) + ":")
        for i in l:
            print(str(list(i)) + ": " + str(l[i]))
        print()
        if len(l) == 0:
            break
        itemsets = l
        pos = count

    print("Result: ")
    print("L" + str(pos) + ":")
    for i in itemsets:
        print(str(list(i)) + ": " + str(itemsets[i]))
    print()

    return itemsets

# Association rule extraction

def association_rules(data_list, itemsets):
    """Print the confidence of the rules derived from each frequent itemset.

    For every itemset, each subset ``a`` holding all but one of its items is
    paired with the remainder ``b``. Both directions, ``a -> b`` and
    ``b -> a``, are printed with their confidence as a percentage. A closing
    ``choosing:`` line lists the 1-based positions of the rules whose
    confidence equals the highest confidence found for that itemset.
    Positions count both directions of every split, including a direction
    that was not printed because its antecedent never occurs.

    Args:
        data_list: Transactions, each an iterable of hashable values.
        itemsets: Iterable of non-empty frequent itemsets.

    Returns:
        None. All results are written to standard output.
    """
    # Build each transaction's value set once; every rule below reuses them.
    transactions = [set(row) for row in data_list]

    for itemset in itemsets:
        items = set(itemset)
        mmax = 0
        # Confidence per rule in print order; None marks a skipped rule.
        # Keeping them removes the need to recount supports when choosing.
        confidences = []
        for subset in combinations(itemset, len(itemset) - 1):
            a = frozenset(subset)
            b = items - a
            ab = a.union(b)
            sa = sum(1 for t in transactions if a.issubset(t))
            sb = sum(1 for t in transactions if b.issubset(t))
            sab = sum(1 for t in transactions if ab.issubset(t))

            for antecedent, consequent, support in ((a, b, sa), (b, a, sb)):
                if support == 0:
                    # Confidence is undefined when the antecedent never occurs.
                    confidences.append(None)
                    continue
                confidence = sab / support * 100
                if confidence > mmax:
                    mmax = confidence
                print(f"{list(antecedent)} -> {list(consequent)} = {confidence:.2f}%")
                confidences.append(confidence)

        print("choosing:", end=' ')
        for position, confidence in enumerate(confidences, start=1):
            if confidence is not None and confidence == mmax:
                print(position, end=' ')
        print()
        print()

        
        
def main():
    """Mine 'titanicsurvivors.arff' and print its association rules.

    Loads the dataset, converts it to transaction lists, derives the minimum
    support count, runs Apriori, and prints the rules for the resulting
    frequent itemsets.
    """
    records, names = loadarff('titanicsurvivors.arff')
    transactions = convertarff(records, names)
    threshold = findminsup(transactions, names)
    frequent = apriori(transactions, names, threshold)
    association_rules(transactions, frequent)

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

# In [None]:
from scipy.io import arff
from collections import Counter
from itertools import combinations
import numpy as np
import time

# Load ARFF file
def loadarff(filename):
    """Read an ARFF file and return its records with the attribute names.

    Args:
        filename: Path (or file-like object) handed to
            ``scipy.io.arff.loadarff``.

    Returns:
        A ``(data, attribute_names)`` tuple: the record array produced by
        SciPy and the list of attribute names taken from the file metadata.
    """
    records, metadata = arff.loadarff(filename)
    return records, metadata.names()

# Convert ARFF data to list format
def convertarff(data, attribute_names):
    """Turn ARFF records into plain Python lists of decoded values.

    Args:
        data: Iterable of rows, each an iterable of field values.
        attribute_names: Accepted for interface compatibility; not used.

    Returns:
        A list with one list per row. ``bytes`` values are decoded as UTF-8;
        every other value is passed through unchanged.
    """
    def _as_text(value):
        # Only byte strings need decoding; other values stay as they are.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    return [[_as_text(value) for value in record] for record in data]



def apriori(data_list, attribute_names, minsup):
    """Find all frequent itemsets, grouped by length, with Apriori.

    Args:
        data_list: Transactions, each a sequence of hashable values aligned
            with ``attribute_names``.
        attribute_names: Column names, paired positionally with row values.
        minsup: Relative minimum support; the absolute threshold is
            ``int(minsup * len(data_list))``.

    Returns:
        A dict mapping itemset length to a ``Counter`` of
        ``frozenset`` -> support count. Key ``1`` is always present (possibly
        empty); longer lengths appear only when they contain at least one
        frequent itemset.
    """
    mins = int(minsup * len(data_list))

    # Count each (attribute, value) pair across all transactions.
    pair_counts = Counter()
    for row in data_list:
        for attribute_name, item in zip(attribute_names, row):
            pair_counts[(attribute_name, item)] += 1

    # Keep the pairs that reach the minimum support; they form L1.
    # NOTE(review): the attribute name is dropped here, so identical values
    # from different columns end up in the same itemset (their supports are
    # summed) - confirm this is intended.
    l = Counter()
    for (attribute_name, item), support in pair_counts.items():
        if support >= mins:
            l[frozenset([item])] += support

    # Build each transaction's value set once instead of once per candidate.
    transactions = [set(row) for row in data_list]

    itemsets = {1: l}   # frequent itemsets by length
    for count in range(2, 1000):
        # Join every pair of frequent itemsets from the previous level and
        # keep the unions that have exactly `count` items.
        genCand = set()
        for left, right in combinations(list(l), 2):
            merged = left.union(right)
            if len(merged) == count:
                genCand.add(merged)

        # Count the support of each candidate and keep the frequent ones.
        l = Counter()
        for candidate in genCand:
            support = sum(1 for items in transactions if candidate.issubset(items))
            if support >= mins:
                l[candidate] += support

        # An empty level means no longer itemset can be frequent either.
        if len(l) == 0:
            break
        itemsets[count] = l

    return itemsets



# Tune minsup based on run time and number of itemsets
def adjust_minsup_based_on_performance(data_list, attribute_names, initial_minsup, max_iterations):
    """Search for a minsup whose Apriori run is fast yet yields enough itemsets.

    Apriori is run repeatedly. After each run the relative minimum support is
    raised by 10% when the run took more than 0.05 s, or lowered by 10% when
    it took less than 0.04 s and produced fewer than 30 itemsets. Any other
    outcome is accepted and ends the search. If the iteration budget runs out
    first, the result of the last run is used.

    Progress for every iteration, the chosen minsup (as an absolute count),
    and the resulting itemsets are printed.

    Args:
        data_list: Transactions passed through to ``apriori``.
        attribute_names: Column names passed through to ``apriori``.
        initial_minsup: Relative minimum support to start from.
        max_iterations: Maximum number of Apriori runs.

    Returns:
        The itemsets-by-length dict returned by the selected ``apriori`` run,
        or an empty dict when ``max_iterations`` allows no run at all.
    """
    minsup = initial_minsup
    # The minsup and itemsets of the most recent run are tracked together so
    # the reported threshold always matches the returned itemsets.
    best_minsup = initial_minsup
    best_itemsets = {}

    for iteration in range(max_iterations):
        # perf_counter is a monotonic clock meant for measuring intervals.
        start_time = time.perf_counter()
        frequent_itemsets = apriori(data_list, attribute_names, minsup)
        execution_time = time.perf_counter() - start_time
        num_itemsets = sum(len(level) for level in frequent_itemsets.values())

        print("Iteration:", iteration + 1)
        print("MinSup:", minsup * len(data_list))
        print("Execution time:", execution_time)
        print("Number of itemsets:", num_itemsets)

        best_itemsets = frequent_itemsets
        best_minsup = minsup

        if execution_time > 0.05:
            # Too slow: prune more aggressively on the next run.
            minsup *= 1.1
        elif execution_time < 0.04 and num_itemsets < 30:
            # Fast but too few itemsets: relax the threshold.
            minsup *= 0.9
        else:
            break  # Exit the loop if the conditions are met

    print("Best choice for minsup:", best_minsup * len(data_list))

    print("Result: ")
    for length, freq_itemsets in best_itemsets.items():
        for itemset, support in freq_itemsets.items():
            print(f"L{length}: {list(itemset)} - Support: {support}")

    return best_itemsets


# Association rule extraction

def association_rules(data_list, itemsets):
    """Print the confidence of the rules derived from each frequent itemset.

    Itemsets of length 1 are skipped. For every other itemset, each subset
    ``a`` holding all but one of its items is paired with the remainder
    ``b``. Both directions, ``a -> b`` and ``b -> a``, are printed with their
    confidence as a percentage. A closing ``choosing:`` line lists the
    1-based positions of the rules whose confidence equals the highest
    confidence found for that itemset. Positions count both directions of
    every split, including a direction that was not printed because its
    antecedent never occurs.

    Args:
        data_list: Transactions, each an iterable of hashable values.
        itemsets: Mapping of itemset length to an iterable of frequent
            itemsets of that length.

    Returns:
        None. All results are written to standard output.
    """
    # Build each transaction's value set once; every rule below reuses them.
    transactions = [set(row) for row in data_list]

    for length, freq_itemsets in itemsets.items():
        if length == 1:  # Skip itemsets of length 1
            continue
        for itemset in freq_itemsets:
            l = list(itemset)
            items = set(l)
            mmax = 0
            # Confidence per rule in print order; None marks a skipped rule.
            # Keeping them removes the need to recount supports when choosing.
            confidences = []
            for subset in combinations(l, len(l) - 1):
                a = frozenset(subset)
                b = items - a
                ab = a.union(b)
                sa = sum(1 for t in transactions if a.issubset(t))
                sb = sum(1 for t in transactions if b.issubset(t))
                sab = sum(1 for t in transactions if ab.issubset(t))

                for antecedent, consequent, support in ((a, b, sa), (b, a, sb)):
                    if support == 0:
                        # Confidence is undefined when the antecedent never occurs.
                        confidences.append(None)
                        continue
                    confidence = sab / support * 100
                    if confidence > mmax:
                        mmax = confidence
                    print(f"{list(antecedent)} -> {list(consequent)} = {confidence:.2f}%")
                    confidences.append(confidence)

            print("choosing:", end=' ')
            for position, confidence in enumerate(confidences, start=1):
                if confidence is not None and confidence == mmax:
                    print(position, end=' ')
            print()
            print()


        
        
def main():
    """Mine 'titanicsurvivors.arff' with a tuned minsup and print its rules.

    Loads the dataset, converts it to transaction lists, searches for a
    minimum support starting at 0.5 with at most 20 Apriori runs, and prints
    the association rules for the selected frequent itemsets.
    """
    records, names = loadarff('titanicsurvivors.arff')
    transactions = convertarff(records, names)
    frequent = adjust_minsup_based_on_performance(
        transactions,
        names,
        initial_minsup=0.5,
        max_iterations=20,
    )
    association_rules(transactions, frequent)

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()