In [4]:
import pandas as pd
from collections import Counter
import time
# from data_test import *
# from mushroom import *
# from retail_transaction_dataset import *
# from fruithut.fruithut import *

In [5]:
# sample
data = {
    'Tid': ['T1', 'T2', 'T3', 'T4', 'T5', 'T6'],
    'Items': [['a', 'c', 'd'],
              ['a', 'b', 'd'],
              ['b', 'c', 'd', 'e'],
              ['a', 'd'],
              ['c', 'd', 'e'],
              ['a', 'b', 'c', 'd', 'e']]
}
df = pd.DataFrame(data)

# data = {
#     'Tid': ['T1', 'T2', 'T3', 'T4', 'T5', 'T6'],
#     'Items': [['apple', 'cherry', 'durian'],
#               ['apple', 'banana', 'durian'],
#               ['banana', 'cherry', 'durian', 'elderberry'],
#               ['apple', 'durian'],
#               ['cherry', 'durian', 'elderberry'],
#               ['apple', 'banana', 'cherry', 'durian', 'elderberry']]
# }

# df = pd.DataFrame(data)
df['Item_Length'] = df['Items'].apply(lambda items: len(items))
len_df = len(df)
df

Unnamed: 0,Tid,Items,Item_Length
0,T1,"[a, c, d]",3
1,T2,"[a, b, d]",3
2,T3,"[b, c, d, e]",4
3,T4,"[a, d]",2
4,T5,"[c, d, e]",3
5,T6,"[a, b, c, d, e]",5


In [6]:
unique_items = sorted(df['Items'].explode().unique()) # get unique item => save to list
unique_items

['a', 'b', 'c', 'd', 'e']

In [7]:
length_transaction = df["Items"].apply(len)
if length_transaction.nunique() == 1:
    hastheSameLengh = True
else:
    hastheSameLengh = False

hastheSameLengh

False

In [8]:
length_transaction = length_transaction.to_list()
length_transaction

[3, 3, 4, 2, 3, 5]

In [9]:
# calculate stset: {'a': [T1, T2, T4, T6]} - list Tid containing unique item
def cal_stset(df):
    stset = {} 
    for item in unique_items:
        tid_list = df[df['Items'].apply(lambda items: item in items)]['Tid'].tolist() # create column Items with items in unique item
        tid_lengths = [len(df[df['Tid'] == tid]['Items'].iloc[0]) for tid in tid_list if item in df[df['Tid'] == tid]['Items'].iloc[0]]
        stset[item] = {"StSet": tid_list, "Length_transaction": tid_lengths} # add value with item_key

    df_stset = pd.DataFrame.from_dict(stset, orient = 'index').reset_index()
    df_stset.columns = ["Items", "StSet", "Length_transaction"]
    df_stset['Items'] = df_stset['Items'].apply(lambda x: [x]) 
    return df_stset

In [10]:
# calculate support - count number of Tid containing unique item
def cal_support(df_stset):
    df_stset['Support'] = df_stset['StSet'].apply(len)
    return df_stset

In [11]:
df_stset = cal_stset(df)
df_stset

Unnamed: 0,Items,StSet,Length_transaction
0,[a],"[T1, T2, T4, T6]","[3, 3, 2, 5]"
1,[b],"[T2, T3, T6]","[3, 4, 5]"
2,[c],"[T1, T3, T5, T6]","[3, 4, 3, 5]"
3,[d],"[T1, T2, T3, T4, T5, T6]","[3, 3, 4, 2, 3, 5]"
4,[e],"[T3, T5, T6]","[4, 3, 5]"


In [12]:
df_stset = cal_support(df_stset)
df_stset

Unnamed: 0,Items,StSet,Length_transaction,Support
0,[a],"[T1, T2, T4, T6]","[3, 3, 2, 5]",4
1,[b],"[T2, T3, T6]","[3, 4, 5]",3
2,[c],"[T1, T3, T5, T6]","[3, 4, 3, 5]",4
3,[d],"[T1, T2, T3, T4, T5, T6]","[3, 3, 4, 2, 3, 5]",6
4,[e],"[T3, T5, T6]","[4, 3, 5]",3


In [13]:
# ex: 'a': {'l(a)': [2, 3, 5], 'n(a)': [1, 2, 1]}
def df_prepare_UBO(df_stset):
    l_item_list = []
    n_item_list = []
    for index, row in df_stset.iterrows():
        item = row['Items']
        length_transaction = row['Length_transaction']

        l_item = sorted(set(length_transaction)) # get unique len(Tid) => sort ascending

        counter = Counter(length_transaction)
        n_item = [counter[i] for i in l_item] # count unique len(Tid) in occupancy_list => same index with l_item
        
        l_item_list.append(l_item)
        n_item_list.append(n_item)
    
    df_stset = df_stset.assign(l_item=l_item_list, n_item=n_item_list)
    return df_stset

In [14]:
# calculate occupancy - O(P) = ∑ T ∈ STSet(P) |P|/|T|
# |P|: len(unique item) itemset {a} =>1
# |T|: len(Tid) 1/3 + 1/3 + 1/2 + 1/5 
def cal_occupancy(df_stset):
    occupancy_data = []
    for index, row in df_stset.iterrows():
        item = row['Items']
        length_transaction = row['Length_transaction']
        total = 0
        for length in length_transaction:
            total += len(item) / length
        occupancy_data.append({'Items': item, 'Occupancy': round(total, 2)})
    
    df_occupancy = pd.DataFrame(occupancy_data)
    df_stset['Occupancy'] = df_occupancy['Occupancy']
    return df_stset

In [15]:
df_stset = cal_occupancy(df_stset)
df_stset

Unnamed: 0,Items,StSet,Length_transaction,Support,Occupancy
0,[a],"[T1, T2, T4, T6]","[3, 3, 2, 5]",4,1.37
1,[b],"[T2, T3, T6]","[3, 4, 5]",3,0.78
2,[c],"[T1, T3, T5, T6]","[3, 4, 3, 5]",4,1.12
3,[d],"[T1, T2, T3, T4, T5, T6]","[3, 3, 4, 2, 3, 5]",6,1.95
4,[e],"[T3, T5, T6]","[4, 3, 5]",3,0.78


In [16]:
# calculate according to the formula: ni x lx/li
def cal_ubo(l, n):
    total = 0
    for i in range(len(l)):
        total += n[i] * l[0] / l[i]
    return round(total, 2)

In [17]:
# summarize: ∑ni x lx/li => save to list 
def ubo_final(length, number_transaction):
    ubo = []
    for i in range(len(length)): 
        # ex: len = [2,3,5], num_trans = [1,2,1]
        # i = 0 => len = [2,3,5], num_trans = [1,2,1]
        # i = 1 => len = [3,5], num_trans = [2,1]
        # ...
        ubo.append(cal_ubo(length[i:], number_transaction[i:])) # save result cal_ubo for each i => get maxUBO
    return ubo

In [18]:
# get max from summarize => save max value in UBO by key
def calculate_maxUBO(df_UBO):
    df_UBO['List_UBO'] = None # create new column
    df_UBO['Max_UBO'] = None # create new column
    for index, row in df_UBO.iterrows():
        length = row['l_item'] #get list of len(Tid) containing unique item
        number_transaction = row['n_item'] # count unique len(Tid) in occupancy_list
        
        ubo = ubo_final(length, number_transaction) # get list of UBO by i. ex: [2.73, 2.6, 1.0]
        max_ubo = max(ubo) # max list of UBO
        
        df_UBO.at[index, 'List_UBO'] = ubo # save result in df
        df_UBO.at[index, 'Max_UBO'] = max_ubo # save result in df
        
    return df_UBO

In [19]:
# UBO calculation methods: main function
def cal_UBO(df_stset): 
    df_stset = df_prepare_UBO(df_stset)    
    df_stset = calculate_maxUBO(df_stset)
    return df_stset

In [20]:
df_stset = cal_UBO(df_stset)
df_stset

Unnamed: 0,Items,StSet,Length_transaction,Support,Occupancy,l_item,n_item,List_UBO,Max_UBO
0,[a],"[T1, T2, T4, T6]","[3, 3, 2, 5]",4,1.37,"[2, 3, 5]","[1, 2, 1]","[2.73, 2.6, 1.0]",2.73
1,[b],"[T2, T3, T6]","[3, 4, 5]",3,0.78,"[3, 4, 5]","[1, 1, 1]","[2.35, 1.8, 1.0]",2.35
2,[c],"[T1, T3, T5, T6]","[3, 4, 3, 5]",4,1.12,"[3, 4, 5]","[2, 1, 1]","[3.35, 1.8, 1.0]",3.35
3,[d],"[T1, T2, T3, T4, T5, T6]","[3, 3, 4, 2, 3, 5]",6,1.95,"[2, 3, 4, 5]","[1, 3, 1, 1]","[3.9, 4.35, 1.8, 1.0]",4.35
4,[e],"[T3, T5, T6]","[4, 3, 5]",3,0.78,"[3, 4, 5]","[1, 1, 1]","[2.35, 1.8, 1.0]",2.35


In [21]:
def mine_hoi_1_itemset(threshold, hastheSameLengh, df_stset):
    C1 = []
    HOI1 = []
    for index, row in df_stset.iterrows():
        item = row['Items'] # 1-itemset in row
        support = row['Support'] # support of 1-itemset
        occupancy = row['Occupancy'] # occuopancy of 1-itemset
        max_ubo = row['Max_UBO'] # max_ubo of 1-itemset
        
        if support >= threshold:
            if hastheSameLengh is False:
                if max_ubo >= threshold:
                    C1.append(item)
                    if occupancy >= threshold:
                        HOI1.append(item)
            else:
                C1.append(item)
                if occupancy >= threshold:
                    HOI1.append(item)
    
    return C1, HOI1

In [22]:
def df_intersection(items1, items2, df_stset):    
    df_intersection = pd.DataFrame(columns=['Items', 'StSet'])
    # set1 = set(items1)
    # set2 = set(items2)
    list_item = sorted(list(set(items1) | set(items2)))
    
    list_occupancy_item = []
    
    for i in list_item:
        list_occupancy_item.append(df_stset[df_stset['Items'].apply(lambda item: i in item)]["StSet"].iloc[0])
        
    intersection_list = set(list_occupancy_item[0])
    for sublist in list_occupancy_item[1:]:
        intersection_list = intersection_list.intersection(sublist)

    intersection_list = sorted(intersection_list, key = lambda x: x[0])
    
    df_intersection = df_intersection.append({'Items': list_item, 'StSet': intersection_list}, ignore_index=True)
    df_intersection['Length_transaction'] = df_intersection['StSet'].apply(lambda x: [len(df[df['Tid'] == tid]['Items'].iloc[0]) for tid in x])
    return df_intersection

In [23]:
test1 = ['a']
test2 = ['b']
df_intersection(test1, test2, df_stset)

Unnamed: 0,Items,StSet,Length_transaction
0,"[a, b]","[T6, T2]","[5, 3]"


In [24]:
def is_same_equivalence_class(P1, P2):
    if len(P1) == len(P2):
        if len(P1) == 1:
            if P1 == P2:
                return False
            else:
                return True
        else:
            if P1 == P2:
                return False
            else:
                new_P1 = P1[:-1]
                new_P2 = P2[:-1]
                if new_P1 == new_P2:
                    return True
                else:
                    return False
    else:
        return False
        # else:
        #     intersection_P1_P2 = sorted(list(set(P1) & set(P2)))
        #     print(intersection_P1_P2)
        #     if len(intersection_P1_P2) == len(P1) - 1:
        #         for element in intersection_P1_P2:
        #             if P1.index(element) != P2.index(element):
        #                 return False
        #         return True
        #     else:
        #         return False

In [25]:
test1 = ['a']
test2 = ['b']
print(is_same_equivalence_class(test1, test2))

True


In [26]:
def cal_occupancy_candidate(items):    
    df_candidate = pd.DataFrame(columns=['Items', 'StSet'])
    
    list_stset_item = []
    
    for i in items:
        list_stset_item.append(df_stset[df_stset['Items'].apply(lambda item: i in item)]["StSet"].iloc[0])
        
    intersection_list = set(list_stset_item[0])
    for sublist in list_stset_item[1:]:
        intersection_list = intersection_list.intersection(sublist)
    
    intersection_list = sorted(intersection_list, key = lambda x: x[0])
    
    df_candidate = df_candidate.append({'Items': items, 'StSet': intersection_list}, ignore_index=True)
    df_candidate['Length_transaction'] = df_candidate['StSet'].apply(lambda x: [len(df[df['Tid'] == tid]['Items'].iloc[0]) for tid in x])
    df_candidate = cal_occupancy(df_candidate)
    return df_candidate
    

In [27]:
test = ['a','b','d']
df_test = cal_occupancy_candidate(test)['Occupancy'].iloc[0]
print(df_test)

1.6


In [28]:
def mine_hoi_k_itemset(threshold, hastheSameLengh, CK_minus_1, df_stset):
    CK = []
    HOIK = []
    
    while len(CK_minus_1) > 0:
        P1 = sorted(CK_minus_1[0])
        for P2 in CK_minus_1:
            sorted(P2)
            if is_same_equivalence_class(P1, P2):
                P = df_intersection(P1, P2, df_stset)
                P_items = P['Items'].iloc[0]
                P_stset = P['StSet'].iloc[0]
                if len(P_stset) >= threshold:
                    if hastheSameLengh is False:
                        P_ubo = cal_UBO(P)['Max_UBO'].iloc[0]
                        if P_ubo >= threshold:
                            CK.append(P_items)
                    else:
                        CK.append(P_items)
                        
        CK_minus_1.pop(0)
    
    for i in CK:
        if cal_occupancy_candidate(i)['Occupancy'].iloc[0] >= threshold:
            HOIK.append(i)
    
    return CK, HOIK
        
                        
    

In [29]:
#test FHOI
HOIS = []
C1 = []
HOI1 = []
CK_minus_1 = []

k = 2 # loop to create 2-itemset
threshold = 0.25 
threshold = threshold * len_df # ex: threshold = 25% of len(database)
start_time = time.time()

#create candidate 1 and HOI1 itemset
C1, HOI1 = mine_hoi_1_itemset(threshold, hastheSameLengh, df_stset)

HOIS = HOI1
CK_minus_1 = C1

while CK_minus_1:
    CK, HOIK = mine_hoi_k_itemset(threshold, hastheSameLengh, CK_minus_1, df_stset)
    print(HOIK)
    # HOIS.append(HOIK)
    CK_minus_1 = CK
    k += 1

for i in HOIS:
    print(i)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

#update try except


[['a', 'd'], ['b', 'd'], ['c', 'd'], ['c', 'e'], ['d', 'e']]
[['a', 'b', 'd'], ['a', 'c', 'd'], ['c', 'd', 'e']]
[['b', 'c', 'd', 'e']]
[]
['d']
Execution time: 0.3664286136627197 seconds


In [32]:
def mine_depth_hois(threshold, hastheSameLengh, C1, df_stset):
    HOIS = []
    for i in range(len(C1)):
        P1 = C1[i]
        C_l = []
        for j in range(i + 1, len(C1)):
            P2 = C1[j]
            P = df_intersection(P1, P2, df_stset)
            P_items = P['Items'].iloc[0]
            P_stset = P['StSet'].iloc[0]
            if len(P_stset) >= threshold:
                if hastheSameLengh is False:
                    P_ubo = cal_UBO(P)['Max_UBO'].iloc[0]
                    if P_ubo >= threshold:
                        C_l.append(P_items)
                else:
                    C_l.append(P_items)
        HOIS.extend(mine_depth_hois(threshold, hastheSameLengh, C_l, df_stset))
        
        for i in C_l:
            if cal_occupancy_candidate(i)['Occupancy'].iloc[0] >= threshold:
                HOIS.append(i)
    
    return HOIS
        
                        

In [33]:
#test DFHOI
HOIS = []
C1 = []
HOI1 = []
CK_minus_1 = []

k = 2 # loop to create 2-itemset
threshold = 0.25 
threshold = threshold * len_df # ex: threshold = 25% of len(database)
start_time = time.time()

#create candidate 1 and HOI1 itemset
C1, HOI1 = mine_hoi_1_itemset(threshold, hastheSameLengh, df_stset)

HOIS = HOI1

while C1:
    HOIS.append(mine_depth_hois(threshold, hastheSameLengh, C1, df_stset))
    
for i in HOIS:
    print(i)

KeyboardInterrupt: 