In [6]:
import pandas as pd
from collections import defaultdict
from itertools import combinations
from tqdm import tqdm

# Sampling the dataset for testing
transaction_data = pd.read_csv("courses_recommender/ui/data/clean_user_with_courses.csv")

# Convert the sampled data to transactions
def convert_to_transactions(transaction_data):
    transactions = []
    for _, row in tqdm(transaction_data.iterrows(), desc="Converting transaction data to transactions"):
        transaction = [item for item in row[1:] if pd.notnull(item)]
        transactions.append(transaction)
    return transactions

sampled_data = transaction_data.sample(n=1000)
# Convert the sampled data into transactions
transactions = convert_to_transactions(sampled_data)

# Calculate absolute minimum support
min_support = 0.03
absolute_min_support = int(min_support * len(transactions))

def eclat(transactions, absolute_min_support):
    # Step 1: Convert transactions to vertical data format
    vertical_data = defaultdict(list)
    for tid, transaction in enumerate(tqdm(transactions, desc="Converting transactions to vertical data")):
        for item in transaction:
            vertical_data[item].append(tid)

    # Step 2: Get frequent single-item itemsets
    frequent_itemsets = []
    for item, tids in tqdm(vertical_data.items(), desc="Getting frequent single-item itemsets"):
        support = len(tids)
        if support >= absolute_min_support:  # Compare with absolute minimum support
            frequent_itemsets.append(([item], tids))
    print(frequent_itemsets)
    answer = []
    answer.extend(frequent_itemsets)

    # Step 3: Generate frequent itemsets
    k = 2
    while True:
        candidates = generate_candidates(frequent_itemsets, k, absolute_min_support)
        answer.extend(candidates)
        if not candidates:
            break
        frequent_itemsets = candidates
        print(frequent_itemsets)
        k += 1

    return answer, vertical_data

def generate_candidates(frequent_itemsets, k, absolute_min_support):
    candidates = []
    itemsets = [itemset for itemset, _ in frequent_itemsets]
    for itemset_pair in tqdm(combinations(itemsets, 2), desc=f"Generating candidates (k={k})"):
        itemset1, itemset2 = itemset_pair
        union_itemset = set(itemset1) | set(itemset2)
        if len(union_itemset) == k:
            candidate = list(set(sorted(union_itemset)))
            if candidate not in [itemset for itemset, _ in candidates]:
                tids1 = [tids for itemset, tids in frequent_itemsets if itemset == itemset1][0]
                tids2 = [tids for itemset, tids in frequent_itemsets if itemset == itemset2][0]
                candidate_tids = intersect(tids1, tids2)
                if len(candidate_tids) >= absolute_min_support:  # Compare with absolute minimum support
                    candidates.append((candidate, candidate_tids))
    return candidates


def intersect(tids1, tids2):
    return [tid for tid in tids1 if tid in tids2]

def generate_association_rules(frequent_itemsets, vertical_data, min_confidence):
    association_rules = []
    for itemset, tids in tqdm(frequent_itemsets, desc="Generating association rules"):
        if len(itemset) > 1:
            for i in range(1, len(itemset)):
                for antecedent in combinations(itemset, i):
                    antecedent = list(antecedent)
                    consequent = [item for item in itemset if item not in antecedent]
                    antecedent_tids = set([tid for item in antecedent for tid in vertical_data[item]])
                    itemset_tids = set(tids)
                    confidence = len(itemset_tids) / len(antecedent_tids) if antecedent_tids else 0
                    print(f"Antecedent: {antecedent}, Consequent: {consequent}, Confidence: {confidence}")
                    if confidence >= min_confidence:
                        association_rules.append((antecedent, consequent, confidence))
    return association_rules


# Run ECLAT on the sampled transactions
frequent_itemsets, vertical_data = eclat(transactions, min_support)

# Generate association rules
association_rules = generate_association_rules(frequent_itemsets, vertical_data, absolute_min_support)

# association_rules.to_csv("association_rules.csv", index=False)

Converting transaction data to transactions: 1000it [00:00, 10130.04it/s]
Converting transactions to vertical data: 100%|██████████| 1000/1000 [00:00<00:00, 769738.30it/s]
Getting frequent single-item itemsets: 100%|██████████| 999/999 [00:00<00:00, 1304111.33it/s]


[(['PY0101EN, ML0115EN, ML0101ENv3, SC0101EN, DA0101EN'], [0]), (['BD0211EN, SC0103EN, ML0101EN, SC0105EN, SC0101EN'], [1]), (['PY0101EN, BD0101EN, ML0101ENv3, CO0101EN, DS0101EN, BD0111EN, ST0101EN, CO0201EN'], [2]), (['DV0101EN, DS0105EN, PY0101EN, DS0101EN, DA0101EN, DS0103EN'], [3]), (['CO0101EN, ML0103EN, PY0101EN, DS0101EN, BC0101EN, BD0111EN, CO0201EN, CO0401EN'], [4]), (['DB0101EN, DS0105EN, DS0301EN, DS0101EN, DS0103EN'], [5]), (['ML0115EN, CB0103EN, CL0101EN, DS0101EN, PY0101EN, DS0105EN'], [6]), (['CO0301EN, CC0101EN, CO0401EN, CO0101EN, CC0201EN, CO0201EN'], [7]), (['BD0101EN, BD0111EN, DS0301EN, DS0101EN, CB0103EN'], [8]), (['DS0105EN, BD0101EN, DS0101EN, ML0120ENv2, DS0103EN, ML0115EN, RP0101EN, PY0101EN'], [9]), (['BD0211EN, DS0101EN, PY0101EN, BD0101EN, BD0111EN'], [10]), (['BC0201EN, DB0151EN, PY0101EN, ML0201EN, BC0101EN, CO0201EN, SW0201EN, DA0101EN, CO0101EN, CB0103EN'], [11]), (['DS0101EN, CC0101EN, CC0201EN, CO0401EN, CO0301EN, PY0101EN'], [12]), (['ST0101EN, BD01

Generating candidates (k=2): 15971it [00:02, 6601.67it/s]


KeyboardInterrupt: 

In [5]:
print(frequent_itemsets)

[(['DV0101EN, ML0101ENv3, ST0101EN, DS0105EN, PY0101EN, DA0101EN'], [0]), (['CL0101EN, BD0101EN, TA0105, ML0115EN, DS0301EN, ML0101EN'], [1]), (['DS0103EN, BD0111EN, DS0101EN, BD0211EN, DS0105EN'], [2]), (['BD0101EN, CC0103EN, CO0101EN, ML0101ENv3, BD0143EN, BD0211EN, DS0103EN, CB0103EN, BD0153EN, ML0115EN, DS0105EN, ML0120ENv2, CC0101EN, BD0115EN, BD0111EN, ST0101EN, BC0101EN, BD0141EN, PY0101EN, SC0101EN, DA0101EN, CC0150EN, RP0105EN, CC0201EN, CO0201EN, DV0101EN, CC0210EN, RP0101EN, BD0121EN, DS0101EN, WA0101EN'], [3]), (['BC0101EN, CC0103EN, DS0101EN, CL0101EN, CC0101EN, BD0101EN'], [4]), (['PY0101EN, DV0101EN, CC0101EN, TA0105EN, ST0101EN, DA0101EN, DS0103EN'], [5]), (['ML0201EN, CB0103EN, DS0105EN, PY0101EN, CC0101EN, DS0101EN, DA0101EN, ST0101EN, DS0103EN'], [6]), (['DS0101EN, RP0101EN, ML0101EN, TA0105, DS0103EN, BD0101EN, TA0106EN, ST0101EN, BD0111EN'], [7]), (['DA0101EN, DV0101EN, BC0202EN, PY0101EN, ML0120ENv2, DB0101EN'], [8]), (['DS0103EN, BD0111EN, DS0105EN, DS0101EN, ST0

In [2]:
import pandas as pd
from collections import defaultdict
from itertools import combinations
from tqdm import tqdm

# Sample the dataset for testing (make sure your dataset is correctly loaded)
transaction_data = pd.read_csv("courses_recommender/ui/data/clean_user_with_courses.csv")

# Convert the sampled data to transactions
def convert_to_transactions(transaction_data):
    # Split the 'item' column into individual items (courses)
    transactions = transaction_data['item'].apply(lambda x: x.split(', '))
    return transactions

# Convert the sampled data into transactions
transactions = convert_to_transactions(transaction_data.sample(n=1000))

# Calculate absolute minimum support
min_support = 0.03
absolute_min_support = int(min_support * len(transactions))

def eclat(transactions, absolute_min_support):
    # Step 1: Convert transactions to vertical data format
    vertical_data = defaultdict(set)
    for tid, transaction in enumerate(tqdm(transactions, desc="Converting transactions to vertical data")):
        for item in transaction:
            vertical_data[item].add(tid)

    # Step 2: Get frequent single-item itemsets
    frequent_itemsets = []
    for item, tids in tqdm(vertical_data.items(), desc="Getting frequent single-item itemsets"):
        if len(tids) >= absolute_min_support:
            frequent_itemsets.append(([item], tids))
    answer = frequent_itemsets.copy()

    # Step 3: Generate frequent itemsets
    k = 2
    while True:
        candidates = generate_candidates(frequent_itemsets, k, absolute_min_support)
        if not candidates:
            break
        answer.extend(candidates)
        frequent_itemsets = candidates
        k += 1

    return answer, vertical_data

def generate_candidates(frequent_itemsets, k, absolute_min_support):
    candidates = []
    itemsets = [itemset for itemset, _ in frequent_itemsets]
    for itemset1, itemset2 in tqdm(combinations(itemsets, 2), desc=f"Generating candidates (k={k})"):
        union_itemset = tuple(sorted(set(itemset1) | set(itemset2)))
        if len(union_itemset) == k:
            tids1 = [tids for itemset, tids in frequent_itemsets if itemset == itemset1][0]
            tids2 = [tids for itemset, tids in frequent_itemsets if itemset == itemset2][0]
            candidate_tids = tids1 & tids2  # Using set intersection
            if len(candidate_tids) >= absolute_min_support:
                candidates.append((union_itemset, candidate_tids))
    return candidates

def generate_association_rules(frequent_itemsets, vertical_data, min_confidence):
    association_rules = []
    for itemset, tids in tqdm(frequent_itemsets, desc="Generating association rules"):
        if len(itemset) > 1:
            for i in range(1, len(itemset)):
                for antecedent in combinations(itemset, i):
                    antecedent = set(antecedent)
                    consequent = set(itemset) - antecedent
                    antecedent_tids = set.intersection(*[vertical_data[item] for item in antecedent])
                    confidence = len(tids) / len(antecedent_tids) if antecedent_tids else 0
                    if confidence >= min_confidence:
                        association_rules.append((list(antecedent), list(consequent), confidence))
    return association_rules

# Run ECLAT on the sampled transactions
frequent_itemsets, vertical_data = eclat(transactions, absolute_min_support)

# Generate association rules
association_rules = generate_association_rules(frequent_itemsets, vertical_data, min_confidence=0.5)
print(association_rules)

# Save rules to CSV
rules_df = pd.DataFrame(association_rules, columns=["Antecedent", "Consequent", "Confidence"])
rules_df.to_csv("association_rules.csv", index=False)


Converting transactions to vertical data: 100%|██████████| 1000/1000 [00:00<00:00, 206942.17it/s]
Getting frequent single-item itemsets: 100%|██████████| 112/112 [00:00<00:00, 320001.40it/s]
Generating candidates (k=2): 1711it [00:00, 34954.75it/s]
Generating candidates (k=3): 87571it [00:01, 81763.52it/s] 
Generating candidates (k=4): 4178687it [01:27, 47530.21it/s] 


KeyboardInterrupt: 