In [2]:
# Step 1: Upload and Read the Dataset

from google.colab import files
import pandas as pd
import itertools
import io

In [3]:
# Upload dataset: open the Colab file-upload prompt (google.colab.files).
# The result is kept in `uploaded`; later cells iterate it to get the
# filename and index it by that filename to get the file content.
uploaded = files.upload()


Saving 8)apriori.csv to 8)apriori.csv


In [4]:
# Get the filename and file content.
# Fail with a readable message if the upload was cancelled or empty, instead
# of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a file.")

# Only the first uploaded file is used; any additional files are ignored.
filename = next(iter(uploaded))
content = uploaded[filename]


In [5]:
# Read the uploaded bytes into a DataFrame, choosing the reader from the file
# extension. The check is case-insensitive so that e.g. "DATA.XLSX" is not
# mistakenly parsed as CSV.
if filename.lower().endswith(".xlsx"):
    # Excel workbook: io.BytesIO lets pandas treat the raw bytes as a file object
    df = pd.read_excel(io.BytesIO(content))
else:
    # Anything else is read as CSV, decoded as latin1
    df = pd.read_csv(io.BytesIO(content), encoding='latin1')

# Preview the data
df.head(20)


Unnamed: 0,Transaction_ID,Items _Bought
0,1,Bread; Milk
1,2,Bread; Butter
2,3,Milk; Butter
3,4,Bread; Milk; Butter
4,5,Sugar; Tea; Coffee
5,6,Bread; Butter
6,7,Milk; Sugar
7,8,Tea; Coffee
8,9,Bread; Milk; Butter; Sugar
9,10,Bread; Sugar


In [6]:
# Function to calculate support of an itemset
def get_support(itemset, transactions):
    """Return the fraction of transactions that contain every item in `itemset`.

    Args:
        itemset: Iterable of items (set, frozenset, list, tuple, ...). Note that
            a bare string is treated as a collection of characters; wrap a
            single item in a list or set.
        transactions: Sized collection of transactions, each an iterable of items.

    Returns:
        A float in [0, 1]. Returns 0.0 when there are no transactions, rather
        than raising ZeroDivisionError.
    """
    total = len(transactions)
    if total == 0:
        return 0.0

    wanted = frozenset(itemset)
    # issubset() accepts any iterable, so transactions need not be copied into sets.
    matches = sum(1 for transaction in transactions if wanted.issubset(transaction))
    return matches / total


In [12]:
# Apriori algorithm
def apriori(transactions, min_support):
    """Find all frequent itemsets with a level-wise (Apriori) search.

    Args:
        transactions: Iterable of transactions, each an iterable of hashable
            items. It is materialised once, so a one-shot iterator is fine.
        min_support: Minimum fraction of transactions an itemset must appear
            in to be kept (compared with ``>=``).

    Returns:
        A dict mapping itemset size ``k`` to ``{frozenset: support}`` for the
        frequent itemsets of that size. Key 1 is always present. The search
        stops after the first level that yields no frequent itemsets, and that
        empty level is included, so the highest key always maps to ``{}``.
    """
    # Build each transaction's set once, instead of once per candidate itemset.
    baskets = [frozenset(transaction) for transaction in transactions]
    n_baskets = len(baskets)

    def keep_frequent(candidates):
        """Return {candidate: support} for the candidates meeting min_support."""
        kept = {}
        for candidate in candidates:
            hits = sum(1 for basket in baskets if candidate <= basket)
            # Candidates are built from observed items, so n_baskets > 0 here.
            support = hits / n_baskets
            if support >= min_support:
                kept[candidate] = support
        return kept

    # Level 1: every distinct item as a singleton itemset.
    items = set(itertools.chain.from_iterable(baskets))
    itemsets = {1: keep_frequent(frozenset([item]) for item in items)}

    # Level k: join pairs of frequent (k-1)-itemsets whose union has k items.
    k = 2
    while itemsets[k - 1]:
        previous = itemsets[k - 1]
        candidates = set()
        for left, right in itertools.combinations(previous, 2):
            union = left | right
            if len(union) != k:
                continue
            # Apriori prune: a k-itemset can only be frequent if every one of
            # its (k-1)-subsets is frequent, so skip counting the others.
            if all(union - {item} in previous for item in union):
                candidates.add(union)

        itemsets[k] = keep_frequent(candidates)
        k += 1

    return itemsets

In [13]:
# Function to generate association rules
def generate_rules(frequent_itemsets, transactions, min_confidence, min_lift):
    """Enumerate every association rule derivable from the given itemsets.

    For each itemset with at least two items, every non-empty proper subset is
    used as an antecedent and the remaining items as the consequent.

    Args:
        frequent_itemsets: Iterable of itemsets (any iterable of items).
        transactions: Transactions, passed through to ``get_support``.
        min_confidence: Accepted for interface compatibility but NOT applied
            here; all rules are returned and the caller does the filtering.
        min_lift: Accepted for interface compatibility but NOT applied here.

    Returns:
        A list of ``(antecedent, consequent, support, confidence, lift)``
        tuples, where antecedent and consequent are frozensets. Confidence is
        0 when the antecedent has zero support; lift is 0 when the consequent
        has zero support.
    """
    # Each subset's support is needed many times (as antecedent in one rule
    # and consequent in another), so compute it at most once.
    support_cache = {}

    def support_of(items):
        if items not in support_cache:
            support_cache[items] = get_support(items, transactions)
        return support_cache[items]

    rules = []
    for itemset in frequent_itemsets:
        itemset = frozenset(itemset)  # allows lists/tuples; no-op for frozensets
        if len(itemset) < 2:
            continue

        # Same for every rule derived from this itemset, so computed once.
        support_itemset = support_of(itemset)

        for size in range(1, len(itemset)):
            for combo in itertools.combinations(itemset, size):
                antecedent = frozenset(combo)
                consequent = itemset - antecedent

                support_antecedent = support_of(antecedent)
                support_consequent = support_of(consequent)

                confidence = support_itemset / support_antecedent if support_antecedent > 0 else 0
                lift = confidence / support_consequent if support_consequent > 0 else 0

                rules.append((antecedent, consequent, support_itemset, confidence, lift))
    return rules


In [18]:
# Parameters (adjust if needed)
min_support = 0.15   # Lower threshold to find more frequent items
min_confidence = 0.5
min_lift = 1.2  # Set 1.0 if you want all, increase if you want stronger rules

# Extract transactions from the 'Items _Bought' column (NOTE: semicolon-separated!)
# Rows with a missing value are skipped (they would otherwise crash the strip
# step), and blank tokens from trailing/double separators ("Bread; Milk;")
# are dropped so they do not show up as a phantom '' item.
transactions = []
for raw_items in df['Items _Bought'].str.split(';').dropna():
    items = [token.strip() for token in raw_items if token.strip()]
    if items:
        transactions.append(items)

# Run Apriori Algorithm (returns {itemset size: {itemset: support}})
frequent_itemsets = apriori(transactions, min_support)

# Collect all frequent itemsets across every size into one flat list
all_frequent_itemsets = [itemset for k, itemsets_k in frequent_itemsets.items() for itemset in itemsets_k.keys()]

# Generate rules from frequent itemsets (unfiltered; thresholds are applied below)
all_rules = generate_rules(all_frequent_itemsets, transactions, min_confidence, min_lift)

# Print all possible association rules
print("\nAll Possible Association Rules:")
for rule in all_rules:
    print(f"{set(rule[0])} => {set(rule[1])} (Support: {rule[2]:.2f}, Confidence: {rule[3]:.2f}, Lift: {rule[4]:.2f})")

# Keep only the strong rules: confidence and lift both meet their thresholds
strong_rules = [rule for rule in all_rules if rule[3] >= min_confidence and rule[4] >= min_lift]

print("\nStrong Association Rules:")
for rule in strong_rules:
    print(f"{set(rule[0])} => {set(rule[1])} (Support: {rule[2]:.2f}, Confidence: {rule[3]:.2f}, Lift: {rule[4]:.2f})")



All Possible Association Rules:
{'Tea'} => {'Coffee'} (Support: 0.20, Confidence: 0.67, Lift: 2.22)
{'Coffee'} => {'Tea'} (Support: 0.20, Confidence: 0.67, Lift: 2.22)
{'Tea'} => {'Sugar'} (Support: 0.20, Confidence: 0.67, Lift: 1.48)
{'Sugar'} => {'Tea'} (Support: 0.20, Confidence: 0.44, Lift: 1.48)
{'Milk'} => {'Butter'} (Support: 0.25, Confidence: 0.56, Lift: 1.23)
{'Butter'} => {'Milk'} (Support: 0.25, Confidence: 0.56, Lift: 1.23)
{'Milk'} => {'Bread'} (Support: 0.25, Confidence: 0.56, Lift: 1.11)
{'Bread'} => {'Milk'} (Support: 0.25, Confidence: 0.50, Lift: 1.11)
{'Coffee'} => {'Sugar'} (Support: 0.20, Confidence: 0.67, Lift: 1.48)
{'Sugar'} => {'Coffee'} (Support: 0.20, Confidence: 0.44, Lift: 1.48)
{'Milk'} => {'Sugar'} (Support: 0.15, Confidence: 0.33, Lift: 0.74)
{'Sugar'} => {'Milk'} (Support: 0.15, Confidence: 0.33, Lift: 0.74)
{'Bread'} => {'Butter'} (Support: 0.30, Confidence: 0.60, Lift: 1.33)
{'Butter'} => {'Bread'} (Support: 0.30, Confidence: 0.67, Lift: 1.33)
{'Milk'