Roshan Gautam
University of the Cumberlands
Data Mining 
Lab 6

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
import warnings
import time
warnings.filterwarnings('ignore')

# Plot styling: matplotlib defaults with seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

print("=== ADVANCED DATA MINING LAB 6 ===")
print("Association Rule Mining with Apriori and FP-Growth")
print("=" * 50)

# Step 1: Data Preparation
print("\n1. DATA PREPARATION")
print("-" * 20)

# Load the Online Retail dataset from the working directory.
# Only the read itself is guarded so unrelated errors are not masked.
try:
    df = pd.read_excel('Online_Retail.xlsx')
except FileNotFoundError:
    print("Error: Online_Retail.xlsx not found. Please download from UCI ML Repository.")
    # Stop with a failing exit status. The bare `exit()` helper comes from the
    # `site` module (meant for interactive use) and would report success.
    raise SystemExit(1) from None
else:
    print(f"Dataset loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")

# Display basic info about the dataset
print("\nDataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Data cleaning
print("\nData Cleaning:")
print("Missing values before cleaning:")
print(df.isnull().sum())

# Keep only rows that:
#   - have both a CustomerID and a Description,
#   - have a positive Quantity (drops cancelled/returned lines),
#   - have a positive UnitPrice (drops invalid prices).
valid_rows = (
    df['CustomerID'].notna()
    & df['Description'].notna()
    & (df['Quantity'] > 0)
    & (df['UnitPrice'] > 0)
)

# Take an explicit copy: columns are assigned on df_clean further down, and
# assigning into a filtered slice of `df` is chained assignment (the resulting
# SettingWithCopyWarning is hidden by the global warnings filter).
df_clean = df.loc[valid_rows].copy()

print(f"\nAfter cleaning: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")
print(f"Unique customers: {df_clean['CustomerID'].nunique()}")
print(f"Unique products: {df_clean['Description'].nunique()}")

# Step 1 Visualizations: Dataset Exploration
print("\nCreating exploratory visualizations...")

# Derive the calendar month of every line item from its invoice timestamp.
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])
df_clean['Month'] = df_clean['InvoiceDate'].dt.month

# Data for the two panels: the 15 most frequent product descriptions and the
# number of distinct invoices per month.
top_items = df_clean['Description'].value_counts().head(15)
monthly_transactions = df_clean.groupby('Month')['InvoiceNo'].nunique()

# Draw both panels side by side on one figure.
fig, (ax_items, ax_months) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: most frequently purchased items
sns.barplot(x=top_items.values, y=top_items.index, ax=ax_items)
ax_items.set_title('Top 15 Most Frequently Purchased Items')
ax_items.set_xlabel('Frequency')

# Right panel: transaction distribution by month
sns.barplot(x=monthly_transactions.index, y=monthly_transactions.values, ax=ax_months)
ax_months.set_title('Number of Transactions by Month')
ax_months.set_xlabel('Month')
ax_months.set_ylabel('Number of Transactions')

fig.tight_layout()
plt.show()

# Prepare transaction data for market basket analysis
print("\nPreparing transaction data...")

# Filter to top N most popular items to reduce memory usage
top_n_items = 150
print(f"Filtering to top {top_n_items} most popular items for memory optimization...")
top_items = df_clean['Description'].value_counts().head(top_n_items).index.tolist()
df_filtered = df_clean[df_clean['Description'].isin(top_items)]

print(f"Filtered dataset: {df_filtered.shape[0]} rows with {len(top_items)} unique products")

# Group by InvoiceNo to create transaction baskets. The same product can
# appear on several lines of one invoice, so each basket is de-duplicated
# (first-seen order kept); otherwise a basket holding one product twice would
# wrongly pass the "2+ items" filter below.
transactions = (
    df_filtered.groupby('InvoiceNo')['Description']
    .apply(lambda descriptions: list(dict.fromkeys(descriptions)))
    .tolist()
)

# Keep only baskets with at least 2 distinct items; single-item baskets
# cannot contribute to any association rule.
transactions = [basket for basket in transactions if len(basket) >= 2]
print(f"Number of transactions with 2+ items: {len(transactions)}")

# Convert to one-hot encoded format: one row per basket, one boolean column
# per product.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

print(f"One-hot encoded data shape: {df_encoded.shape}")
print("Sample of encoded data:")
print(df_encoded.head())

# Create item co-occurrence heatmap for top items
print("\nCreating item co-occurrence heatmap...")
top_20_items = df_filtered['Description'].value_counts().head(20).index
# A popular item may be absent from the encoded baskets if it only ever
# appeared in baskets that were filtered out; skip those to avoid a KeyError.
top_20_items = top_20_items[top_20_items.isin(df_encoded.columns)]

# The encoded frame is boolean. A boolean dot product yields True/False
# ("ever bought together") rather than counts, so cast to integers first.
df_top_items = df_encoded[top_20_items].astype(int)

# Calculate co-occurrence matrix: entry (i, j) is the number of baskets that
# contain both item i and item j.
cooccurrence_counts = np.array(df_top_items.T.dot(df_top_items))
# Zero the diagonal (an item with itself) on our own array instead of writing
# through DataFrame.values, which may be a copy or read-only.
np.fill_diagonal(cooccurrence_counts, 0)
cooccurrence_matrix = pd.DataFrame(
    cooccurrence_counts, index=top_20_items, columns=top_20_items
)

# Shorten long product names so the tick labels stay readable.
short_labels = [item[:30] + '...' if len(item) > 30 else item for item in top_20_items]

plt.figure(figsize=(12, 10))
sns.heatmap(cooccurrence_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=short_labels,
            yticklabels=short_labels)
plt.title('Item Co-occurrence Matrix (Top 20 Items)')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Step 2: Frequent Itemset Mining Using Apriori
print("\n2. FREQUENT ITEMSET MINING USING APRIORI")
print("-" * 45)

# Set support threshold (higher due to filtered dataset)
min_support = 0.02
print(f"Using minimum support threshold: {min_support}")
print(f"This means itemsets must appear in at least {int(min_support * len(df_encoded))} transactions")

# Apply Apriori algorithm and record its wall-clock duration
print("Running Apriori algorithm...")
start_time = time.time()
frequent_itemsets_apriori = apriori(df_encoded, min_support=min_support, use_colnames=True)
apriori_time = time.time() - start_time

print(f"Apriori execution time: {apriori_time:.4f} seconds")
print(f"Number of frequent itemsets found: {len(frequent_itemsets_apriori)}")

# Record the size of each itemset and show how many itemsets exist per size
frequent_itemsets_apriori['length'] = frequent_itemsets_apriori['itemsets'].apply(len)
print("\nFrequent itemsets by length:")
print(frequent_itemsets_apriori['length'].value_counts().sort_index())

# Show top 10 frequent itemsets. Itemsets are frozensets with no stable
# iteration order, so members are sorted for reproducible output.
print("\nTop 10 frequent itemsets (by support):")
top_itemsets_apriori = frequent_itemsets_apriori.nlargest(10, 'support')
for support, itemset in zip(top_itemsets_apriori['support'], top_itemsets_apriori['itemsets']):
    items = ', '.join(sorted(itemset))
    print(f"Support: {support:.4f} | Items: {items}")

# Visualization: Top frequent itemsets
plt.figure(figsize=(12, 6))
top_15_itemsets = frequent_itemsets_apriori.nlargest(15, 'support')
# Label each bar with (at most) the first two items of its itemset.
itemset_labels = [', '.join(sorted(itemset)[:2]) + ('...' if len(itemset) > 2 else '')
                  for itemset in top_15_itemsets['itemsets']]

# Both axes are numeric here, so seaborn would default to vertical bars with
# the support values as categories; force horizontal bars so that row i is
# the i-th itemset and its length is the support.
sns.barplot(data=top_15_itemsets.reset_index(), y=range(len(top_15_itemsets)), x='support',
            orient='h')
plt.yticks(range(len(top_15_itemsets)), itemset_labels)
plt.title('Top 15 Frequent Itemsets (Apriori)')
plt.xlabel('Support')
plt.ylabel('Itemsets')
plt.tight_layout()
plt.show()

# Step 3: Frequent Itemset Mining Using FP-Growth
print("\n3. FREQUENT ITEMSET MINING USING FP-GROWTH")
print("-" * 47)

# Apply FP-Growth algorithm with same support threshold and record its
# wall-clock duration
print("Running FP-Growth algorithm...")
start_time = time.time()
frequent_itemsets_fpgrowth = fpgrowth(df_encoded, min_support=min_support, use_colnames=True)
fpgrowth_time = time.time() - start_time

print(f"FP-Growth execution time: {fpgrowth_time:.4f} seconds")
print(f"Number of frequent itemsets found: {len(frequent_itemsets_fpgrowth)}")

# Record the size of each itemset and show how many itemsets exist per size
frequent_itemsets_fpgrowth['length'] = frequent_itemsets_fpgrowth['itemsets'].apply(len)
print("\nFrequent itemsets by length:")
print(frequent_itemsets_fpgrowth['length'].value_counts().sort_index())

# Show top 10 frequent itemsets. Itemsets are frozensets with no stable
# iteration order, so members are sorted for reproducible output.
print("\nTop 10 frequent itemsets (by support):")
top_itemsets_fpgrowth = frequent_itemsets_fpgrowth.nlargest(10, 'support')
for support, itemset in zip(top_itemsets_fpgrowth['support'], top_itemsets_fpgrowth['itemsets']):
    items = ', '.join(sorted(itemset))
    print(f"Support: {support:.4f} | Items: {items}")

# Visualization: Top frequent itemsets from FP-Growth
plt.figure(figsize=(12, 6))
top_15_itemsets_fp = frequent_itemsets_fpgrowth.nlargest(15, 'support')
# Label each bar with (at most) the first two items of its itemset.
itemset_labels_fp = [', '.join(sorted(itemset)[:2]) + ('...' if len(itemset) > 2 else '')
                     for itemset in top_15_itemsets_fp['itemsets']]

# Both axes are numeric here, so seaborn would default to vertical bars with
# the support values as categories; force horizontal bars so that row i is
# the i-th itemset and its length is the support.
sns.barplot(data=top_15_itemsets_fp.reset_index(), y=range(len(top_15_itemsets_fp)), x='support',
            orient='h')
plt.yticks(range(len(top_15_itemsets_fp)), itemset_labels_fp)
plt.title('Top 15 Frequent Itemsets (FP-Growth)')
plt.xlabel('Support')
plt.ylabel('Itemsets')
plt.tight_layout()
plt.show()

# Step 4: Generating and Analyzing Association Rules
print("\n4. GENERATING AND ANALYZING ASSOCIATION RULES")
print("-" * 48)

# Set confidence threshold
min_confidence = 0.2
print(f"Using minimum confidence threshold: {min_confidence}")


def _mine_rules(frequent_itemsets, threshold):
    """Return rules with confidence >= threshold, or an empty frame.

    association_rules() raises ValueError when given an empty itemset frame;
    returning an empty rule table instead lets the report continue.
    """
    if len(frequent_itemsets) == 0:
        return pd.DataFrame(
            columns=['antecedents', 'consequents', 'support', 'confidence', 'lift']
        )
    return association_rules(frequent_itemsets, metric="confidence",
                             min_threshold=threshold)


def _print_top_rules(rules, algorithm, metric, count=10):
    """Print the `count` best rules ranked by `metric`; return that subset.

    Nothing is printed when `rules` is empty, in which case an empty frame
    is returned.
    """
    if len(rules) == 0:
        return rules.head(0)

    print(f"Number of association rules ({algorithm}): {len(rules)}")
    print(f"\nTop {count} association rules by {metric} ({algorithm}):")
    best = rules.nlargest(count, metric)
    for _, row in best.iterrows():
        # Rule sides are frozensets; sort so the printed order is stable.
        antecedents = ', '.join(sorted(row['antecedents']))
        consequents = ', '.join(sorted(row['consequents']))
        print(f"Rule: {antecedents} → {consequents}")
        print(f"  Support: {row['support']:.4f}, Confidence: {row['confidence']:.4f}, Lift: {row['lift']:.4f}")
        print()
    return best


# Generate association rules from Apriori results, ranked by confidence
print("\nGenerating association rules from Apriori results...")
rules_apriori = _mine_rules(frequent_itemsets_apriori, min_confidence)
top_rules_apriori = _print_top_rules(rules_apriori, 'Apriori', 'confidence')

# Generate association rules from FP-Growth results, ranked by lift
print("Generating association rules from FP-Growth results...")
rules_fpgrowth = _mine_rules(frequent_itemsets_fpgrowth, min_confidence)
top_rules_fpgrowth = _print_top_rules(rules_fpgrowth, 'FP-Growth', 'lift')

# Visualization: Confidence vs Lift scatter plot.
# Each algorithm gets its own panel and is drawn whenever it produced rules,
# independently of the other algorithm.
rule_sets = [('Apriori', rules_apriori), ('FP-Growth', rules_fpgrowth)]
if any(len(rules) > 0 for _, rules in rule_sets):
    plt.figure(figsize=(12, 8))

    for panel, (algorithm, rules) in enumerate(rule_sets, start=1):
        if len(rules) == 0:
            continue
        plt.subplot(1, 2, panel)
        # Marker area scales with rule support.
        plt.scatter(rules['confidence'], rules['lift'], alpha=0.6, s=rules['support']*1000)
        plt.xlabel('Confidence')
        plt.ylabel('Lift')
        plt.title(f'Association Rules: Confidence vs Lift ({algorithm})\nBubble size = Support')
        # Reference line at lift = 1.
        plt.axhline(y=1, color='red', linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()

# Step 5: Comparative Analysis
print("\n5. COMPARATIVE ANALYSIS")
print("-" * 24)

print("Performance Comparison:")
print(f"Apriori execution time: {apriori_time:.4f} seconds")
print(f"FP-Growth execution time: {fpgrowth_time:.4f} seconds")
# A run can finish below the timer's resolution and be measured as 0 s, so
# guard the ratio; also credit whichever algorithm was actually faster.
if apriori_time > 0 and fpgrowth_time > 0:
    if apriori_time >= fpgrowth_time:
        print(f"Speed improvement: {apriori_time/fpgrowth_time:.2f}x faster with FP-Growth")
    else:
        print(f"Speed improvement: {fpgrowth_time/apriori_time:.2f}x faster with Apriori")
else:
    print("Speed improvement: n/a (a run finished below the timer's resolution)")

print("\nResults Comparison:")
print("Frequent itemsets found:")
print(f"  Apriori: {len(frequent_itemsets_apriori)}")
print(f"  FP-Growth: {len(frequent_itemsets_fpgrowth)}")

if len(rules_apriori) > 0 and len(rules_fpgrowth) > 0:
    print("Association rules generated:")
    print(f"  Apriori: {len(rules_apriori)}")
    print(f"  FP-Growth: {len(rules_fpgrowth)}")

# Algorithm efficiency visualization
plt.figure(figsize=(10, 6))

# Execution time comparison
plt.subplot(1, 2, 1)
algorithms = ['Apriori', 'FP-Growth']
times = [apriori_time, fpgrowth_time]
colors = ['skyblue', 'lightcoral']
bars = plt.bar(algorithms, times, color=colors)
plt.ylabel('Execution Time (seconds)')
plt.title('Algorithm Performance Comparison')
# The loop variable must not be called `time`: that would rebind the imported
# `time` module and break any later time.time() call.
for bar, elapsed in zip(bars, times):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
             f'{elapsed:.4f}s', ha='center', va='bottom')

# Itemsets found comparison
plt.subplot(1, 2, 2)
itemset_counts = [len(frequent_itemsets_apriori), len(frequent_itemsets_fpgrowth)]
bars = plt.bar(algorithms, itemset_counts, color=colors)
plt.ylabel('Number of Frequent Itemsets')
plt.title('Frequent Itemsets Found')
for bar, count in zip(bars, itemset_counts):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
             str(count), ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Closing banner followed by a checklist of the lab steps covered above.
summary_rule = "=" * 50
print(f"\n{summary_rule}")
print("LAB COMPLETION SUMMARY")
print(summary_rule)

for completed_step in (
    "Data preparation and cleaning completed",
    "Exploratory data analysis with visualizations",
    "Apriori algorithm implementation",
    "FP-Growth algorithm implementation",
    "Association rule generation and analysis",
    "Comparative analysis between algorithms",
    "Performance and efficiency evaluation",
):
    print(completed_step)



