In [1]:
import pandas as pd

# Load the dataset
file_path = '../dataset/DATA602ProjectCleanedNew.csv'
df = pd.read_csv(file_path)

# Keep only the columns the grouping step below reads
needed_columns = ['InvoiceNo', 'Description', 'Country']
df = df[needed_columns]

# Collect each invoice's item descriptions into one list per (Country, InvoiceNo)
descriptions_per_invoice = df.groupby(['Country', 'InvoiceNo'])['Description']
transactions_by_country = descriptions_per_invoice.apply(list).reset_index()

# Function to convert transactions into vertical format
def transactions_to_vertical_format(transactions):
    """Map each item to the set of transaction positions that contain it.

    ``transactions`` is an iterable of item collections. The 0-based position
    of a transaction within that iterable serves as its transaction ID.
    Returns a dict of the form ``{item: {transaction_id, ...}}``; an item that
    appears several times in one transaction is recorded once for it.
    """
    tidsets = {}
    for tid, basket in enumerate(transactions):
        for product in basket:
            # setdefault creates the empty ID set the first time an item is seen
            tidsets.setdefault(product, set()).add(tid)
    return tidsets

# Function to perform Eclat algorithm and find frequent itemsets
def eclat(vertical_format, min_support_count, num_transactions):
    """Find frequent single items and frequent item pairs.

    Args:
        vertical_format: Mapping of item -> set of transaction IDs that
            contain the item.
        min_support_count: Minimum number of transactions an itemset must
            occur in to be kept (compared with ``>=``).
        num_transactions: Total number of transactions; the denominator used
            to turn a support count into a relative support.

    Returns:
        A tuple ``(frequent_itemsets, item_support)``.
        ``frequent_itemsets`` is a list of ``(itemset_tuple, support)`` with
        all frequent 1-item tuples first, followed by the frequent 2-item
        tuples. ``item_support`` maps each frequent single item to its
        relative support.
    """
    from itertools import combinations

    frequent_itemsets = []
    item_support = {}

    # Calculate support for single items
    for item, transaction_ids in vertical_format.items():
        support_count = len(transaction_ids)
        if support_count >= min_support_count:
            support = support_count / num_transactions
            frequent_itemsets.append(((item,), support))
            item_support[item] = support

    # Candidate pairs are built from frequent items only: the transactions
    # shared by two items are a subset of each item's own transactions, so a
    # pair can reach the threshold only if both of its members do. Iterating
    # item_support (insertion-ordered) keeps the pairs in the same relative
    # order as a scan over every item would produce.
    for item_1, item_2 in combinations(item_support, 2):
        # Transactions that contain both items
        shared = vertical_format[item_1].intersection(vertical_format[item_2])
        support_count = len(shared)

        if support_count >= min_support_count:
            support = support_count / num_transactions
            frequent_itemsets.append(((item_1, item_2), support))

    return frequent_itemsets, item_support

# Combine cross-sell recommendations for all countries
cross_sell_recommendations_all_countries = []

# Minimum support: an itemset must occur in at least 2% of a country's transactions
min_support = 0.02

# Process each country separately. A single groupby pass replaces re-filtering
# the whole frame once per country; sort=False keeps the countries in their
# order of first appearance.
for country, country_rows in transactions_by_country.groupby('Country', sort=False):
    country_transactions = country_rows['Description'].tolist()

    # Convert to vertical format
    vertical_format = transactions_to_vertical_format(country_transactions)

    # Calculate the total number of transactions
    num_transactions = len(country_transactions)

    # NOTE(review): with fewer than 50 transactions this threshold is below 1,
    # so every pair that co-occurs even once is kept - confirm this is intended.
    min_support_count = min_support * num_transactions

    # Get frequent itemsets and their support for single items
    frequent_itemsets, item_support = eclat(vertical_format, min_support_count, num_transactions)

    # Generate cross-sell recommendations with confidence and lift
    for itemset, support in frequent_itemsets:
        if len(itemset) != 2:
            continue

        item_1, item_2 = itemset
        support_item_1 = item_support[item_1]
        support_item_2 = item_support[item_2]

        # Lift is symmetric in the two items, so one value serves both directions
        lift = support / (support_item_1 * support_item_2)

        # Emit the rule in both directions; only the confidence differs
        for antecedent, consequent, antecedent_support in (
            (item_1, item_2, support_item_1),
            (item_2, item_1, support_item_2),
        ):
            cross_sell_recommendations_all_countries.append({
                "Country": country,
                "Antecedent": antecedent,
                "Consequent": consequent,
                "Support": support,
                "Confidence": support / antecedent_support,
                "Lift": lift
            })

# Explicit columns keep the sort below from raising KeyError when no rule was found
cross_sell_df_final = pd.DataFrame(
    cross_sell_recommendations_all_countries,
    columns=["Country", "Antecedent", "Consequent", "Support", "Confidence", "Lift"],
)
cross_sell_df_final_sorted = cross_sell_df_final.sort_values(by='Support', ascending=False)
cross_sell_df_final_sorted

Unnamed: 0,Country,Antecedent,Consequent,Support,Confidence,Lift
1244,Germany,ROUND SNACK BOXES SET OF 4 FRUITS,ROUND SNACK BOXES SET OF4 WOODLAND,0.123223,0.787879,3.228008
1245,Germany,ROUND SNACK BOXES SET OF4 WOODLAND,ROUND SNACK BOXES SET OF 4 FRUITS,0.123223,0.504854,3.228008
30,EIRE,GREEN REGENCY TEACUP AND SAUCER,ROSES REGENCY TEACUP AND SAUCER,0.112069,0.962963,7.446914
31,EIRE,ROSES REGENCY TEACUP AND SAUCER,GREEN REGENCY TEACUP AND SAUCER,0.112069,0.866667,7.446914
83,EIRE,PINK REGENCY TEACUP AND SAUCER,ROSES REGENCY TEACUP AND SAUCER,0.090517,0.954545,7.381818
...,...,...,...,...,...,...
1533,United Kingdom,LUNCH BAG SPACEBOY DESIGN,LUNCH BAG DOLLY GIRL DESIGN,0.020653,0.356308,10.317574
1532,United Kingdom,LUNCH BAG DOLLY GIRL DESIGN,LUNCH BAG SPACEBOY DESIGN,0.020653,0.598039,10.317574
1547,United Kingdom,LUNCH BAG CARS BLUE,LUNCH BAG WOODLAND,0.020653,0.361803,8.182401
1555,United Kingdom,JUMBO BAG STRAWBERRY,JUMBO BAG RED RETROSPOT,0.020585,0.569288,7.106721
