In [3]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


def run_apriori(data, min_support=0.05, min_confidence=0.2, min_lift=2.0, min_length=2, max_length=3):
    """
    Runs the Apriori algorithm on the specified DataFrame of transaction data and returns a DataFrame of association rules
    that satisfy the minimum support, minimum confidence, minimum lift, minimum length, and maximum length criteria.

    Parameters:
    data (pandas.DataFrame): The DataFrame of transaction data to analyze.
    min_support (float): The minimum support threshold for frequent itemset mining. Default is 0.05.
    min_confidence (float): The minimum confidence threshold for association rule generation. Default is 0.2.
    min_lift (float): The minimum lift threshold for association rule generation. Default is 2.0.
    min_length (int): The minimum length of the antecedent and consequent itemsets. Default is 2.
    max_length (int): The maximum length of the antecedent and consequent itemsets. Default is 3.

    Returns:
    pandas.DataFrame: A DataFrame of association rules that satisfy the specified criteria.

    Raises:
    TypeError: If the data parameter is not a pandas DataFrame.
    ValueError: If the min_support, min_confidence, or min_lift parameter is not a float between 0 and 1, or if the
                min_length or max_length parameter is not a positive integer.
    """

    # Validate input parameters
    if not isinstance(data, pd.DataFrame):
        raise TypeError("data parameter must be a pandas DataFrame")

    if not isinstance(min_support, float) or not 0 <= min_support <= 1:
        raise ValueError("min_support parameter must be a float between 0 and 1")

    if not isinstance(min_confidence, float) or not 0 <= min_confidence <= 1:
        raise ValueError("min_confidence parameter must be a float between 0 and 1")

    if not isinstance(min_lift, float) or not 0 <= min_lift:
        raise ValueError("min_lift parameter must be a float greater than or equal to 0")

    if not isinstance(min_length, int) or min_length <= 0:
        raise ValueError("min_length parameter must be a positive integer")

    if not isinstance(max_length, int) or max_length <= 0:
        raise ValueError("max_length parameter must be a positive integer")

    # Convert data to a list of transactions
    transactions = [[str(data.values[i,j]) for j in range(data.shape[1])] for i in range(data.shape[0])]

    # Convert transactions to a binary-encoded DataFrame
    te = TransactionEncoder()
    te_ary = te.fit_transform(transactions)
    df_transactions = pd.DataFrame(te_ary, columns=te.columns_)

    # Run Apriori algorithm
    frequent_itemsets = apriori(df_transactions, min_support=min_support, max_len=max_length, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=min_lift)
    rules = rules[(rules['confidence'] >= min_confidence) & (rules['lift'] >= min_lift)]
    rules = rules.sort_values(by=['lift'], ascending=[False])

    # Convert antecedents and consequents to lists
    rules['antecedents'] = rules['antecedents'].map(list)
    rules['consequents'] = rules['consequents'].map(list)

    # Organize results into a DataFrame
    results = pd.DataFrame({
        'Antecedent': rules['antecedents'],
        'Consequent': rules['consequents'],
        'Support': rules['support'],
        'Confidence': rules['confidence'],
        'Lift': rules['lift']
    })

    return results

In [4]:
df = pd.read_csv('/home/young78703/Data_Science_Project/data/Market_Basket_Optimisation.csv',header=None)

In [5]:
run_apriori(df, min_support=0.01, min_confidence=0.3, min_lift=2.0, min_length=2, max_length=3)

Unnamed: 0,Antecedent,Consequent,Support,Confidence,Lift
7,[herb & pepper],[ground beef],0.015998,0.32345,3.291994
47,"[nan, herb & pepper]",[ground beef],0.015998,0.32345,3.291994
49,[herb & pepper],"[ground beef, nan]",0.015998,0.32345,3.291994
56,"[ground beef, mineral water]",[spaghetti],0.017064,0.416938,2.394681
29,"[mineral water, frozen vegetables]",[milk],0.011065,0.309701,2.389991
13,[soup],[milk],0.015198,0.300792,2.321232
73,"[soup, nan]",[milk],0.015198,0.300792,2.321232
75,[soup],"[milk, nan]",0.015198,0.300792,2.321232
10,[ground beef],[spaghetti],0.039195,0.398915,2.291162
64,"[ground beef, nan]",[spaghetti],0.039195,0.398915,2.291162


This is the output of the `run_apriori` function, which returns a Pandas DataFrame containing the association rules that satisfy the specified minimum support, minimum confidence, minimum lift, minimum length, and maximum length criteria.

Each row of the DataFrame represents an association rule, and the columns represent the antecedent, consequent, support, confidence, and lift values for each rule.

- The `Antecedent` column contains the items that appear before the arrow (->) in the association rule.
- The `Consequent` column contains the items that appear after the arrow (->) in the association rule.
- The `Support` column contains the proportion of transactions that contain both the antecedent and consequent items.
- The `Confidence` column contains the proportion of transactions that contain both the antecedent and consequent items, out of the transactions that contain the antecedent items.
- The `Lift` column represents the ratio of the observed support to the expected support if the antecedent and consequent were independent.

For example, the first row of the DataFrame shows that the combination of `herb & pepper` and `nan` appears together in 1.6% of the transactions, and that 32.3% of the transactions that contain `herb & pepper` and `nan` also contain `ground beef`. The lift value of 3.291994 indicates that the occurrence of `ground beef` is 3.29 times more likely when `herb & pepper` and `nan` appear together, compared to when they appear independently.

A brief explanation of how support, confidence, and lift are computed in the Apriori algorithm:

- Support: The support of an itemset is the proportion of transactions in the dataset that contain that itemset. It is computed as the number of transactions containing the itemset divided by the total number of transactions in the dataset. For example, if there are 1000 transactions in the dataset, and the itemset {A, B} appears in 100 of those transactions, then the support of {A, B} is 0.1 or 10%.

- Confidence: The confidence of an association rule A -> B is the proportion of transactions containing A that also contain B. It is computed as the number of transactions containing both A and B divided by the number of transactions containing A. For example, if there are 100 transactions containing A, and 50 of those transactions also contain B, then the confidence of the rule A -> B is 0.5 or 50%.

- Lift: The lift of an association rule A -> B measures the degree of dependence between A and B, and is computed as the ratio of the observed support of A and B to the expected support of A and B if A and B were statistically independent. If the lift is greater than 1, it indicates that the occurrence of A and B together is more likely than would be expected by chance, and suggests a positive correlation between A and B. If the lift is less than 1, it indicates that A and B are less likely to occur together than would be expected by chance, and suggests a negative correlation between A and B. If the lift is equal to 1, it indicates that A and B are statistically independent.