In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Read the basket data; header=None tells pandas to treat the first line as data,
# so the columns get integer labels instead of names
df = pd.read_csv(filepath_or_buffer='Market_Basket_Optimisation.csv', header=None)

# Preview the first five rows
df.head(n=5)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [2]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7501 entries, 0 to 7500
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       7501 non-null   object
 1   1       5747 non-null   object
 2   2       4389 non-null   object
 3   3       3345 non-null   object
 4   4       2529 non-null   object
 5   5       1864 non-null   object
 6   6       1369 non-null   object
 7   7       981 non-null    object
 8   8       654 non-null    object
 9   9       395 non-null    object
 10  10      256 non-null    object
 11  11      154 non-null    object
 12  12      87 non-null     object
 13  13      47 non-null     object
 14  14      25 non-null     object
 15  15      8 non-null      object
 16  16      4 non-null      object
 17  17      4 non-null      object
 18  18      3 non-null      object
 19  19      1 non-null      object
dtypes: object(20)
memory usage: 1.1+ MB


In [3]:
# Replace every missing cell (NaN) with the placeholder 0, modifying df itself
df.fillna(value=0, inplace=True)
# Show the first five rows to confirm the empty cells now hold 0
df.head(n=5)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,chutney,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,turkey,avocado,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,mineral water,milk,energy bar,whole wheat rice,green tea,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


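    This code fills the missing values in the dataset with 0 using the fillna() function. Rows are padded with missing values because baskets contain different numbers of items; 0 acts as a placeholder that is filtered out again when the transactions list is built.
    inplace=True modifies the original DataFrame rather than returning a new one.
    df.head() displays the first few rows of the modified DataFrame after filling missing values.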

In [4]:
# Create a list of transactions: one list of item names (as strings) per row of df.
# Cells whose string form is '0' are skipped -- that is the placeholder written by the
# earlier fillna(0) step (an item literally named '0' would be dropped as well).
# Iterating over the rows of a single to_numpy() snapshot avoids re-evaluating
# df.values for every cell and removes the hard-coded assumption of exactly 20 columns.
transactions = [
    [str(item) for item in row if str(item) != '0']
    for row in df.to_numpy()
]

In [5]:
# Display the first transaction (the item names taken from row 0 of df)
transactions[0]

['shrimp',
 'almonds',
 'avocado',
 'vegetables mix',
 'green grapes',
 'whole weat flour',
 'yams',
 'cottage cheese',
 'energy drink',
 'tomato juice',
 'low fat yogurt',
 'green tea',
 'honey',
 'salad',
 'mineral water',
 'salmon',
 'antioxydant juice',
 'frozen smoothie',
 'spinach',
 'olive oil']

In [6]:
# Count the transactions (the list holds one entry per row of df)
len(transactions)

7501

# Training the Apriori on the Dataset

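- `min_support=0.003`: an itemset must appear in at least 0.3% of all transactions (3 out of every 1000, i.e. at least 23 of the 7501 baskets) to be considered.
- `min_confidence=0.2`: for a rule A -> B, at least 20% of the transactions that contain item A must also contain item B.
- `min_lift=3`: item B must be at least 3 times more likely to appear in transactions that contain A than in transactions overall.
- `max_length=3`: the maximum number of items in an itemset (antecedent and consequent combined).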

In [7]:
# Import the Apriori algorithm from the apyori library
from apyori import apriori

# Mine association rules from the transactions.
# The four thresholds below are the knobs to adjust for a different analysis.
rules = apriori(
    transactions,
    min_support=0.003,
    min_confidence=0.2,
    min_lift=3,
    max_length=3,
)

# Materialise the rules into a list so they can be inspected and saved
results = list(rules)

# Persist both the transactions and the mined rules as CSV files
# (to_csv is called with its defaults, so each file also gets an index column and a header row)
df1 = pd.DataFrame(transactions)
df2 = pd.DataFrame(results)
df1.to_csv("transactions.csv")
df2.to_csv("results.csv")


In [8]:
# Load the saved rules back from disk. header=None makes pandas treat the file's
# header line as an ordinary data row, so it shows up as row 0 below.
# NOTE: this rebinds `results` to a DataFrame.
results = pd.read_csv(filepath_or_buffer='results.csv', header=None)

# Preview the first 10 rows
results.head(n=10)


Unnamed: 0,0,1,2,3
0,,items,support,ordered_statistics
1,0.0,"frozenset({'light cream', 'chicken'})",0.004532728969470737,[OrderedStatistic(items_base=frozenset({'light...
2,1.0,"frozenset({'mushroom cream sauce', 'escalope'})",0.005732568990801226,[OrderedStatistic(items_base=frozenset({'mushr...
3,2.0,"frozenset({'pasta', 'escalope'})",0.005865884548726837,[OrderedStatistic(items_base=frozenset({'pasta...
4,3.0,"frozenset({'fromage blanc', 'honey'})",0.003332888948140248,[OrderedStatistic(items_base=frozenset({'froma...
5,4.0,"frozenset({'herb & pepper', 'ground beef'})",0.015997866951073192,[OrderedStatistic(items_base=frozenset({'herb ...
6,5.0,"frozenset({'tomato sauce', 'ground beef'})",0.005332622317024397,[OrderedStatistic(items_base=frozenset({'tomat...
7,6.0,"frozenset({'olive oil', 'light cream'})",0.003199573390214638,[OrderedStatistic(items_base=frozenset({'light...
8,7.0,"frozenset({'olive oil', 'whole wheat pasta'})",0.007998933475536596,[OrderedStatistic(items_base=frozenset({'whole...
9,8.0,"frozenset({'shrimp', 'pasta'})",0.005065991201173177,[OrderedStatistic(items_base=frozenset({'pasta...


In [9]:
# Drop the first row: it is the CSV header line, which read_csv(header=None) loaded as data.
resultz = results.iloc[1:, :]
# Sort by support (column 2), highest first. Note this is NOT lift: the lift values are
# embedded in the ordered_statistics text of column 3. Because the header line was read
# as data, column 2 is loaded as text, so the key converts it to float to get a numeric
# rather than lexicographic ordering. The displayed values themselves are left unchanged.
resultz.sort_values(by=[2], ascending=False, key=lambda col: col.astype(float))


Unnamed: 0,0,1,2,3
5,4.0,"frozenset({'herb & pepper', 'ground beef'})",0.0159978669510731,[OrderedStatistic(items_base=frozenset({'herb ...
27,26.0,"frozenset({'frozen vegetables', 'spaghetti', '...",0.0086655112651646,[OrderedStatistic(items_base=frozenset({'froze...
8,7.0,"frozenset({'olive oil', 'whole wheat pasta'})",0.0079989334755365,[OrderedStatistic(items_base=frozenset({'whole...
31,30.0,"frozenset({'frozen vegetables', 'shrimp', 'min...",0.0071990401279829,[OrderedStatistic(items_base=frozenset({'shrim...
49,48.0,"frozenset({'olive oil', 'spaghetti', 'milk'})",0.0071990401279829,[OrderedStatistic(items_base=frozenset({'spagh...
39,38.0,"frozenset({'mineral water', 'herb & pepper', '...",0.0066657778962804,[OrderedStatistic(items_base=frozenset({'miner...
35,34.0,"frozenset({'tomatoes', 'frozen vegetables', 's...",0.0066657778962804,[OrderedStatistic(items_base=frozenset({'froze...
40,39.0,"frozenset({'spaghetti', 'herb & pepper', 'grou...",0.0063991467804292,[OrderedStatistic(items_base=frozenset({'spagh...
33,32.0,"frozenset({'frozen vegetables', 'shrimp', 'spa...",0.0059992001066524,[OrderedStatistic(items_base=frozenset({'froze...
44,43.0,"frozenset({'spaghetti', 'shrimp', 'ground beef'})",0.0059992001066524,[OrderedStatistic(items_base=frozenset({'shrim...


# Explanation


The provided data shows association rules discovered by applying the Apriori algorithm to a dataset of transactions. Let's break down the information:

    Items: This column shows the combination of items that are found together in transactions. Each row represents a specific combination of items.

    Support: Support is a measure of how frequently an itemset appears in the dataset. It indicates the proportion of transactions in which the itemset appears.

    Ordered_Statistics: This column contains detailed information about the association rule, including:
        Items_Base: The antecedent (or left-hand side) of the association rule.
        Items_Add: The consequent (or right-hand side) of the association rule.
        Confidence: Confidence is a measure of the reliability of the rule. It indicates the likelihood that the consequent appears in a transaction given that the antecedent appears.
        Lift: Lift measures how much more likely the consequent is to be found in transactions containing the antecedent compared to its expected frequency. A lift greater than 1 indicates that the antecedent and consequent appear together more often than expected by chance.

Let's interpret the first association rule:

    Items: 'light cream' and 'chicken'
    Support: 0.0045
    Ordered_Statistics: Confidence=0.2906, Lift=4.8439

Interpretation:

    The association rule indicates that there is a support of 0.0045 for transactions containing both 'light cream' and 'chicken'.
    The confidence of the rule is 0.2906, which means that 29.06% of transactions containing 'light cream' also contain 'chicken'.
    The lift of the rule is 4.8439, which means that 'chicken' is approximately 4.84 times more likely to appear in a transaction that contains 'light cream' than in a transaction chosen at random (lift = confidence / support of 'chicken'). This indicates a strong positive association between 'light cream' and 'chicken'.




# The same rule written as antecedent -> consequent

    Association Rule: {light cream} -> {chicken}
    Support: 0.0045
    Confidence: 0.2906
    Lift: 4.8439

Interpretation:

    The association rule indicates that there is a support of 0.0045 for transactions containing both 'light cream' and 'chicken'.
    The confidence of the rule is 0.2906, which means that 29.06% of transactions containing 'light cream' also contain 'chicken'.
    The lift of the rule is 4.8439, which means that 'chicken' is approximately 4.84 times more likely to appear in a transaction that contains 'light cream' than in a transaction chosen at random (lift = confidence / support of 'chicken'). This indicates a strong positive association between 'light cream' and 'chicken'.

# How to choose the best association?

High Support: Look for association rules with high support values. High support indicates that the itemset occurs frequently in the dataset, making the rule more reliable.

High Confidence: Focus on association rules with high confidence values. High confidence indicates that the consequent is likely to occur when the antecedent is present. Rules with high confidence are more reliable and have a stronger predictive power.

Significant Lift: Look for association rules with lift values significantly greater than 1. A lift greater than 1 indicates that the antecedent and consequent occur together more frequently than expected by chance. Significant lift values suggest a meaningful association between the items.