In [38]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [47]:
df=pd.read_excel('Online retail.xlsx',header=None)

In [48]:
transactions=df[0].apply(lambda x: x.split(','))

In [49]:
te=TransactionEncoder()
te_array=te.fit(transactions).transform(transactions)

df_encoded=pd.DataFrame(te_array,columns=te.columns_)

In [50]:
frequent_itemsets=apriori(df_encoded,min_support=0.2,use_colnames=True)

In [51]:
rules=association_rules(frequent_itemsets,metric="lift",min_threshold=1.0)

In [52]:
print(rules[['antecedents','consequents','support','confidence','lift']])

Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [3]:
###Analysis and Interpretation¶

#After applying the Apriori algorithm on the Online Retail dataset, several interesting association rules were discovered.
#Products like mineral water, frozen smoothie, and green tea frequently appear in combination with other healthy products such as salmon, avocado, and vegetables mix.
#One notable rule suggests that customers who purchase green tea and low fat yogurt are also likely to purchase frozen smoothie, with a high confidence and lift value, indicating a strong relationship.
#The lift values greater than 1 in several rules show that these item combinations occur more often than expected by chance, suggesting strong associations.
#Many rules involve combinations of health-conscious items, indicating a segment of health-oriented customers whose purchasing patterns can be targeted for promotions or bundling offers.
#These insights can help retailers in:

#Product placement: placing commonly bought-together items near each other.
#Targeted promotions: bundling frequently associated products.
#Inventory planning: stocking up on items that are often purchased together.

In [1]:
#1. What is lift and why is it important in Association Rules?

#Lift is a measure used to evaluate the performance of an association rule compared to random chance. It tells us how much more likely the occurrence of item B is given item A, relative to the likelihood of B occurring at all.
#Lift(A -> B) = Confidence(A -> B) / Support(B)
#Lift > 1: Positive correlation – the items appear together more than expected.
#Lift = 1: No correlation – the items appear together as expected.
#Lift < 1: Negative correlation – the items appear together less than expected.

In [2]:
#2. What is Support and Confidence? How do you calculate them?

#Support: Measures how frequently an itemset appears in the dataset.
#Support(A -> B) = (Transactions containing A and B) / Total transactions
#It indicates how popular or common the rule is in the dataset.
#Confidence: Measures how often the rule has been found to be true.
#Confidence(A -> B) = (Transactions containing A and B) / (Transactions containing A)
#It shows the reliability of the rule – how likely B is purchased when A is purchased.

#Example:
#If 100 out of 1,000 transactions include both milk and bread, and 200 transactions include milk:
#Support(Milk -> Bread) = 100 / 1000 = 0.10
#Confidence(Milk → Bread) = 100 / 200 = 0.50

In [None]:
#3. What are some limitations or challenges of Association Rules Mining?

#Too many rules: Association rule mining can generate a huge number of rules, many of which may be trivial or irrelevant.
#Rare item problem: Important but infrequent associations may be missed if the minimum support threshold is too high.
#Subjectivity in thresholds: Setting the right support, confidence, and lift thresholds requires domain knowledge and may affect outcomes drastically.
#Scalability: It can be computationally expensive on large datasets due to the combinatorial explosion of itemsets.
#Lack of causality: Association rules do not imply causation, only co-occurrence.
#Redundancy: Many rules may be variations of the same association, leading to redundant information.