In [44]:
import mlxtend

In [45]:
import pandas as pd
from mlxtend.frequent_patterns import apriori,association_rules
from mlxtend.preprocessing import TransactionEncoder

In [56]:
df = pd.read_csv('/content/Online retail.csv')

In [57]:
df.shape

(7500, 1)

In [58]:
df.head()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [60]:
# Convert Dataset in into list transaction
transactions = df.iloc[:,0].str.split(',')
transaction_list = transactions.tolist()

In [61]:
#One hot encoding
TE = TransactionEncoder()
TE_ary = TE.fit(transaction_list).transform(transaction_list)
df = pd.DataFrame(TE_ary,columns = TE.columns_)

In [62]:
df.isna().sum()

Unnamed: 0,0
asparagus,0
almonds,0
antioxydant juice,0
asparagus,0
avocado,0
...,...
whole wheat pasta,0
whole wheat rice,0
yams,0
yogurt cake,0


In [63]:
df.duplicated().sum()

np.int64(2347)

In [65]:
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
7495,False
7496,False
7497,True
7498,True


In [52]:
#Applying Apriori Algorithm
frequent_itemsets = apriori(df,min_support=0.01,use_colnames= True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.020267,(almonds)
1,0.033200,(avocado)
2,0.010800,(barbecue sauce)
3,0.014267,(black tea)
4,0.011467,(body spray)
...,...,...
254,0.011067,"(milk, ground beef, mineral water)"
255,0.017067,"(spaghetti, ground beef, mineral water)"
256,0.015733,"(milk, spaghetti, mineral water)"
257,0.010267,"(olive oil, spaghetti, mineral water)"


In [53]:
#Generate Association Rules
rules = association_rules(frequent_itemsets,metric='lift',min_threshold = 1)
rules.sort_values('lift',ascending=False)[0:20]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
217,(ground beef),(herb & pepper),0.098267,0.049467,0.016,0.162822,3.291555,1.0,0.011139,1.135402,0.77206,0.121457,0.119255,0.243136
216,(herb & pepper),(ground beef),0.049467,0.098267,0.016,0.32345,3.291555,1.0,0.011139,1.332841,0.732423,0.121457,0.249723,0.243136
385,"(spaghetti, mineral water)",(ground beef),0.059733,0.098267,0.017067,0.285714,2.90754,1.0,0.011197,1.262427,0.697745,0.121097,0.207875,0.229696
388,(ground beef),"(spaghetti, mineral water)",0.098267,0.059733,0.017067,0.173677,2.90754,1.0,0.011197,1.137893,0.727562,0.121097,0.121182,0.229696
399,(olive oil),"(spaghetti, mineral water)",0.065733,0.059733,0.010267,0.156187,2.614731,1.0,0.00634,1.114306,0.661001,0.08912,0.102581,0.164031
398,"(spaghetti, mineral water)",(olive oil),0.059733,0.065733,0.010267,0.171875,2.614731,1.0,0.00634,1.128171,0.656783,0.08912,0.11361,0.164031
195,(tomatoes),(frozen vegetables),0.0684,0.095333,0.016133,0.235867,2.474134,1.0,0.009613,1.183913,0.639564,0.109304,0.155344,0.202549
194,(frozen vegetables),(tomatoes),0.095333,0.0684,0.016133,0.169231,2.474134,1.0,0.009613,1.12137,0.658605,0.109304,0.108234,0.202549
190,(frozen vegetables),(shrimp),0.095333,0.071333,0.016667,0.174825,2.45082,1.0,0.009866,1.125418,0.654355,0.111111,0.111441,0.204235
191,(shrimp),(frozen vegetables),0.071333,0.095333,0.016667,0.233645,2.45082,1.0,0.009866,1.18048,0.637444,0.111111,0.152887,0.204235


# 1.	What is lift and why is it important in Association rules?
**Lift:**

- Lift measures the strength of a relationship between two items.

- It shows how much more likely items are to appear together than if they were independent.

- A lift value greater than 1 indicates a strong positive association, while less than 1 suggests a weak or negative relationship.

# 2.	What is support and Confidence. How do you calculate them?
**Support & Confidence:**

- Support: It represents how frequently an item or itemset appears in the dataset. A higher support value means the itemset is common.

- Confidence: It indicates how often item Y appears in transactions that already contain item X. A higher confidence value suggests a stronger rule.

# 3.	What are some limitations or challenges of Association rules mining?
**Limitations of Association Rule Mining:**

- High Computation: Processing large datasets can be slow and resource-intensive.

- Too Many Rules: It often generates excessive rules, making analysis difficult.

- Irrelevant Rules: Not all high-confidence rules are meaningful.

- Rare Item Problem: Important but less frequent items may be overlooked.

- Scalability Issues: It struggles with real-time and continuously growing data.