In [31]:
import pandas as pd

In [32]:
df = pd.read_excel('Online retail.xlsx',names=['items'])

In [33]:
df.head()

Unnamed: 0,items
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   items   7500 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [35]:
df.shape

(7500, 1)

In [36]:
df[df.duplicated()]

Unnamed: 0,items
33,cookies
41,spaghetti
59,spaghetti
63,"turkey,eggs"
64,french fries
...,...
7490,herb & pepper
7491,"chocolate,escalope"
7494,"pancakes,light mayo"
7497,chicken


In [37]:
df.drop_duplicates(inplace=True)

In [38]:
df.isnull().sum()

items    0
dtype: int64

In [39]:
df['items'].unique()

array(['burgers,meatballs,eggs', 'chutney', 'turkey,avocado', ...,
       'butter,light mayo,fresh bread',
       'burgers,frozen vegetables,eggs,french fries,magazines,green tea',
       'eggs,frozen smoothie,yogurt cake,low fat yogurt'], dtype=object)

In [40]:
# Split the 'items' column into a list of products for each transaction
df['Items'] = df['items'].apply(lambda x: x.split(','))

# Optionally, remove leading and trailing spaces from each product name
df['Items'] = df['Items'].apply(lambda x: [item.strip() for item in x])

df.head()

Unnamed: 0,items,Items
0,"burgers,meatballs,eggs","[burgers, meatballs, eggs]"
1,chutney,[chutney]
2,"turkey,avocado","[turkey, avocado]"
3,"mineral water,milk,energy bar,whole wheat rice...","[mineral water, milk, energy bar, whole wheat ..."
4,low fat yogurt,[low fat yogurt]


In [41]:
from mlxtend.preprocessing import TransactionEncoder

# Initialize TransactionEncoder
te = TransactionEncoder()

# Transform the dataset
te_ary = te.fit(df['Items']).transform(df['Items'])

# Convert the array into a DataFrame
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

df_encoded.head()


Unnamed: 0,almonds,antioxydant juice,asparagus,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,body spray,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [43]:
from mlxtend.frequent_patterns import apriori, association_rules

# Find frequent itemsets using the Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
print(frequent_itemsets.head())

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head()

    support             itemsets
0  0.029179            (almonds)
1  0.011014  (antioxydant juice)
2  0.045797            (avocado)
3  0.012560              (bacon)
4  0.015459     (barbecue sauce)


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,"(chocolate, chicken)",(mineral water),0.011014,0.518182,1.728943
1,"(chocolate, olive oil)",(mineral water),0.011981,0.508197,1.695627
2,"(eggs, ground beef)",(mineral water),0.014493,0.503356,1.679475
3,"(frozen vegetables, ground beef)",(mineral water),0.013333,0.543307,1.812775
4,"(frozen vegetables, ground beef)",(spaghetti),0.01256,0.511811,2.22948


In [47]:
# Lower the thresholds for confidence and lift
filtered_rules = rules[(rules['confidence'] > 0.5) & (rules['lift'] > 1.0)]

# Display the filtered rules
filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,"(chocolate, chicken)",(mineral water),0.011014,0.518182,1.728943
1,"(chocolate, olive oil)",(mineral water),0.011981,0.508197,1.695627
2,"(eggs, ground beef)",(mineral water),0.014493,0.503356,1.679475
3,"(frozen vegetables, ground beef)",(mineral water),0.013333,0.543307,1.812775
4,"(frozen vegetables, ground beef)",(spaghetti),0.01256,0.511811,2.22948
5,"(milk, ground beef)",(mineral water),0.016039,0.506098,1.688623
6,"(pancakes, ground beef)",(mineral water),0.010821,0.518519,1.730067
7,"(milk, olive oil)",(mineral water),0.012367,0.512,1.708317
8,"(milk, soup)",(mineral water),0.012367,0.576577,1.923781
9,"(spaghetti, soup)",(mineral water),0.010821,0.523364,1.746235


In [30]:
# Filter rules based on high confidence and lift values
filtered_rules = rules[(rules['confidence'] > 0.5) & (rules['lift'] > 1.2)]
filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,"(chocolate, chicken)",(mineral water),0.011014,0.518182,1.728943
1,"(chocolate, olive oil)",(mineral water),0.011981,0.508197,1.695627
2,"(eggs, ground beef)",(mineral water),0.014493,0.503356,1.679475
3,"(frozen vegetables, ground beef)",(mineral water),0.013333,0.543307,1.812775
4,"(frozen vegetables, ground beef)",(spaghetti),0.01256,0.511811,2.22948
5,"(milk, ground beef)",(mineral water),0.016039,0.506098,1.688623
6,"(pancakes, ground beef)",(mineral water),0.010821,0.518519,1.730067
7,"(milk, olive oil)",(mineral water),0.012367,0.512,1.708317
8,"(milk, soup)",(mineral water),0.012367,0.576577,1.923781
9,"(spaghetti, soup)",(mineral water),0.010821,0.523364,1.746235


# Interview Questions:

1. What is lift and why is it important in Association rules? 

Ans:Lift is a measure used in association rule mining to evaluate the strength of an association rule compared to the expected likelihood of the items being associated if they were independent. In simpler terms, lift tells you how much more likely the items in the rule are to occur together than would be expected by chance.

2. What is support and Confidence. How do you calculate them?

Ans:

->Support: Support measures how frequently an itemset appears in the dataset. It helps to identify how common or rare an itemset is. Support of an itemset A is defined as the proportion of transactions in the dataset that contain the itemset A

->Confidence measures the likelihood that itemset B is purchased when itemset 𝐴 is purchased. It evaluates the reliability of the rule A→𝐵

