In [11]:
# Importing necessary libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [13]:
# Read the raw basket file. It has no header row, so header=None keeps the
# first line as data and labels the columns 0, 1, ...
data = pd.read_csv(
    'Online retail.csv',
    header=None,
)

# Column 0 is expected to hold each transaction as one comma-separated string
# (TODO confirm against the file); break it into a per-row list of item names.
# NOTE(review): the same split is repeated in the preprocessing section after
# empty rows have been dropped, so this first result is only provisional.
transactions = data.loc[:, 0].str.split(pat=',')


#### Data Preprocessing

In [19]:
# Step 1: Discard rows in which every column is missing; a row with at least
# one value is kept. `data` is rebound to the filtered frame instead of being
# mutated with inplace=True.
data = data.dropna(how='all')

In [20]:
# Step 2: Break the comma-separated string held in column 0 of each row into
# a list of item names, producing one list per transaction.
transactions = data.loc[:, 0].str.split(pat=',')

In [21]:
# Step 3: Remove duplicate items within each transaction
def _unique_items(items):
    """Return the distinct item names of one transaction, in first-seen order.

    Names are whitespace-stripped *before* they are compared, so variants such
    as 'milk' and ' milk' collapse into one entry instead of surviving the
    de-duplication and turning into duplicates once they are stripped.
    Names that are empty after stripping are not removed here.

    Parameters
    ----------
    items : list of str, or a missing value
        One transaction as produced by ``Series.str.split``.

    Returns
    -------
    list of str
        Stripped, de-duplicated names. An empty list is returned when
        ``items`` is not a list (e.g. NaN for a row without a value), an
        input on which ``set(items)`` would raise a TypeError.
    """
    if not isinstance(items, list):
        return []
    # dict.fromkeys preserves insertion order, unlike set(), so the item order
    # is the same on every run.
    return list(dict.fromkeys(name.strip() for name in items))


transactions = transactions.apply(_unique_items)

In [23]:
# Step 4: Trim surrounding whitespace from every item name and discard the
# names that are empty once trimmed. Each name is stripped exactly once.
transactions = transactions.apply(
    lambda items: [name for name in (raw.strip() for raw in items) if name != '']
)

In [24]:
# One-hot encode the transactions: one row per transaction and one column per
# distinct item name found while fitting the encoder.
te = TransactionEncoder()
te.fit(transactions)  # learns the item vocabulary, exposed as te.columns_
te_ary = te.transform(transactions)
basket = pd.DataFrame(data=te_ary, columns=te.columns_)

In [25]:
# Mine frequent itemsets with Apriori. min_support=0.01 keeps only itemsets
# whose support is at least 1% of the transactions; use_colnames=True reports
# itemsets by item name instead of by column index.
frequent_itemsets = apriori(
    basket,
    min_support=0.01,
    use_colnames=True,
)


In [26]:
# Derive association rules from the frequent itemsets, using lift as the
# selection metric with a minimum threshold of 1. Lift is the only threshold
# applied here: confidence is reported for each rule but is not filtered on.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

In [27]:
# Show the 10 rules with the highest lift, strongest first
print(
    rules
    .sort_values(by='lift', ascending=False)
    .iloc[:10]
)

                    antecedents                 consequents  \
214             (herb & pepper)               (ground beef)   
215               (ground beef)             (herb & pepper)   
387               (ground beef)  (spaghetti, mineral water)   
382  (spaghetti, mineral water)               (ground beef)   
398                 (olive oil)  (spaghetti, mineral water)   
395  (spaghetti, mineral water)                 (olive oil)   
193         (frozen vegetables)                  (tomatoes)   
192                  (tomatoes)         (frozen vegetables)   
188                    (shrimp)         (frozen vegetables)   
189         (frozen vegetables)                    (shrimp)   

     antecedent support  consequent support   support  confidence      lift  \
214            0.049460            0.098254  0.015998    0.323450  3.291994   
215            0.098254            0.049460  0.015998    0.162822  3.291994   
387            0.098254            0.059725  0.017064    0.173677  2.

In [28]:
# Print every rule as "antecedents -> consequents", followed by its support,
# confidence and lift, leaving a blank line after each rule.
for rule in rules.itertuples(index=False):
    print(f"Rule: {rule.antecedents} -> {rule.consequents}")
    print(f"Support: {rule.support}, Confidence: {rule.confidence}, Lift: {rule.lift}\n")

Rule: frozenset({'avocado'}) -> frozenset({'mineral water'})
Support: 0.011598453539528063, Confidence: 0.348, Lift: 1.4599261744966443

Rule: frozenset({'mineral water'}) -> frozenset({'avocado'})
Support: 0.011598453539528063, Confidence: 0.04865771812080537, Lift: 1.4599261744966443

Rule: frozenset({'cake'}) -> frozenset({'burgers'})
Support: 0.011465137981602452, Confidence: 0.14144736842105263, Lift: 1.622319129245131

Rule: frozenset({'burgers'}) -> frozenset({'cake'})
Support: 0.011465137981602452, Confidence: 0.1314984709480122, Lift: 1.6223191292451309

Rule: frozenset({'chocolate'}) -> frozenset({'burgers'})
Support: 0.017064391414478068, Confidence: 0.10414971521562244, Lift: 1.1945367183981406

Rule: frozenset({'burgers'}) -> frozenset({'chocolate'})
Support: 0.017064391414478068, Confidence: 0.19571865443425074, Lift: 1.1945367183981406

Rule: frozenset({'eggs'}) -> frozenset({'burgers'})
Support: 0.02879616051193174, Confidence: 0.1602373887240356, Lift: 1.83782974437154

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


### Interview Questions
1. **What is lift and why is it important in Association rules?**  
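   - **Lift** measures how much more often two items are bought together than would be expected if they were bought independently: $\text{lift}(A \rightarrow B) = \dfrac{\text{confidence}(A \rightarrow B)}{\text{support}(B)} = \dfrac{\text{support}(A \cup B)}{\text{support}(A)\,\text{support}(B)}$. It helps identify strong associations; a lift greater than 1 indicates a positive association, meaning items appear together more often than by chance, a lift of 1 indicates independence, and a lift below 1 indicates that the items appear together less often than by chance. It’s important because it helps filter out random associations and focuses on meaningful relationships.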

2. **What is support and confidence? How do you calculate them?**  
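   - **Support** is the frequency with which an itemset appears in the dataset, indicating its popularity. It is calculated as $\text{support}(A \cup B) = \dfrac{\text{number of transactions containing both } A \text{ and } B}{\text{total number of transactions}}$. **Confidence** measures the likelihood of one item being purchased given that another item is already in the basket, showing the strength of the association. It is calculated as $\text{confidence}(A \rightarrow B) = \dfrac{\text{support}(A \cup B)}{\text{support}(A)}$. They are used to determine the significance and reliability of association rules.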

3. **What are some limitations or challenges of Association rules mining?**  
   - Association rule mining can produce an overwhelming number of rules, many of which may be irrelevant or redundant. It’s also computationally intensive with large datasets. Another challenge is setting appropriate thresholds for support, confidence, and lift to filter out unimportant rules while retaining useful insights.