# ASSOCIATION RULE LEARNING

📌 The steps to build an association rule learning recommender are as follows: 

  * Import Dataset

  * Data Preprocessing

  * Preparing ARL Data Structure (Invoice-Product Matrix)

  * Application: Association Rule Learning Recommender

In [158]:
!pip install mlxtend
import numpy as np
import pandas as pd
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
pd.set_option("display.width",500)
pd.set_option("display.expand_frame_repr",None)
from mlxtend.frequent_patterns import apriori, association_rules

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import Dataset

If you want to download the dataset, you can use this <a href="https://archive.ics.uci.edu/ml/datasets/Online+Retail+II">link</a>

In [159]:
# Read the "Year 2010-2011" sheet. The path looks like a Colab Google Drive mount,
# so it has to be adjusted when the notebook runs anywhere else.
df_ = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/datasets/online_retail_II.xlsx",sheet_name="Year 2010-2011")
# Keep the raw frame in df_ and work on a copy, so later cells can start over
# without re-reading the Excel file.
df = df_.copy()
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


## Data Preprocessing

In [160]:
def outlier_thresholds(dataframe, variable, q1=0.01, q3=0.99):
  """Compute lower/upper outlier limits for one column.

  The limits are built like the classic IQR rule, but from the ``q1`` and
  ``q3`` quantiles (1% and 99% by default, i.e. far wider than real
  quartiles, so only extreme values fall outside).

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame that contains ``variable``.
  variable : str
      Name of the numeric column to analyse.
  q1, q3 : float, optional
      Lower and upper quantile, each in [0, 1] with ``q1 <= q3``.

  Returns
  -------
  tuple
      ``(low_limit, up_limit)``.

  Raises
  ------
  ValueError
      If ``q1`` is greater than ``q3``.
  """
  if q1 > q3:
    raise ValueError("q1 must not be greater than q3")
  lower_quantile = dataframe[variable].quantile(q1)
  upper_quantile = dataframe[variable].quantile(q3)
  quantile_range = upper_quantile - lower_quantile
  # Same 1.5 * range whiskers as a box plot, applied to the chosen quantiles.
  up_limit = upper_quantile + 1.5 * quantile_range
  low_limit = lower_quantile - 1.5 * quantile_range
  return low_limit, up_limit

In [161]:
def replace_with_thresholds(dataframe,variable):
  """Cap the values of ``variable`` at its outlier limits, in place.

  The limits come from ``outlier_thresholds``. Values below the lower limit
  are overwritten with the lower limit, values above the upper limit with
  the upper limit. ``dataframe`` is modified directly; nothing is returned.
  """
  lower_bound, upper_bound = outlier_thresholds(dataframe, variable)
  # Lower side first, then the upper side, each on the current column state.
  too_small = dataframe[variable] < lower_bound
  dataframe.loc[too_small, variable] = lower_bound
  too_large = dataframe[variable] > upper_bound
  dataframe.loc[too_large, variable] = upper_bound

In [162]:
def data_prep(dataframe):
  """Return a cleaned copy of the retail transactions.

  Steps:
    1. drop rows with any missing value,
    2. drop invoices whose number contains "C" (presumably cancellations -
       confirm against the dataset description),
    3. keep only rows with positive Quantity and positive Price,
    4. cap Quantity and Price at their outlier limits.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Needs the columns "Invoice", "Quantity" and "Price".

  Returns
  -------
  pandas.DataFrame
      A new frame; the frame passed in is left unmodified.
  """
  # Non-inplace dropna: the caller's frame must not change as a side effect.
  dataframe = dataframe.dropna()
  contains_c = dataframe["Invoice"].str.contains("C",na=False)
  keep = ~contains_c & (dataframe["Quantity"] > 0) & (dataframe["Price"] > 0)
  # Explicit copy so the .loc writes inside replace_with_thresholds hit an
  # independent frame rather than a filtered slice (SettingWithCopyWarning).
  dataframe = dataframe[keep].copy()
  replace_with_thresholds(dataframe,"Quantity")
  replace_with_thresholds(dataframe,"Price")
  return dataframe

In [163]:
# Clean the working copy: data_prep drops NaN rows, invoices whose number
# contains "C" and non-positive Quantity/Price rows, then caps outliers.
df = data_prep(df)
# Transposed summary: one row per numeric column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,397885.0,11.83077,25.523052,1.0,2.0,6.0,12.0,298.5
Price,397885.0,2.893492,3.227175,0.001,1.25,1.95,3.75,37.06
Customer ID,397885.0,15294.416882,1713.144421,12346.0,13969.0,15159.0,16795.0,18287.0


## Preparing ARL Data Structure (Invoice-Product Matrix)

In [164]:
def create_invoice_product_df(dataframe, id=False):
    """Build the 0/1 invoice x product matrix.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs "Invoice", "Quantity" and the chosen product column.
    id : bool, optional
        If truthy, products are identified by "StockCode"; otherwise by
        "Description". (The name shadows the builtin but is kept because
        callers pass it by keyword.)

    Returns
    -------
    pandas.DataFrame
        Indexed by Invoice, one column per product; a cell is 1 when the
        summed Quantity of that product on that invoice is positive, else 0.
    """
    product_col = "StockCode" if id else "Description"
    quantities = (
        dataframe.groupby(["Invoice", product_col])["Quantity"].sum().unstack()
    )
    # Vectorised replacement for the deprecated element-wise applymap. Missing
    # invoice/product pairs are NaN, which compares False against 0 and so
    # becomes 0 without an explicit fillna.
    return quantities.gt(0).astype("int64")

In [165]:
# Restrict to rows whose Country is "France" and pivot them into the 0/1
# Invoice x StockCode matrix (id=True selects StockCode as the product key).
df_fr = df[df["Country"]=="France"]
fr_inv_pro_df = create_invoice_product_df(df_fr,id=True)
# Preview only the first 5 invoices and the first 20 product columns.
fr_inv_pro_df.iloc[0:5,0:20]

StockCode,10002,10120,10125,10135,11001,15036,15039,16012,16048,16218,16219,16225,16236,16237,16238,17174,20615,20617,20658,20665
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
536370,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
536852,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
536974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
537065,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
537463,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [166]:
def check_id(dataframe, stock_code):
  """Print the description of the first row whose StockCode equals ``stock_code``.

  The description is printed as a one-element list. Raises IndexError when
  no row matches. Returns None.
  """
  matching = dataframe.loc[dataframe["StockCode"] == stock_code, ["Description"]]
  first_match = matching.to_numpy()[0]
  print(first_match.tolist())

In [167]:
# Look up the description behind stock code 10002 among the French rows.
check_id(df_fr,10002)

['INFLATABLE POLITICAL GLOBE ']


In [168]:
def create_rules(dataframe, id=True, country="France", min_support=0.01,
                 metric="support", min_threshold=0.01):
  """Mine association rules for the invoices of one country.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Prepared transactions with "Country", "Invoice", "Quantity" and the
      product column selected by ``id``.
  id : bool, optional
      Forwarded to ``create_invoice_product_df``: truthy means products are
      keyed by StockCode, falsy by Description.
  country : str, optional
      Only rows whose "Country" equals this value are used.
  min_support : float, optional
      Minimum support passed to ``apriori``.
  metric : str, optional
      Metric that ``association_rules`` filters on.
  min_threshold : float, optional
      Minimum value of ``metric`` for a rule to be kept.

  Returns
  -------
  pandas.DataFrame
      The frame produced by ``association_rules``.
  """
  country_rows = dataframe[dataframe['Country'] == country]
  invoice_product = create_invoice_product_df(country_rows, id)
  # apriori is designed for boolean input; casting the 0/1 matrix avoids the
  # non-bool deprecation warning without changing the resulting itemsets.
  frequent_itemsets = apriori(invoice_product.astype(bool),
                              min_support=min_support, use_colnames=True)
  rules = association_rules(frequent_itemsets, metric=metric,
                            min_threshold=min_threshold)
  return rules

In [169]:
# Mine rules with create_rules' defaults: France only, products keyed by StockCode.
rules = create_rules(df)

In [170]:
# Keep rules with support > 0.05, confidence > 0.1 and lift > 5, then show
# the five with the highest confidence.
rules[(rules["support"]>0.05) & (rules["confidence"]>0.1) & (rules["lift"]>5)].sort_values("confidence",ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
23707,"(21080, 21094)",(21086),0.102828,0.138817,0.100257,0.975,7.023611,0.085983,34.447301
23706,"(21080, 21086)",(21094),0.102828,0.128535,0.100257,0.975,7.5855,0.08704,34.858612
108820,"(21080, POST, 21086)",(21094),0.084833,0.128535,0.082262,0.969697,7.544242,0.071358,28.758355
108822,"(21080, POST, 21094)",(21086),0.084833,0.138817,0.082262,0.969697,6.98541,0.070486,28.419023
1777,(21094),(21086),0.128535,0.138817,0.123393,0.96,6.915556,0.10555,21.529563


## Application: Association Rule Learning Recommender

In [171]:
def arl_recommender(rules_df, product_id, rec_count=1):
  """Recommend products that co-occur with ``product_id``.

  Rules are scanned from highest to lowest lift. For every rule whose
  antecedents contain ``product_id``, the first item of its consequents is
  taken as a candidate. Each candidate is reported only once, so several
  rules pointing at the same product cannot fill the result with duplicates.

  Parameters
  ----------
  rules_df : pandas.DataFrame
      Needs the columns "antecedents", "consequents" (iterables of product
      ids) and "lift".
  product_id : hashable
      Product to recommend for.
  rec_count : int, optional
      Maximum number of recommendations to return.

  Returns
  -------
  list
      Up to ``rec_count`` distinct product ids, strongest lift first; empty
      when no rule has ``product_id`` among its antecedents.
  """
  sorted_rules = rules_df.sort_values("lift", ascending=False)
  recommendation_list = []
  for antecedents, consequents in zip(sorted_rules["antecedents"],
                                      sorted_rules["consequents"]):
    if product_id not in antecedents:
      continue
    candidate = list(consequents)[0]
    # Preserve lift order while skipping products that were already suggested.
    if candidate not in recommendation_list:
      recommendation_list.append(candidate)
  return recommendation_list[0:rec_count]

In [176]:
# Ask for up to 2 recommendations for stock code 22492.
arl_recommender(rules, 22492, 2)

[22556, 22551]