In [1]:
import pandas as pd
# Frequent-itemset mining (FP-Growth) and association-rule generation
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

In [2]:
# Artwork table from the clustering folder (judging by the path, the output of the
# artwork-clustering step). The cells below read its "cluster", "decade" and "school" columns.
df = pd.read_csv("../01_artwork_clustering/omniart-post-artwork-clustering.csv")

In [3]:
# Centroid table from the same clustering folder (per the file name).
# NOTE(review): not referenced by any cell visible in this section — confirm it is used
# further down, otherwise this load can be dropped.
artwork_clusters = pd.read_csv("../01_artwork_clustering/artwork-centroids.csv")

In [4]:
# Each row of `df` is one transaction for the mining below; the row count is the
# denominator behind every support value.
total_transactions = df.shape[0]
total_transactions

264218

In [5]:
# Default minimum support for FP-Growth: 0.1% of all transactions,
# i.e. 0.001 * 264218 ≈ 264 rows. The school-only run passes its own lower value.
min_support = 0.001

In [6]:
def get_freq_itemset(df, dims, min_support):
    """Mine frequent itemsets over the cluster label and the given attribute columns.

    Each row of ``df`` is treated as one transaction. Every value in the
    ``cluster`` column and in each column named in ``dims`` becomes an item
    label of the form ``"<column>_<value>"``. The labels are one-hot encoded
    and handed to FP-Growth.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cluster`` column and every column listed in ``dims``.
        Any other column present is also passed through ``pd.get_dummies`` and
        cast to bool, so callers should pre-select the columns of interest.
        ``df`` itself is not modified (a copy is relabelled).
    dims : str or iterable of str
        Attribute column(s) to mine together with ``cluster``. A single column
        name may be given as a plain string.
    min_support : float
        Forwarded unchanged to ``fpgrowth`` as ``min_support``.

    Returns
    -------
    The result of ``fpgrowth(..., use_colnames=True)`` on the encoded frame.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(dims, str):
        dims = [dims]

    mining_df = df.copy()
    for column in ["cluster", *dims]:
        values = mining_df[column]
        labels = f"{column}_" + values.astype(str)
        # Keep missing values missing: astype(str) would turn them into the
        # literal "nan"/"None" and they would be mined as a real item
        # (e.g. "school_nan"). pd.get_dummies creates no indicator for NaN by
        # default, so such rows simply carry no item for this column.
        mining_df[column] = labels.where(values.notna())

    # One-hot encode; the empty prefix keeps the "<column>_<value>" labels as column names.
    df_encoded = pd.get_dummies(mining_df, prefix="", prefix_sep="")
    df_encoded = df_encoded.astype(bool)

    # Frequent itemsets with FP-Growth
    freq_itemsets = fpgrowth(df_encoded, min_support=min_support, use_colnames=True)

    return freq_itemsets

In [7]:
# Minimum lift a rule must reach to be kept: 1.1 means antecedent and consequent co-occur
# 10% more often than they would if they were independent.
min_lift = 1.1

In [8]:
def get_association_rules(freq_itemsets, min_threshold, metric="lift"):
    """Build association rules from frequent itemsets and rank them by lift.

    ``freq_itemsets``, ``metric`` and ``min_threshold`` are forwarded to
    ``association_rules``. The returned rules are ordered by the ``lift``
    column, highest first, whichever ``metric`` was used for filtering.
    """
    candidate_rules = association_rules(
        freq_itemsets,
        metric=metric,
        min_threshold=min_threshold,
    )
    ranked_rules = candidate_rules.sort_values("lift", ascending=False)
    return ranked_rules

**1. Decade**

In [9]:
# Keep only the columns to mine: get_freq_itemset one-hot encodes whatever it is given.
decade_mining_df = df[["cluster", "decade"]].copy()

In [10]:
# Frequent {cluster, decade} itemsets at the notebook-wide support floor.
decade_freq_itemset = get_freq_itemset(
    decade_mining_df,
    ["decade"],
    min_support=min_support,
)

In [11]:
# Keep cluster <-> decade rules whose lift is at least `min_lift`.
decade_rules = get_association_rules(
    decade_freq_itemset,
    min_threshold=min_lift,
)

In [12]:
# Rules are already ordered by descending lift (see get_association_rules).
decade_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
99,(cluster_54),(decade_2000),0.006090,0.174629,0.001779,0.292107,1.672733,1.0,0.000715,1.165955,0.404640,0.009941,0.142334,0.151147
98,(decade_2000),(cluster_54),0.174629,0.006090,0.001779,0.010186,1.672733,1.0,0.000715,1.004139,0.487267,0.009941,0.004122,0.151147
58,(cluster_56),(decade_2000),0.007305,0.174629,0.002082,0.284974,1.631887,1.0,0.000806,1.154324,0.390062,0.011574,0.133692,0.148447
59,(decade_2000),(cluster_56),0.174629,0.007305,0.002082,0.011920,1.631887,1.0,0.000806,1.004671,0.469137,0.011574,0.004650,0.148447
5,(decade_2000),(cluster_33),0.174629,0.004837,0.001340,0.007672,1.586197,1.0,0.000495,1.002857,0.447752,0.007522,0.002849,0.142334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,(decade_2000),(cluster_36),0.174629,0.011309,0.002199,0.012592,1.113475,1.0,0.000224,1.001300,0.123472,0.011968,0.001298,0.103518
117,(decade_2000),(cluster_18),0.174629,0.008414,0.001635,0.009363,1.112831,1.0,0.000166,1.000958,0.122843,0.009013,0.000957,0.101847
116,(cluster_18),(decade_2000),0.008414,0.174629,0.001635,0.194332,1.112831,1.0,0.000166,1.024456,0.102251,0.009013,0.023872,0.101847
18,(decade_2010),(cluster_55),0.524480,0.010586,0.006154,0.011734,1.108407,1.0,0.000602,1.001161,0.205679,0.011635,0.001160,0.296535


**2. School**

In [13]:
# Keep only the columns to mine. The explicit copy mirrors the decade section and keeps this
# frame independent of `df`.
school_mining_df = df[["cluster", "school"]].copy()

In [14]:
# The school run uses a support floor 10x lower than `min_support`:
# 0.0001 * 264218 ≈ 26 transactions. The strongest school rules shown below have a support
# of only ~0.0004-0.0005, so they would not survive the 0.001 floor.
school_min_support = 0.0001
school_freq_itemset = get_freq_itemset(
    school_mining_df,
    ["school"],
    min_support=school_min_support,
)

In [15]:
# Keep cluster <-> school rules whose lift is at least `min_lift`.
school_rules = get_association_rules(
    school_freq_itemset,
    min_threshold=min_lift,
)

In [16]:
# Rules are already ordered by descending lift (see get_association_rules).
school_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
501,(cluster_52),(school_ India),0.021301,0.001991,0.000409,0.019190,9.639318,1.0,0.000366,1.017535,0.915765,0.017863,0.017233,0.112256
500,(school_ India),(cluster_52),0.001991,0.021301,0.000409,0.205323,9.639318,1.0,0.000366,1.231569,0.898046,0.017863,0.188028,0.112256
468,(cluster_32),(school_ Japan),0.015033,0.003857,0.000492,0.032729,8.486377,1.0,0.000434,1.029849,0.895628,0.026743,0.028984,0.080153
469,(school_ Japan),(cluster_32),0.003857,0.015033,0.000492,0.127576,8.486377,1.0,0.000434,1.129000,0.885579,0.026743,0.114261,0.080153
517,(cluster_59),(school_ China),0.016604,0.003683,0.000500,0.030089,8.170636,1.0,0.000438,1.027226,0.892428,0.025249,0.026504,0.082876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,(cluster_43),(school_ Unknown),0.011752,0.177350,0.002305,0.196135,1.105923,1.0,0.000221,1.023369,0.096917,0.012339,0.022835,0.104566
324,(cluster_92),(school_ modern),0.010771,0.696921,0.008300,0.770555,1.105657,1.0,0.000793,1.320924,0.096601,0.011867,0.242954,0.391232
325,(school_ modern),(cluster_92),0.696921,0.010771,0.008300,0.011909,1.105657,1.0,0.000793,1.001152,0.315298,0.011867,0.001150,0.391232
358,(cluster_3),(school_ Netherlands),0.007906,0.018193,0.000159,0.020105,1.105094,1.0,0.000015,1.001951,0.095857,0.006128,0.001947,0.014421


**3. Decade + School**

In [17]:
# Keep only the columns to mine. The explicit copy mirrors the decade section and keeps this
# frame independent of `df`.
decade_school_mining_df = df[["cluster", "decade", "school"]].copy()

In [18]:
# Frequent itemsets over cluster, decade and school together, at the notebook-wide support floor.
decade_school_freq_itemset = get_freq_itemset(
    decade_school_mining_df,
    ["decade", "school"],
    min_support=min_support,
)

In [20]:
# Keep rules among cluster, decade and school items whose lift is at least `min_lift`.
decade_school_rules = get_association_rules(
    decade_school_freq_itemset,
    min_threshold=min_lift,
)

In [21]:
# Rules are already ordered by descending lift (see get_association_rules). The output also
# contains decade <-> school rules that involve no cluster item.
decade_school_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
659,(school_ Netherlands),(decade_1660),0.018193,0.004958,0.002263,0.124402,25.091011,1.0,0.002173,1.136414,0.977937,0.108353,0.120039,0.290445
658,(decade_1660),(school_ Netherlands),0.004958,0.018193,0.002263,0.456489,25.091011,1.0,0.002173,1.806414,0.964929,0.108353,0.446417,0.290445
253,(school_ Netherlands),(decade_1670),0.018193,0.004973,0.002252,0.123778,24.889139,1.0,0.002161,1.135587,0.977608,0.107673,0.119398,0.288297
252,(decade_1670),(school_ Netherlands),0.004973,0.018193,0.002252,0.452816,24.889139,1.0,0.002161,1.794289,0.964619,0.107673,0.442676,0.288297
472,(decade_1620),(school_ Flanders),0.008603,0.012232,0.001896,0.220414,18.018944,1.0,0.001791,1.267041,0.952699,0.100120,0.210759,0.187713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,(school_ Unknown),(cluster_43),0.177350,0.011752,0.002305,0.012996,1.105923,1.0,0.000221,1.001261,0.116427,0.012339,0.001260,0.104566
601,(school_ modern),(cluster_92),0.696921,0.010771,0.008300,0.011909,1.105657,1.0,0.000793,1.001152,0.315298,0.011867,0.001150,0.391232
600,(cluster_92),(school_ modern),0.010771,0.696921,0.008300,0.770555,1.105657,1.0,0.000793,1.320924,0.096601,0.011867,0.242954,0.391232
823,"(decade_2010, school_ modern)",(cluster_65),0.523867,0.010207,0.005885,0.011234,1.100598,1.0,0.000538,1.001039,0.191969,0.011142,0.001037,0.293900
