In [20]:
import ast
import os

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [4]:
# Load the raw listening records. The rendered preview shows the columns
# idUsuario, artista, sexo and pais.
df = pd.read_csv(filepath_or_buffer="data/data.csv")
df

Unnamed: 0,idUsuario,artista,sexo,pais
0,5,dream theater,m,Finland
1,5,ac/dc,m,Finland
2,5,metallica,m,Finland
3,5,iron maiden,m,Finland
4,5,bob marley & the wailers,m,Finland
...,...,...,...,...
58450,19714,led zeppelin,m,United Kingdom
58451,19714,slipknot,m,United Kingdom
58452,19714,children of bodom,m,United Kingdom
58453,19714,vader,m,United Kingdom


In [5]:
# Build one "basket" per user: a Series indexed by idUsuario whose values are
# the list of artista entries recorded for that user.
transactions = (
    df.groupby(by="idUsuario")["artista"]
    .aggregate(list)
)
transactions

idUsuario
5        [dream theater, ac/dc, metallica, iron maiden,...
9        [arch enemy, strapping young lad, kreator, chi...
14       [sonic youth, the fall, mogwai, the velvet und...
22       [the jam, maxïmo park, supergrass, simon & gar...
24       [the byrds, leonard cohen, the beatles, simon ...
                               ...                        
19688    [bullet for my valentine, escape the fate, 30 ...
19701    [placebo, animal collective, crystal castles, ...
19703    [the offspring, pearl jam, the smashing pumpki...
19708    [coldplay, amy winehouse, armin van buuren, mo...
19714    [misfits, type o negative, arch enemy, red hot...
Name: artista, Length: 3000, dtype: object

In [6]:
# One-hot encode the baskets: one boolean column per distinct artist, one row
# per basket. Column labels come from the fitted encoder.
encoder = TransactionEncoder()
df_t = pd.DataFrame(
    data=encoder.fit_transform(transactions),
    columns=encoder.columns_,
)
df_t

Unnamed: 0,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent,65daysofstatic,[unknown],...,wilco,within temptation,wolfgang amadeus mozart,wu-tang clan,yann tiersen,yeah yeah yeahs,yellowcard,yo la tengo,zero 7,Édith piaf
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2997,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [46]:
# Frequent itemsets, cached on disk per support threshold so the (slow)
# apriori search only runs once for a given `min_support`.
min_support = 0.05
file_path = f"data/itemsets_{min_support}.csv"


def _as_builtin(value):
    """Return numpy scalars as plain Python objects; pass anything else through."""
    return value.item() if hasattr(value, "item") else value


def _parse_itemset(text):
    """Rebuild a frozenset from its CSV text form, e.g. ``"frozenset({202, 703})"``.

    CSV has no set type, so ``to_csv`` stores each itemset as its repr. Without
    this conversion the cached column comes back as plain strings, which makes
    ``len`` report string lengths and breaks ``association_rules``.

    Raises:
        ValueError: if the text is not wrapped in ``frozenset(...)`` or its
            contents are not a Python literal.
    """
    prefix, suffix = "frozenset(", ")"
    text = text.strip()
    if not (text.startswith(prefix) and text.endswith(suffix)):
        raise ValueError(f"unexpected itemset encoding in cache: {text!r}")
    inner = text[len(prefix):-len(suffix)]
    # "frozenset()" (empty) has nothing between the parentheses.
    return frozenset(ast.literal_eval(inner)) if inner else frozenset()


if not os.path.isfile(file_path):
    # Without use_colnames, items are column positions of df_t (see the output
    # below); map them back with df_t.columns[i] when names are needed.
    itemsets = apriori(df_t, min_support=min_support)
    # Write built-in scalars: numpy >= 2 renders its scalars as "np.int64(17)",
    # which would not be parseable as a literal when the cache is read back.
    portable = itemsets["itemsets"].map(
        lambda items: frozenset(_as_builtin(item) for item in items)
    )
    itemsets.assign(itemsets=portable).to_csv(file_path, index=False)
else:
    # Restore real frozensets so a cache hit behaves exactly like a fresh run.
    itemsets = pd.read_csv(file_path, converters={"itemsets": _parse_itemset})

itemsets

Unnamed: 0,support,itemsets
0,0.057333,frozenset({17})
1,0.069000,frozenset({28})
2,0.062667,frozenset({48})
3,0.050667,frozenset({66})
4,0.080000,frozenset({70})
...,...,...
62,0.064667,frozenset({939})
63,0.051000,frozenset({961})
64,0.062667,frozenset({977})
65,0.052000,"frozenset({202, 703})"


In [41]:
# Distribution of itemset sizes (number of items in each frequent itemset).
itemset_sizes = itemsets["itemsets"].apply(len)
itemset_sizes.describe()

count    8685.000000
mean        2.129649
std         0.600170
min         1.000000
25%         2.000000
50%         2.000000
75%         2.000000
max         4.000000
Name: itemsets, dtype: float64

In [36]:
# Keep rules whose confidence is at least 0.7, then list the strongest
# associations (highest lift) first.
rules = (
    association_rules(itemsets, metric="confidence", min_threshold=0.7)
    .sort_values("lift", ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,"(784, 495)",(827),0.017,0.096,0.012,0.705882,7.352941,0.010368,3.0736
1,"(102, 238)",(703),0.013667,0.169667,0.010667,0.780488,4.600125,0.008348,3.78263
2,"(481, 893)",(202),0.017,0.160667,0.012,0.705882,4.393459,0.009269,2.853733
3,"(202, 924)",(703),0.018667,0.169667,0.013333,0.714286,4.209935,0.010166,2.906167
0,"(202, 102)",(703),0.017,0.169667,0.012,0.705882,4.160407,0.009116,2.823133
