In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

Spoznali bomo dva načina uporabe Apriori algoritma. Najprej bomo na redkih (*sparse*) podatkih (košara) iskali povezovalna pravila, nato bomo na naboru podatkov atribut-vrednost iskali klasifikacijska pravila. 

# Povezovalna pravila

Začnimo s podatki o tržni košarici:

In [None]:
df = pd.read_csv('../data/GroceryStoreDataSet.csv', names = ['products'], sep = ',')
df.head()

Unnamed: 0,products
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,CEDEVITA"
3,"JAM,EGGS,BREAD,MILK"
4,"EGGS,TEA,BISCUIT"


Raziščimo podatke.

In [3]:
df.shape

(20, 1)

Podatkov v tabeli ne moremo neposredno uporabljati; najprej jih moramo preoblikovati v seznam seznamov, ki ga lahko uporabimo za iskanje pogostih postavk.

In [4]:
data = list(df["products"].apply(lambda x:x.split(",") ))
data

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'CEDEVITA'],
 ['JAM', 'EGGS', 'BREAD', 'MILK'],
 ['EGGS', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'CEDEVITA'],
 ['EGGS', 'TEA', 'CORNFLAKES'],
 ['EGGS', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'EGGS', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGAR', 'CEDEVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGAR', 'BISCUIT'],
 ['COFFEE', 'SUGAR', 'CORNFLAKES'],
 ['BREAD', 'SUGAR', 'CEDEVITA'],
 ['BREAD', 'COFFEE', 'SUGAR'],
 ['BREAD', 'COFFEE', 'SUGAR'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

In [5]:
from mlxtend.preprocessing import TransactionEncoder
a = TransactionEncoder()
a_data = a.fit(data).transform(data)
df = pd.DataFrame(a_data,columns=a.columns_)
df.head()

Unnamed: 0,BISCUIT,BREAD,CEDEVITA,COCK,COFFEE,CORNFLAKES,EGGS,JAM,MILK,SUGAR,TEA
0,True,True,False,False,False,False,False,False,True,False,False
1,True,True,False,False,False,True,False,False,True,False,False
2,False,True,True,False,False,False,False,False,False,False,True
3,False,True,False,False,False,False,True,True,True,False,False
4,True,False,False,False,False,False,True,False,False,False,True


Nastavimo mejo za podporo, recimo $20\%$, in izračunamo podporo.

In [6]:
df_ap = apriori(df, min_support = 0.2, use_colnames = True, verbose = 1)
df_ap


Processing 72 combinations | Sampling itemset size 2
Processing 42 combinations | Sampling itemset size 3


Unnamed: 0,support,itemsets
0,0.35,(BISCUIT)
1,0.65,(BREAD)
2,0.2,(CEDEVITA)
3,0.4,(COFFEE)
4,0.3,(CORNFLAKES)
5,0.25,(EGGS)
6,0.25,(MILK)
7,0.3,(SUGAR)
8,0.35,(TEA)
9,0.2,"(BISCUIT, BREAD)"


Zdaj lahko ustvarimo vsa povezovalna pravila, ki imajo vsaj $60\%$ zaupanja:

In [7]:
df_ar = association_rules(df_ap, metric = "confidence", min_threshold = 0.6)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75,0.25
1,(SUGAR),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05,0.035714
2,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
3,(SUGAR),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8,0.571429
4,(EGGS),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25,0.75


In [8]:
for i in range(df_ar.shape[0]):
    row = df_ar.iloc[i]
    left, right = row['antecedents'], row['consequents']
    supp, conf = row['support'], row['confidence']
    left_str =  ', '.join(name for name in left)
    right_str = ', '.join(name for name in right)
    print(left_str + " -> " + right_str+ ' (supp: {}, conf: {:.2f})'.format(supp, conf))

MILK -> BREAD (supp: 0.2, conf: 0.80)
SUGAR -> BREAD (supp: 0.2, conf: 0.67)
CORNFLAKES -> COFFEE (supp: 0.2, conf: 0.67)
SUGAR -> COFFEE (supp: 0.2, conf: 0.67)
EGGS -> TEA (supp: 0.2, conf: 0.80)


##### Vprašanje 5-5-1
Filtriraj pravila. Poišči vsa tista pravila, ki napovejo nabavo kruha.

In [9]:
df_ar[df_ar['antecedents'].isin([frozenset({'BREAD'})])]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


# Klasifikacijska pravila

Videli smo, kako se dobi povezovalna pravila na redkih podatki, tokrat si bomo ogledali še postopek na polnih podatkih. 

In [10]:
zoo = pd.read_table('podatki/zoo.tab', skiprows=[1,2])
zoo = zoo.drop('name', axis=1)
zoo.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,mammal
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
3,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
4,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,mammal


Tabelo spremenimo v seznam seznamov, kjer so elementi atribut in vrednost.

Ker so v matriki tudi nule, bomo to upoštevali pri poimenovanju vrednosti.

In [11]:
data = []
for i in range(len(zoo)):
    line = []
    attributes = zoo.iloc[i]
    for col in zoo.columns:
        line.append(col+'='+str(attributes[col]))
    data.append(line)
data[:2]

[['hair=1',
  'feathers=0',
  'eggs=0',
  'milk=1',
  'airborne=0',
  'aquatic=0',
  'predator=1',
  'toothed=1',
  'backbone=1',
  'breathes=1',
  'venomous=0',
  'fins=0',
  'legs=4',
  'tail=0',
  'domestic=0',
  'catsize=1',
  'type=mammal'],
 ['hair=1',
  'feathers=0',
  'eggs=0',
  'milk=1',
  'airborne=0',
  'aquatic=0',
  'predator=0',
  'toothed=1',
  'backbone=1',
  'breathes=1',
  'venomous=0',
  'fins=0',
  'legs=4',
  'tail=1',
  'domestic=0',
  'catsize=1',
  'type=mammal']]

Od tu naprej je posopek poznan.

In [12]:
a = TransactionEncoder()
a_data = a.fit(data).transform(data)
df = pd.DataFrame(a_data,columns=a.columns_)
df.head()

Unnamed: 0,airborne=0,airborne=1,aquatic=0,aquatic=1,backbone=0,backbone=1,breathes=0,breathes=1,catsize=0,catsize=1,...,toothed=1,type=amphibian,type=bird,type=fish,type=insect,type=invertebrate,type=mammal,type=reptile,venomous=0,venomous=1
0,True,False,True,False,False,True,False,True,False,True,...,True,False,False,False,False,False,True,False,True,False
1,True,False,True,False,False,True,False,True,False,True,...,True,False,False,False,False,False,True,False,True,False
2,True,False,False,True,False,True,True,False,True,False,...,True,False,False,True,False,False,False,False,True,False
3,True,False,True,False,False,True,False,True,False,True,...,True,False,False,False,False,False,True,False,True,False
4,True,False,True,False,False,True,False,True,False,True,...,True,False,False,False,False,False,True,False,True,False


Tokrat lahko izberemo visoko podporo.

In [13]:
df_ap = apriori(df, min_support = 0.6, use_colnames = True, verbose = 1)
df_ap.head()


Processing 132 combinations | Sampling itemset size 2
Processing 252 combinations | Sampling itemset size 3
Processing 8 combinations | Sampling itemset size 4


Unnamed: 0,support,itemsets
0,0.762712,(airborne=0)
1,0.762712,(backbone=1)
2,0.79661,(breathes=1)
3,0.610169,(catsize=0)
4,0.864407,(domestic=0)


In tudi visoko zaupanje.

In [14]:
df_ar = association_rules(df_ap, metric = "confidence", min_threshold = 0.9)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(airborne=0),(feathers=0),0.762712,0.79661,0.694915,0.911111,1.143735,0.087331,2.288136,0.529617
1,(tail=1),(backbone=1),0.644068,0.762712,0.627119,0.973684,1.276608,0.13588,9.016949,0.608752
2,(backbone=1),(venomous=0),0.762712,0.864407,0.694915,0.911111,1.054031,0.035622,1.525424,0.216028
3,(breathes=1),(fins=0),0.79661,0.864407,0.745763,0.93617,1.08302,0.057167,2.124294,0.376894
4,(eggs=1),(domestic=0),0.661017,0.864407,0.610169,0.923077,1.067873,0.038782,1.762712,0.1875
5,(hair=0),(domestic=0),0.644068,0.864407,0.610169,0.947368,1.095975,0.053433,2.576271,0.246032
6,(milk=0),(domestic=0),0.677966,0.864407,0.627119,0.925,1.070098,0.04108,1.80791,0.203414
7,(milk=0),(eggs=1),0.677966,0.661017,0.644068,0.95,1.437179,0.195921,6.779661,0.944598
8,(eggs=1),(milk=0),0.661017,0.677966,0.644068,0.974359,1.437179,0.195921,12.559322,0.897368
9,(hair=0),(milk=0),0.644068,0.677966,0.627119,0.973684,1.436184,0.190463,12.237288,0.853282


Opravka imamo s podatki z razredom. Lahko ustvarimo pravila, ki napovedujejo razred?

Najprej ustvarimo seznam z vsemi razredi.

In [15]:
classes = []
for t in pd.unique(zoo['type']):
    classes.append(frozenset({'type='+t}))

classes

[frozenset({'type=mammal'}),
 frozenset({'type=fish'}),
 frozenset({'type=bird'}),
 frozenset({'type=invertebrate'}),
 frozenset({'type=insect'}),
 frozenset({'type=amphibian'}),
 frozenset({'type=reptile'})]

Želimo postavke z $>30\%$ podpore:

In [16]:
df_ap = apriori(df, min_support = 0.3, use_colnames = True, verbose = 1)
df_ap


Processing 650 combinations | Sampling itemset size 2
Processing 4401 combinations | Sampling itemset size 3
Processing 11248 combinations | Sampling itemset size 4
Processing 11835 combinations | Sampling itemset size 5
Processing 5874 combinations | Sampling itemset size 6
Processing 1197 combinations | Sampling itemset size 7
Processing 56 combinations | Sampling itemset size 8


Unnamed: 0,support,itemsets
0,0.762712,(airborne=0)
1,0.593220,(aquatic=0)
2,0.406780,(aquatic=1)
3,0.762712,(backbone=1)
4,0.796610,(breathes=1)
...,...,...
1713,0.355932,"(domestic=0, milk=0, venomous=0, fins=0, breat..."
1714,0.305085,"(type=mammal, feathers=0, venomous=0, eggs=0, ..."
1715,0.305085,"(catsize=0, domestic=0, milk=0, venomous=0, fi..."
1716,0.355932,"(domestic=0, milk=0, venomous=0, toothed=0, fi..."


Zdaj lahko ustvarimo vsa klasifikacijska pravila, ki imajo posledico enako eni od vrednosti razreda in $>80\%$ zaupanja:

In [17]:
df_ar = association_rules(df_ap, metric = "confidence", min_threshold = 0.8)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(aquatic=1),(airborne=0),0.406780,0.762712,0.355932,0.875000,1.147222,0.045677,1.898305,0.216327
1,(backbone=1),(airborne=0),0.762712,0.762712,0.610169,0.800000,1.048889,0.028440,1.186441,0.196429
2,(airborne=0),(backbone=1),0.762712,0.762712,0.610169,0.800000,1.048889,0.028440,1.186441,0.196429
3,(catsize=1),(airborne=0),0.389831,0.762712,0.338983,0.869565,1.140097,0.041655,1.819209,0.201389
4,(airborne=0),(domestic=0),0.762712,0.864407,0.661017,0.866667,1.002614,0.001724,1.016949,0.010989
...,...,...,...,...,...,...,...,...,...,...
12944,"(toothed=1, milk=1)","(feathers=0, type=mammal, backbone=1, venomous...",0.305085,0.305085,0.305085,1.000000,3.277778,0.212008,inf,1.000000
12945,"(breathes=1, milk=1)","(feathers=0, type=mammal, backbone=1, venomous...",0.322034,0.305085,0.305085,0.947368,3.105263,0.206837,13.203390,1.000000
12946,(type=mammal),"(feathers=0, backbone=1, venomous=0, eggs=0, t...",0.322034,0.305085,0.305085,0.947368,3.105263,0.206837,13.203390,1.000000
12947,(eggs=0),"(type=mammal, feathers=0, backbone=1, venomous...",0.338983,0.305085,0.305085,0.900000,2.950000,0.201666,6.949153,1.000000


Zdaj lahko ustvarimo vsa povezovalna pravila, ki imajo posledico enako eni od vrednosti razreda

In [18]:
class_rules = df_ar[df_ar['consequents'].isin(classes)]
class_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
78,(eggs=0),(type=mammal),0.338983,0.322034,0.305085,0.9,2.794737,0.195921,6.779661,0.97151
114,(hair=1),(type=mammal),0.355932,0.322034,0.305085,0.857143,2.661654,0.190463,4.745763,0.969298
121,(milk=1),(type=mammal),0.322034,0.322034,0.322034,1.0,3.105263,0.218328,inf,1.0
267,"(airborne=0, milk=1)",(type=mammal),0.305085,0.322034,0.305085,1.0,3.105263,0.206837,inf,0.97561
439,"(backbone=1, eggs=0)",(type=mammal),0.322034,0.322034,0.305085,0.947368,2.941828,0.201379,12.881356,0.973611


Prvih pet lepše izpišemo.

In [19]:
for i in range(5):
    row = class_rules.iloc[i]
    left, right = row['antecedents'], row['consequents']
    supp, conf = row['support'], row['confidence']
    left_str =  ', '.join(name for name in left)
    right_str = ', '.join(name for name in right)
    print(left_str + " -> " + right_str+ ' (supp: {}, conf: {:.2f})'.format(supp, conf))

eggs=0 -> type=mammal (supp: 0.3050847457627119, conf: 0.90)
hair=1 -> type=mammal (supp: 0.3050847457627119, conf: 0.86)
milk=1 -> type=mammal (supp: 0.3220338983050847, conf: 1.00)
airborne=0, milk=1 -> type=mammal (supp: 0.3050847457627119, conf: 1.00)
backbone=1, eggs=0 -> type=mammal (supp: 0.3050847457627119, conf: 0.95)
