In [1]:
from orangecontrib.associate.fpgrowth import *
import Orange

# Classification rules

Videli smo, kako se dobi povezovalna pravila na redkih podatki, tokrat si bomo ogledali še postopek na polnih podatkih. 

In [2]:
data = Orange.data.Table('zoo')
data

[[1, 0, 0, 1, 0, ... | mammal] {aardvark},
 [1, 0, 0, 1, 0, ... | mammal] {antelope},
 [0, 0, 1, 0, 0, ... | fish] {bass},
 [1, 0, 0, 1, 0, ... | mammal] {bear},
 [1, 0, 0, 1, 0, ... | mammal] {boar},
 ...
]

Ker so v matriki tudi nule, bomo to upoštevali pri poimenovanju vrednosti.

In [3]:
X, mapping = OneHot.encode(data)
names = {item: ('{}={}').format(var.name, val)
                 for item, var, val in OneHot.decode(mapping, data, mapping)}
names

{0: 'hair=0',
 1: 'hair=1',
 2: 'feathers=0',
 3: 'feathers=1',
 4: 'eggs=0',
 5: 'eggs=1',
 6: 'milk=0',
 7: 'milk=1',
 8: 'airborne=0',
 9: 'airborne=1',
 10: 'aquatic=0',
 11: 'aquatic=1',
 12: 'predator=0',
 13: 'predator=1',
 14: 'toothed=0',
 15: 'toothed=1',
 16: 'backbone=0',
 17: 'backbone=1',
 18: 'breathes=0',
 19: 'breathes=1',
 20: 'venomous=0',
 21: 'venomous=1',
 22: 'fins=0',
 23: 'fins=1',
 24: 'legs=0',
 25: 'legs=2',
 26: 'legs=4',
 27: 'legs=5',
 28: 'legs=6',
 29: 'legs=8',
 30: 'tail=0',
 31: 'tail=1',
 32: 'domestic=0',
 33: 'domestic=1',
 34: 'catsize=0',
 35: 'catsize=1'}

Od tu naprej je postopek že poznan. Zaradi narave podatkov lahko izberemo višjo podporo in zaupanje.

In [4]:
itemsets = {}
for itemset, support in frequent_itemsets(X, 0.7):
    itemsets[itemset] = support
for rule in association_rules(itemsets, 0.8):
        left, right, support, confidence = rule
        left_str =  ', '.join(names[i] for i in sorted(left))
        right_str = ', '.join(names[i] for i in sorted(right))
        print(left_str+" -> "+right_str)

venomous=0, fins=0 -> breathes=1
breathes=1, fins=0 -> venomous=0
fins=0 -> breathes=1, venomous=0
breathes=1, venomous=0 -> fins=0
breathes=1 -> venomous=0, fins=0
venomous=0, tail=1 -> backbone=1
backbone=1, tail=1 -> venomous=0
tail=1 -> backbone=1, venomous=0
backbone=1, venomous=0 -> tail=1
backbone=1 -> venomous=0, tail=1
feathers=0 -> domestic=0
domestic=0 -> feathers=0
feathers=0 -> airborne=0
airborne=0 -> feathers=0
backbone=1 -> domestic=0
domestic=0 -> backbone=1
venomous=0 -> domestic=0
domestic=0 -> venomous=0
feathers=0 -> venomous=0
airborne=0 -> venomous=0
venomous=0 -> backbone=1
backbone=1 -> venomous=0
venomous=0 -> breathes=1
breathes=1 -> venomous=0
fins=0 -> domestic=0
domestic=0 -> fins=0
fins=0 -> breathes=1
breathes=1 -> fins=0
fins=0 -> venomous=0
venomous=0 -> fins=0
tail=1 -> backbone=1
backbone=1 -> tail=1
tail=1 -> venomous=0


Opravka imamo s podatki z razredom. Lahko ustvarimo pravila, ki napovedujejo razred?

V `OneHot.encode` dodamo parameter `include_class=True`, da se upošteva tudi razred.

In [5]:
X, mapping = OneHot.encode(data, include_class=True)

Želimo postavke z >40% podpore:

In [6]:
itemsets = dict(frequent_itemsets(X, .4))
len(itemsets)

520

The transaction-coded items corresponding to class values are:

In [7]:
class_items = {item 
               for item, var, _ in OneHot.decode(mapping, data, mapping) 
               if var is data.domain.class_var}
sorted(class_items)

[36, 37, 38, 39, 40, 41, 42]

That makes sense as our class variable has seven values:

In [8]:
data.domain.class_var.values

['amphibian', 'bird', 'fish', 'insect', 'invertebrate', 'mammal', 'reptile']

Now we can generate all association rules that have consequent equal to one of the class values and >80% confidence (i.e. classification rules):

In [9]:
rules = [(P, Q, supp, conf) 
         for P, Q, supp, conf in association_rules(itemsets, .8) 
         if len(Q) == 1 and Q & class_items]
len(rules)
rules

[(frozenset({2, 7, 17, 19, 20}), frozenset({41}), 41, 1.0),
 (frozenset({2, 7, 17, 19}), frozenset({41}), 41, 1.0),
 (frozenset({2, 7, 17, 20}), frozenset({41}), 41, 1.0),
 (frozenset({2, 7, 19, 20}), frozenset({41}), 41, 1.0),
 (frozenset({2, 17, 19, 20}), frozenset({41}), 41, 0.8723404255319149),
 (frozenset({7, 17, 19, 20}), frozenset({41}), 41, 1.0),
 (frozenset({2, 7, 17}), frozenset({41}), 41, 1.0),
 (frozenset({2, 7, 19}), frozenset({41}), 41, 1.0),
 (frozenset({2, 17, 19}), frozenset({41}), 41, 0.8367346938775511),
 (frozenset({7, 17, 19}), frozenset({41}), 41, 1.0),
 (frozenset({2, 7, 20}), frozenset({41}), 41, 1.0),
 (frozenset({7, 17, 20}), frozenset({41}), 41, 1.0),
 (frozenset({7, 19, 20}), frozenset({41}), 41, 1.0),
 (frozenset({2, 7}), frozenset({41}), 41, 1.0),
 (frozenset({7, 17}), frozenset({41}), 41, 1.0),
 (frozenset({7, 19}), frozenset({41}), 41, 1.0),
 (frozenset({7, 20}), frozenset({41}), 41, 1.0),
 (frozenset({7}), frozenset({41}), 41, 1.0)]

To make them more helpful, we can use mapping to transform the rules’ items back into table domain values, e.g. for first five rules:

In [10]:
names = {item: '{}={}'.format(var.name, val) 
         for item, var, val in OneHot.decode(mapping, data, mapping)}
for ante, cons, supp, conf in rules[:5]:
                              print(', '.join(names[i] for i in ante), '-->',
                                    names[next(iter(cons))],
                                    '(supp: {}, conf: {})'.format(supp, conf))

feathers=0, milk=1, backbone=1, breathes=1, venomous=0 --> type=mammal (supp: 41, conf: 1.0)
backbone=1, feathers=0, breathes=1, milk=1 --> type=mammal (supp: 41, conf: 1.0)
backbone=1, feathers=0, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0)
feathers=0, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0)
backbone=1, feathers=0, breathes=1, venomous=0 --> type=mammal (supp: 41, conf: 0.8723404255319149)


# CN2

In [11]:
learner = Orange.classification.CN2Learner()
classifier = learner(data)

In [12]:
# consider up to 10 solution streams at one time
learner.rule_finder.search_algorithm.beam_width = 10

# continuous value space is constrained to reduce computation time
learner.rule_finder.search_strategy.bound_continuous = True

# found rules must cover at least 15 examples
learner.rule_finder.general_validator.min_covered_examples = 15

# found rules must combine at most 2 selectors (conditions)
learner.rule_finder.general_validator.max_rule_length = 2

classifier = learner(data)

Induced rules can be quickly reviewed and interpreted. They are each of the form ‘if cond then predict class”. That is, a conjunction of selectors followed by the predicted class.

In [13]:
for rule in classifier.rule_list:
    print(rule, rule.curr_class_dist.tolist())

IF feathers!=0 THEN type=bird  [0, 20, 0, 0, 0, 0, 0]
IF milk!=0 THEN type=mammal  [0, 0, 0, 0, 0, 41, 0]
IF legs==0 AND toothed!=0 THEN type=fish  [0, 0, 13, 0, 0, 0, 3]
IF backbone==0 AND domestic==0 THEN type=invertebrate  [0, 0, 0, 7, 10, 0, 0]
IF TRUE THEN type=mammal  [4, 20, 13, 8, 10, 41, 5]


If no other rules fire, default rule (majority classification) is used. Specific to each individual rule inducer, the application of the default rule varies.