# Ganho de Informação

In [64]:
import numpy as np
import pandas as pd
import seaborn as sns

## Descrição do dataset mushrooms

In [63]:
# Load the mushroom dataset and report its dimensions before previewing it.
mushrooms = pd.read_csv('mushrooms.csv')
# DataFrame.shape is (rows, columns), so unpacking it fills both placeholders.
print('Total de linhas = {}\nTotal de colunas = {}'.format(*mushrooms.shape))
# Bare last expression so the notebook renders the first rows as a table.
mushrooms.head()

Total de linhas = 8124
Total de colunas = 23


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Informações dos atributos:
* **class: edible=e, poisonous=p**

* cap-shape: bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s
* cap-surface: fibrous=f, grooves=g, scaly=y, smooth=s
* cap-color: brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y
* bruises: bruises=t, no=f
* odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s
* gill-attachment: attached=a, descending=d, free=f, notched=n
* gill-spacing: close=c, crowded=w, distant=d
* gill-size: broad=b, narrow=n
* gill-color: black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y
* stalk-shape: enlarging=e, tapering=t
* stalk-root: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?
* stalk-surface-above-ring: fibrous=f, scaly=y, silky=k, smooth=s
* stalk-surface-below-ring: fibrous=f, scaly=y, silky=k, smooth=s
* stalk-color-above-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
* stalk-color-below-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
* veil-type: partial=p, universal=u
* veil-color: brown=n, orange=o, white=w, yellow=y
* ring-number: none=n, one=o, two=t
* ring-type: cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z
* spore-print-color: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y
* population: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y
* habitat: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d

In [6]:
# Count NaN/None entries per column (axis=0 collapses the rows).
# NOTE(review): the attribute list in this notebook says stalk-root encodes
# missing values as '?'; isna() would not count those unless they were
# converted to NaN when the CSV was read -- confirm.
mushrooms.isna().sum(axis=0)

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

## Entropia

In [36]:
def entropy(dataframe, target):
    """Shannon entropy, in bits, of the ``target`` column of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows over which the label distribution is measured.
    target : str
        Name of the column holding the class labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the labels that occur in ``target``.
        ``0.0`` when the frame is empty or holds a single class.

    Notes
    -----
    Missing labels (NaN/None) are ignored: proportions are taken over the
    rows that actually carry a label, so they always sum to 1.
    """
    # One pass over the column instead of one boolean filter per class.
    proportions = dataframe[target].value_counts(normalize=True)

    # Categorical columns report unused categories with proportion 0, and
    # 0 * log2(0) evaluates to NaN, so keep only labels that actually occur.
    proportions = proportions[proportions > 0]

    # Subtract from 0.0 (rather than negating) so a pure node yields 0.0
    # instead of -0.0.
    return 0.0 - (proportions * np.log2(proportions)).sum()

In [87]:
# Entropy of the class label over the whole dataset, before any split.
entropy_root = entropy(dataframe=mushrooms, target='class')
entropy_root

0.9990678968724603

## Ganho de Informação

In [40]:
def informationGain(dataframe, attribute, target):
    """Information gain of splitting ``dataframe`` on ``attribute``.

    Starts from the entropy of ``target`` over the whole frame and subtracts,
    for every distinct value of ``attribute``, the entropy of the matching
    rows weighted by their share of the frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to evaluate the split on.
    attribute : str
        Column whose distinct values define the partitions.
    target : str
        Column holding the class labels.

    Returns
    -------
    float
        Entropy of the full frame minus the weighted entropy of its partitions.
    """
    total_rows = len(dataframe)
    gain = entropy(dataframe, target)

    for value in dataframe[attribute].unique():
        # Build the partition once; it supplies both the weight and the entropy.
        partition = dataframe[dataframe[attribute] == value]
        weight = len(partition) / total_rows
        gain -= weight * entropy(partition, target)

    return gain

In [61]:
# Every column except the label is a candidate attribute for the split.
index = mushrooms.columns.drop('class')
# One row per attribute, holding its information gain with respect to 'class'.
IG = pd.DataFrame(
    {'Information Gain': [informationGain(mushrooms, column, 'class') for column in index]},
    index=index,
)
# Rank attributes from most to least informative.
IG.sort_values(by='Information Gain', ascending=False)

Unnamed: 0,Information Gain
odor,0.906075
spore-print-color,0.480705
gill-color,0.416978
ring-type,0.318022
stalk-surface-above-ring,0.284726
stalk-surface-below-ring,0.271894
stalk-color-above-ring,0.253845
stalk-color-below-ring,0.241416
gill-size,0.230154
population,0.201958


In [102]:
# Entropy of the class label within each odor value, in order of first appearance.
for label in mushrooms['odor'].unique():
    print(label, ':', entropy(mushrooms.loc[mushrooms['odor'].eq(label)], 'class'))

p : 0.0
a : 0.0
l : 0.0
n : 0.2141367567812511
f : 0.0
c : 0.0
y : 0.0
s : 0.0
m : 0.0
