# Pós-Graduação - Ciência de Dados & Big Data

## Pontifícia Universidade Católica de Minas Gerais (PUC-MG)

### Aluno: Victor Hugo Negrisoli

### Regras de Associação

Importando as bibliotecas necessárias

In [71]:
import numpy as np
import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

#### Definindo o dataset

In [6]:
# Toy market-basket data: five transactions, each one a list of the items
# that were bought together.
dataset = [
    ['Leite', 'Cebola', 'Batata', 'Feijão', 'Ovos', 'Iogurte'],
    ['Arroz', 'Cebola', 'Batata', 'Feijão', 'Ovos', 'Iogurte'],
    ['Leite', 'Maçã', 'Feijão', 'Ovos'],
    ['Leite', 'Milho', 'Feijão', 'Iogurte'],
    ['Milho', 'Cebola', 'Feijão', 'Sorvete', 'Ovos']
]

In [8]:
# Echo the raw transactions for a quick visual check.
dataset

[['Leite', 'Cebola', 'Batata', 'Feijão', 'Ovos', 'Iogurte'],
 ['Arroz', 'Cebola', 'Batata', 'Feijão', 'Ovos', 'Iogurte'],
 ['Leite', 'Maçã', 'Feijão', 'Ovos'],
 ['Leite', 'Milho', 'Feijão', 'Iogurte'],
 ['Milho', 'Cebola', 'Feijão', 'Sorvete', 'Ovos']]

#### Criando o modelo e definindo os itemsets

In [10]:
# Learn the item vocabulary from the transactions and one-hot encode them
# in a single call.
transactionEncoder = TransactionEncoder()
transform = transactionEncoder.fit_transform(dataset)

In [11]:
# Inspect the encoded boolean matrix returned by the TransactionEncoder.
transform

array([[False,  True,  True,  True,  True,  True, False, False,  True,
        False],
       [ True,  True,  True,  True,  True, False, False, False,  True,
        False],
       [False, False, False,  True, False,  True,  True, False,  True,
        False],
       [False, False, False,  True,  True,  True, False,  True, False,
        False],
       [False, False,  True,  True, False, False, False,  True,  True,
         True]])

In [18]:
# Wrap the encoded matrix in a DataFrame whose columns are the item names
# learned by the encoder.
dados_transformados = pd.DataFrame(
    data=transform,
    columns=transactionEncoder.columns_,
)
dados_transformados.head()

Unnamed: 0,Arroz,Batata,Cebola,Feijão,Iogurte,Leite,Maçã,Milho,Ovos,Sorvete
0,False,True,True,True,True,True,False,False,True,False
1,True,True,True,True,True,False,False,False,True,False
2,False,False,False,True,False,True,True,False,True,False
3,False,False,False,True,True,True,False,True,False,False
4,False,False,True,True,False,False,False,True,True,True


In [20]:
# Mine the itemsets with support of at least 0.6, reported by item name
# instead of column index.
itemsets_frequentes = apriori(
    dados_transformados,
    use_colnames=True,
    min_support=0.6,
)
itemsets_frequentes

Unnamed: 0,support,itemsets
0,0.6,(Cebola)
1,1.0,(Feijão)
2,0.6,(Iogurte)
3,0.6,(Leite)
4,0.8,(Ovos)
5,0.6,"(Feijão, Cebola)"
6,0.6,"(Ovos, Cebola)"
7,0.6,"(Feijão, Iogurte)"
8,0.6,"(Feijão, Leite)"
9,0.8,"(Ovos, Feijão)"


#### Definindo as Regras de Associação usando a confiança como métrica (limiar mínimo de 0.7)

In [22]:
# Build the rules from the frequent itemsets, using confidence as the
# selection metric with a minimum threshold of 0.7.
regras_associacao = association_rules(
    itemsets_frequentes,
    min_threshold=0.7,
    metric='confidence',
)
regras_associacao

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Cebola),(Feijão),0.6,1.0,0.6,1.0,1.0,0.0,inf
1,(Ovos),(Cebola),0.8,0.6,0.6,0.75,1.25,0.12,1.6
2,(Cebola),(Ovos),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,(Iogurte),(Feijão),0.6,1.0,0.6,1.0,1.0,0.0,inf
4,(Leite),(Feijão),0.6,1.0,0.6,1.0,1.0,0.0,inf
5,(Ovos),(Feijão),0.8,1.0,0.8,1.0,1.0,0.0,inf
6,(Feijão),(Ovos),1.0,0.8,0.8,0.8,1.0,0.0,1.0
7,"(Ovos, Feijão)",(Cebola),0.8,0.6,0.6,0.75,1.25,0.12,1.6
8,"(Ovos, Cebola)",(Feijão),0.6,1.0,0.6,1.0,1.0,0.0,inf
9,"(Feijão, Cebola)",(Ovos),0.6,0.8,0.6,1.0,1.25,0.12,inf


#### Definindo as Regras de Associação usando o lift como métrica (limiar mínimo de 1.2)

In [23]:
# Rebuild the rules, this time selecting on lift with a minimum threshold
# of 1.2. This rebinds `regras_associacao`, replacing the confidence-based rules.
regras_associacao = association_rules(
    itemsets_frequentes,
    min_threshold=1.2,
    metric='lift',
)
regras_associacao

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Ovos),(Cebola),0.8,0.6,0.6,0.75,1.25,0.12,1.6
1,(Cebola),(Ovos),0.6,0.8,0.6,1.0,1.25,0.12,inf
2,"(Ovos, Feijão)",(Cebola),0.8,0.6,0.6,0.75,1.25,0.12,1.6
3,"(Feijão, Cebola)",(Ovos),0.6,0.8,0.6,1.0,1.25,0.12,inf
4,(Ovos),"(Feijão, Cebola)",0.8,0.6,0.6,0.75,1.25,0.12,1.6
5,(Cebola),"(Ovos, Feijão)",0.6,0.8,0.6,1.0,1.25,0.12,inf


In [26]:
# Record how many items each rule has on its left-hand side so rules can be
# filtered by antecedent size.
regras_associacao['antecedents_len'] = regras_associacao['antecedents'].apply(len)
regras_associacao

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_len
0,(Ovos),(Cebola),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1
1,(Cebola),(Ovos),0.6,0.8,0.6,1.0,1.25,0.12,inf,1
2,"(Ovos, Feijão)",(Cebola),0.8,0.6,0.6,0.75,1.25,0.12,1.6,2
3,"(Feijão, Cebola)",(Ovos),0.6,0.8,0.6,1.0,1.25,0.12,inf,2
4,(Ovos),"(Feijão, Cebola)",0.8,0.6,0.6,0.75,1.25,0.12,1.6,1
5,(Cebola),"(Ovos, Feijão)",0.6,0.8,0.6,1.0,1.25,0.12,inf,1


In [31]:
# Keep the rules that satisfy all three conditions: at least two antecedent
# items, confidence above 0.75 and lift above 1.2.
antecedente_composto = regras_associacao['antecedents_len'] >= 2
confianca_alta = regras_associacao['confidence'] > 0.75
lift_alto = regras_associacao['lift'] > 1.2
regras_associacao[antecedente_composto & confianca_alta & lift_alto]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_len
3,"(Feijão, Cebola)",(Ovos),0.6,0.8,0.6,1.0,1.25,0.12,inf,2


In [32]:
# Select the rules whose antecedent is exactly the item pair below.
antecedente_alvo = {'Ovos', 'Feijão'}
mesmo_antecedente = regras_associacao['antecedents'] == antecedente_alvo
regras_associacao[mesmo_antecedente]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_len
2,"(Ovos, Feijão)",(Cebola),0.8,0.6,0.6,0.75,1.25,0.12,1.6,2


### Análise de uma cesta de compras em Python

In [35]:
# Load the Online Retail workbook from the local 'dados' folder (the path is
# relative to the notebook's working directory) and preview the first rows.
dados = pd.read_excel('dados/Online Retail.xlsx')
dados.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [36]:
# Summary statistics of the numeric columns of the raw sales data.
dados.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


#### Preparando os dados

Remove os espaços em branco no início e no fim das descrições dos produtos (coluna Description)

In [37]:
# Trim leading and trailing whitespace from every product description; the
# descriptions are later used as grouping keys.
dados['Description'] = dados['Description'].str.strip()

Remove campos nulos na coluna InvoiceNo

In [38]:
# Keep only the rows that carry an invoice number.
dados = dados.dropna(axis=0, subset=['InvoiceNo'])

Converte a coluna InvoiceNo para String

In [42]:
# Store invoice numbers as text so string operations can be applied to them.
dados['InvoiceNo'] = dados['InvoiceNo'].astype(str)

Remove todos os registros que contenham a letra 'C' em InvoiceNo, pois eles correspondem a faturas canceladas.

In [46]:
# Flag the invoices whose number contains the letter 'C' and keep all the others.
fatura_com_c = dados['InvoiceNo'].str.contains('C')
dados = dados[~fatura_com_c]

In [47]:
# Summary statistics again, now that the rows whose InvoiceNo contains 'C'
# have been removed.
dados.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,532621.0,532621.0,397924.0
mean,10.239972,3.847621,15294.315171
std,159.593551,41.758023,1713.169877
min,-9600.0,-11062.06,12346.0
25%,1.0,1.25,13969.0
50%,3.0,2.08,15159.0
75%,10.0,4.13,16795.0
max,80995.0,13541.33,18287.0


Agrupando as vendas da França por InvoiceNo e Description e somando Quantity, de modo que cada linha seja uma fatura e cada coluna um produto.

In [58]:
# Restrict the data to French sales.
vendas_franca = dados[dados['Country'] == "France"]

# Total quantity of each product within each invoice.
quantidade_por_item = (
    vendas_franca
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
)

# Pivot to one row per invoice and one column per product; products absent
# from an invoice get a quantity of 0.
dados_agrupados = (
    quantidade_por_item
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [59]:
# Preview the first rows of the invoice-by-product quantity table.
dados_agrupados.head()

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Definindo 0 para os valores menores ou iguais a 0, e 1 para os valores maiores ou iguais a 1

In [60]:
def resumir_unidades_em_zero_e_um(x):
    """Collapse a purchased quantity into a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Quantity of a product in one invoice.

    Returns
    -------
    int
        1 when at least one unit was bought (``x >= 1``), otherwise 0.

    Notes
    -----
    The result is always 0 or 1. Quantities that are zero, negative,
    strictly between 0 and 1, or NaN all map to 0, so the function never
    falls through and returns ``None``.
    """
    # A single comparison covers every input: anything that is not
    # "at least one unit" (including NaN, whose comparisons are False) is absent.
    return 1 if x >= 1 else 0

In [61]:
# Apply the 0/1 conversion to every cell of the invoice-by-product table.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map; confirm the installed pandas version before switching.
itemsets = dados_agrupados.applymap(resumir_unidades_em_zero_e_um)

In [62]:
# Preview the first rows of the 0/1 encoded basket table.
itemsets.head()

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# Per-product summary statistics of the 0/1 table; since the cells are 0 or 1,
# each column mean is the share of invoices that contain that product.
itemsets.describe()

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,...,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,0.030612,0.015306,0.002551,0.005102,0.015306,0.017857,0.017857,0.002551,0.022959,0.015306,...,0.007653,0.005102,0.002551,0.002551,0.002551,0.017857,0.002551,0.010204,0.002551,0.005102
std,0.172485,0.122924,0.050508,0.071337,0.122924,0.132601,0.132601,0.050508,0.149965,0.122924,...,0.087258,0.071337,0.050508,0.050508,0.050508,0.132601,0.050508,0.100627,0.050508,0.071337
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Aplicando o algoritmo apriori e de regras de associação

In [67]:
# Frequent itemsets: product combinations with support of at least 0.07,
# reported by product name.
itemsets_frequentes = apriori(
    itemsets,
    use_colnames=True,
    min_support=0.07,
)

# Derive the association rules from those itemsets, selecting on lift with a
# minimum threshold of 1.
regras_associacao = association_rules(
    itemsets_frequentes,
    min_threshold=1,
    metric='lift',
)

In [68]:
# Preview the first generated rules.
regras_associacao.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE PINK),0.096939,0.102041,0.07398,0.763158,7.478947,0.064088,3.791383
1,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE GREEN),0.102041,0.096939,0.07398,0.725,7.478947,0.064088,3.283859
2,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.094388,0.096939,0.079082,0.837838,8.642959,0.069932,5.568878
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.096939,0.094388,0.079082,0.815789,8.642959,0.069932,4.916181
4,(ALARM CLOCK BAKELIKE GREEN),(POSTAGE),0.096939,0.765306,0.084184,0.868421,1.134737,0.009996,1.783673


Visualizando as regras com lift acima de 6 e confiança acima de 0.8

In [69]:
# Keep only the rules with lift above 6 and confidence above 0.8.
lift_acima_de_6 = regras_associacao['lift'] > 6
confianca_acima_de_08 = regras_associacao['confidence'] > 0.8
regras_associacao[lift_acima_de_6 & confianca_acima_de_08]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.094388,0.096939,0.079082,0.837838,8.642959,0.069932,5.568878
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.096939,0.094388,0.079082,0.815789,8.642959,0.069932,4.916181
76,(SET/6 RED SPOTTY PAPER PLATES),(SET/6 RED SPOTTY PAPER CUPS),0.127551,0.137755,0.122449,0.96,6.968889,0.104878,21.556122
77,(SET/6 RED SPOTTY PAPER CUPS),(SET/6 RED SPOTTY PAPER PLATES),0.137755,0.127551,0.122449,0.888889,6.968889,0.104878,7.852041
78,"(ALARM CLOCK BAKELIKE RED, POSTAGE)",(ALARM CLOCK BAKELIKE GREEN),0.086735,0.096939,0.071429,0.823529,8.495356,0.063021,5.117347
80,"(ALARM CLOCK BAKELIKE GREEN, POSTAGE)",(ALARM CLOCK BAKELIKE RED),0.084184,0.094388,0.071429,0.848485,8.989353,0.063483,5.977041
114,"(POSTAGE, SET/6 RED SPOTTY PAPER PLATES)",(SET/6 RED SPOTTY PAPER CUPS),0.107143,0.137755,0.102041,0.952381,6.91358,0.087281,18.107143
115,"(POSTAGE, SET/6 RED SPOTTY PAPER CUPS)",(SET/6 RED SPOTTY PAPER PLATES),0.117347,0.127551,0.102041,0.869565,6.817391,0.087073,6.688776
120,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796
121,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959
