![logo](images/untumbes.PNG)

<center><b>Prof. Dr. Jorge Zavaleta - zavaleta.jorge@gmail.com</b></center>

# Algoritmo Apriori

In [None]:
#librarys
import pandas as pd
import numpy as np
import random
#
import heapq
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
#
from surprise import Dataset, Reader, KNNBasic
# ml
from surprise.model_selection import train_test_split

# graphics
import matplotlib.pylab as plt
%matplotlib inline
#
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Step 1: build a small hand-written list of transactions (one inner list
# per basket). The last basket intentionally keeps its repeated 'Cebola'.
data = [
    ['Leite', 'Cebola', 'Noz-Moscada', 'Feijão', 'Ovos', 'Iogurte'],
    ['Dill', 'Cebola', 'Noz-Moscada', 'Feijão', 'Ovos', 'Iogurte'],
    ['Leite', 'Maçã', 'Feijão', 'Ovos'],
    ['Leite', 'Milho', 'Noz-Moscada', 'Feijão', 'Iogurte'],
    ['Milho', 'Cebola', 'Cebola', 'Feijão', 'Sorvete', 'Ovos'],
]
data

In [None]:
# Step 2: preprocessing - learn the item vocabulary from the transactions,
# then turn them into a table with one column per item.
te = TransactionEncoder()
te.fit(data)
te_ary = te.transform(data)
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df.head()

In [None]:
# Step 3: run Apriori, keeping itemsets with support of at least 0.6 and
# labelling them with the item names instead of column indices.
frequent_itemsets = apriori(
    df,
    use_colnames=True,
    min_support=0.6,
)
frequent_itemsets

In [None]:
# Derive association rules (confidence threshold 0.5) from the frequent
# itemsets and order them by confidence, then lift, both descending.
verduras_rules = (
    association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
verduras_rules.head(10)

>## Análisis?

>## Ventas online - Ejemplo 2

In [None]:
# Load the Online Retail workbook into a DataFrame and preview it.
# The path is relative, so it is resolved against the notebook's working
# directory. Note that this rebinds `data`, which previously held the toy
# transaction list from Example 1.
data = pd.read_excel('data/Online_Retail.xlsx')
data.head()

In [None]:
# Dimensions of the loaded table, shown as (rows, columns).
data.shape

In [None]:
# List the column labels of the loaded table.
data.columns

In [None]:
# Distinct values found in the Country column.
data['Country'].unique()

>### Limpieza de datos

In [None]:
# Remove leading/trailing whitespace from every product description.
# The column is overwritten in place; the row count is unchanged, which the
# trailing `data.shape` lets the reader confirm.
data['Description'] = data['Description'].str.strip()
data.shape

In [None]:
# Discard rows that have no invoice number, then store the invoice number
# as text so string operations can be applied to it afterwards.
data.dropna(subset=['InvoiceNo'], axis=0, inplace=True)
data['InvoiceNo'] = data['InvoiceNo'].astype(str)
data.shape

In [None]:
# Keep only the rows whose invoice number does not contain the letter 'C'
# (per the original note, those invoices are the credit transactions).
data = data.loc[~data['InvoiceNo'].str.contains('C')]
data.shape

>### Dividir los datos según la región de la transacción

In [None]:
# Basket table for France: one row per invoice, one column per product
# description, each cell holding the summed quantity (0 where the product
# does not appear in the invoice).
cesta_France = (
    data.loc[data['Country'].eq("France")]
    .groupby(by=['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)
cesta_France.head()

In [None]:
# Basket table for the United Kingdom: one row per invoice, one column per
# product description, cells hold the summed quantity (0 where absent).
cesta_UK = (
    data.loc[data['Country'].eq("United Kingdom")]
    .groupby(by=['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [None]:
# Basket table for Portugal: one row per invoice, one column per product
# description, cells hold the summed quantity (0 where absent).
cesta_Por = (
    data.loc[data['Country'].eq("Portugal")]
    .groupby(by=['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [None]:
# Basket table for Sweden: one row per invoice, one column per product
# description, cells hold the summed quantity (0 where absent).
cesta_Sweden = (
    data.loc[data['Country'].eq("Sweden")]
    .groupby(by=['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

>### Codificación one-hot de los datos

In [None]:
# Presence/absence encoder applied cell by cell to the basket tables so the
# summed quantities become the 0/1 flags passed to the itemset mining step.
def hot_encode(x):
    """Map a quantity to a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Quantity of one product in one invoice.

    Returns
    -------
    int
        1 when ``x >= 1``; 0 for every other value (zero, negatives,
        fractional values below 1 and NaN).
    """
    # A single comparison makes the function total: values strictly between
    # 0 and 1, and NaN (for which every comparison is False), yield 0
    # instead of falling through and returning None.
    return 1 if x >= 1 else 0

In [None]:
# Convert the France basket quantities into 0/1 flags, cell by cell.
# Both names end up bound to the same encoded table.
cesta_France = cesta_encoded = cesta_France.applymap(hot_encode)
cesta_France.head()

In [None]:
# Convert the United Kingdom basket quantities into 0/1 flags.
cesta_UK = cesta_encoded = cesta_UK.applymap(hot_encode)

In [None]:
# Convert the Portugal basket quantities into 0/1 flags.
cesta_Por = cesta_encoded = cesta_Por.applymap(hot_encode)

In [None]:
# Convert the Sweden basket quantities into 0/1 flags.
cesta_Sweden = cesta_encoded = cesta_Sweden.applymap(hot_encode)

>### Construyendo los modelos y analizando los resultados
>#### Francia

In [None]:
# Build the model: frequent itemsets for the France baskets, keeping those
# with support of at least 5% and labelling them with product names.
frq_items_france = apriori(
    cesta_France,
    use_colnames=True,
    min_support=0.05,
)

In [None]:
# Association rules for France filtered on lift (threshold 1), ordered by
# confidence and then lift, both descending.
rules_france = (
    association_rules(frq_items_france, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules_france.head()

>#### Reino Unido (United Kingdom)

In [None]:
# Frequent itemsets for the United Kingdom baskets (minimum support 1%).
frq_items_ru = apriori(cesta_UK, use_colnames=True, min_support=0.01)
# Rules filtered on lift (threshold 1), ordered by confidence and then
# lift, both descending.
rules_ru = (
    association_rules(frq_items_ru, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules_ru.head()

>#### Portugal

In [None]:
# Frequent itemsets for the Portugal baskets (minimum support 5%).
frq_items_pt = apriori(cesta_Por, use_colnames=True, min_support=0.05)
# Rules filtered on lift (threshold 1), ordered by confidence and then
# lift, both descending.
rules_pt = (
    association_rules(frq_items_pt, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules_pt.head()

>#### Suecia

In [None]:
# Frequent itemsets for the Sweden baskets (minimum support 5%).
frq_items_su = apriori(cesta_Sweden, use_colnames=True, min_support=0.05)
# Rules filtered on lift (threshold 1), ordered by confidence and then
# lift, both descending.
rules_su = (
    association_rules(frq_items_su, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules_su.head()

>## Análisis?

>## Minorista - Ejemplo 3

In [None]:
# Load the retail dataset from a comma-separated file (relative path, so it
# is resolved against the notebook's working directory) and preview the
# first ten rows.
mino_df = pd.read_csv('data/retail_dataset.csv', sep=',')
mino_df.head(10)

In [None]:
# Dimensions of the retail table, shown as (rows, columns).
mino_df.shape

>## Limpieza de datos

In [None]:
# Handle missing values: every NaN is overwritten with 0, in place, so the
# empty slots can be filtered out of each row afterwards.
mino_df.replace(to_replace=np.nan, value=0, inplace=True)
mino_df.head(10)

In [None]:
# Helper that strips the 0 placeholders out of a list of items.
def removerTodosZerosLista(lista):
    """Return a new list holding the elements of ``lista`` that are not equal to 0."""
    return [elemento for elemento in lista if elemento != 0]

In [None]:
# Turn the table into a list of transactions: each row becomes a plain
# Python list from which the 0 placeholders have been removed.
lista_todas_transacciones = [
    removerTodosZerosLista(fila.values.tolist())
    for _, fila in mino_df.iterrows()
]

# Preview the first ten transactions.
lista_todas_transacciones[0:10]

>## Procesamiento de datos

En el análisis de asociación, el conjunto de datos debe estar codificado con valores 1 y 0, o con valores booleanos (Verdadero/Falso).

In [None]:
# One-hot encode the transactions: learn the item vocabulary first, then
# build a table with one column per item.
temp = TransactionEncoder()
temp.fit(lista_todas_transacciones)
temp_df = temp.transform(lista_todas_transacciones)
minorista_df = pd.DataFrame(data=temp_df, columns=temp.columns_)
minorista_df.head()

>## Algoritmo apriori

El conjunto de datos ya es adecuado para el análisis de asociaciones. El siguiente paso es calcular e interpretar los valores de soporte y confianza.

In [None]:
# Mine frequent itemsets with a minimum support of 20%.
mino_items = apriori(minorista_df, min_support=0.20, use_colnames=True, verbose = 1)
# sort_values returns a new frame rather than reordering mino_items, so the
# sorted view is built and displayed in the same expression. mino_items
# itself keeps the row order produced by apriori for the cells that follow.
mino_items.sort_values(by="support", ascending=False)[['itemsets', 'support']].head(15)

In [None]:
# Derive association rules (confidence threshold 0.5) from the retail
# itemsets and order them by confidence, then lift, both descending.
mino_rules = (
    association_rules(mino_items, metric="confidence", min_threshold=0.5)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
mino_rules.head(10)

>### Análisis

> Podemos interpretar el dataset resultante de la siguiente manera:
> 
> Valor de soporte -> muestra que la regla (Leche, Carne) → (Queso) tiene un soporte de 0.2, es decir, los tres productos aparecen juntos en aproximadamente el 20% de las transacciones.
> 
> Valor de Confianza -> muestra que el 83% de los clientes que compran Leche y Carne también compran Queso.
> 
> Valor de elevación (lift) -> muestra que la probabilidad de comprar (Queso) es 1.65 veces mayor cuando la compra incluye Leche y Carne que en una compra cualquiera.

---
<center><b>&copy;Jorge Zavaleta, 2024</b></center>