### H3 - Existe algum tipo de relação entre produtos que gera uma compra "combinada"?

#### Importando pacotes

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

#### Lendo arquivos

In [2]:
arquivo = "../_data/clean/e_commerce_pedidos.csv"

In [3]:
pedidos_df = pd.read_csv(arquivo)

In [4]:
#Filtrando compras concluídas
pedidos_df = pedidos_df[pedidos_df['status'].isin(['Pagamento aprovado', 'Entregue'])]

In [5]:
pedidos_df.head()

Unnamed: 0.1,Unnamed: 0,id,data,data_pagamento,data_cancelamento,numero_pedido,status,pagamento,produto,sku,...,entrega_cidade,entrega_estado,entrega_cep,utm_source,utm_campaign,utm_medium,utm_content,utm_term,customizacao,id_cliente
0,1,117569438,04/08/2024 18:26:40,04/08/2024 18:27,,1042552530523300,Pagamento aprovado,Pix,BRACELETE AJUSTÁVEL O COM COROA,DFREFSP8P,...,Brasilia,Distrito Federal,72455490,,,,,,,31d69aed10
1,2,117385907,01/08/2024 19:08:02,01/08/2024 19:11,,1042552893870166,Pagamento aprovado,Pix,BRACELETE AJUSTÁVEL O COM COROA,DFREFSP8P,...,Varzea Nova(Santa Rita),Paraiba,58304500,,,,,,,3e109eedd5
8,9,117375808,01/08/2024 15:57:53,01/08/2024 15:59,,1042552398274508,Pagamento aprovado,Pix,BRACELETE AJUSTÁVEL O SIGNATURE,M7ZQ5L9JL,...,Canoas,Rio Grande do Sul,92310-240,,,,,,,ddef8dfacc
9,10,117190391,29/07/2024 16:41:15,29/07/2024 16:42,,1042552902283250,Pagamento aprovado,Pix,BRINCO ARGOLA SOL E LUA,JRM3YWN95,...,Lins,Sao Paulo,16400510,ig,MCLJ,1.0 - CAMP1.0 | LAL1-2% COMPRADORES - M2055BR ...,ADS-VADS_13_BRINCO LOVED COM PINGENTE3 - Copy,frio,,ce08f9bda2
10,11,117079172,27/07/2024 18:45:11,27/07/2024 18:45,,1042552861812035,Pagamento aprovado,Cartão de crédito,BRINCO ARGOLA SOL E LUA,JRM3YWN95,...,Pouso Alegre,Minas Gerais,37561899,,,,,,,c9233b915e


#### Selecionando colunas relevantes
Esta tabela está a nível de sku. Assim temos para cada linha o sku que está incluso no pedido.

In [6]:
pedidos = pedidos_df[['numero_pedido', 'sku']]

In [7]:
pedidos.head()

Unnamed: 0,numero_pedido,sku
0,1042552530523300,DFREFSP8P
1,1042552893870166,DFREFSP8P
8,1042552398274508,M7ZQ5L9JL
9,1042552902283250,JRM3YWN95
10,1042552861812035,JRM3YWN95


### Analisando produtos comprados em conjunto

#### Contabilizando produtos vendidos em conjunto

In [8]:
# Realiza o auto join para encontrar os produtos vendidos juntos no mesmo pedido
df_joined = pedidos.merge(pedidos, on="numero_pedido")

# Filtra as linhas onde os produtos são diferentes
df_joined = df_joined[df_joined['sku_x'] != df_joined['sku_y']]

# Agrupa e conta as combinações de produtos
#size conta linhas e inclui nulos, count conta valores e desconsidera nulos
df_resultado = df_joined.groupby(['sku_x', 'sku_y'])\
                        .agg(contagem=('numero_pedido', 'size'))\
                        .reset_index()\
                        .sort_values(by=['contagem'], ascending=False)

# Renomeia as colunas
df_resultado = df_resultado.rename(columns={'sku_x': 'sku', 'sku_y': 'sku_casado'})

In [9]:
df_resultado.head()

Unnamed: 0,sku,sku_casado,contagem
110,LSRE3YWF5,84WFBE5GN,2
37,84WFBE5GN,HL3J67375,2
28,74F6CQWEW,TMXCUUW38,2
20,74F6CQWEW,2XZUH896Q,2
131,THDCJT4YR,JWDBL2NB2,2


### Similaridade de Produtos

In [10]:
# Tabela Pedido, Produto e Quantidade
similaridade_produtos = pedidos_df[['numero_pedido', 'sku', 'quantidade']]

In [11]:
similaridade_produtos = similaridade_produtos.groupby(['numero_pedido', 'sku'])\
                        .agg(quantidade=('quantidade', 'size'))\
                        .reset_index()\
                        .sort_values(by=['quantidade'], ascending=False)

In [12]:
similaridade_produtos.head()

Unnamed: 0,numero_pedido,sku,quantidade
0,1042552147040598,2QALTPNGA,1
68,1042552829197063,THDCJT4YR,1
66,1042552829197063,3LP9NDQHZ,1
65,1042552828626309,74F6CQWEW,1
64,1042552762823964,HL3J67375,1


In [13]:
# Criando uma matriz de utilidade onde:
# - Linhas representam produtos
# - Colunas representam pedidos
# - Valores representam a quantidade comprada
matriz_utilidade = similaridade_produtos.pivot_table(index='sku',
                                       values='quantidade',
                                       columns='numero_pedido',
                                       fill_value=0)

In [14]:
matriz_utilidade.head()

numero_pedido,1042552147040598,1042552166282916,1042552221071163,1042552243758772,1042552266423422,1042552307781298,1042552314462921,1042552396756288,1042552398274508,1042552402220098,...,1042552870620243,1042552877128662,1042552893870166,1042552895610647,1042552902283250,1042552907925954,1042552912381732,1042552916150952,1042552979257588,1042552979344315
sku,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2QALTPNGA,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2XZUH896Q,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3LP9NDQHZ,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
6XY5VVVRE,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74F6CQWEW,0,0,0,0,0,0,1,1,0,0,...,0,1,0,1,0,0,0,0,0,0


In [15]:
# Calculando a similaridade cosseno entre os produtos com base em seus padrões de compra.
matriz_similaridade = cosine_similarity(matriz_utilidade)

In [16]:
print(matriz_similaridade)

[[1.         0.         0.28867513 0.         0.         0.21821789
  0.         0.         0.33333333 0.         0.         0.
  0.         0.         0.         0.         0.23570226 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         1.         0.         0.         0.33333333 0.18898224
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.2236068  0.         0.         0.35355339 0.
  0.5        0.         0.         0.15075567]
 [0.28867513 0.         1.         0.         0.16666667 0.18898224
  0.         0.35355339 0.57735027 0.         0.         0.
  0.         0.35355339 0.         0.         0.         0.
  0.         0.2236068  0.         0.         0.35355339 0.
  0.         0.         0.35355339 0.        ]
 [0.         0.         0.         1.         0.         0.37796447
  0.         0.         0.         0.         0

In [17]:
# Convertendo a matriz de similaridade em um DataFrame
df_similaridade = pd.DataFrame(matriz_similaridade, index=matriz_utilidade.index, columns=matriz_utilidade.index)

In [18]:
df_similaridade.head()

sku,2QALTPNGA,2XZUH896Q,3LP9NDQHZ,6XY5VVVRE,74F6CQWEW,84WFBE5GN,9M294TUTK,D56A2X2M7,DAGRBZZ8K,DFREFSP8P,...,JRM3YWN95,JWDBL2NB2,LSRE3YWF5,M7ZQ5L9JL,P7HXNQNUN,PPW5U8MVC,T5LJ2R3L6,T76T2L3YK,THDCJT4YR,TMXCUUW38
sku,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2QALTPNGA,1.0,0.0,0.288675,0.0,0.0,0.218218,0.0,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2XZUH896Q,0.0,1.0,0.0,0.0,0.333333,0.188982,0.0,0.0,0.0,0.0,...,0.0,0.223607,0.0,0.0,0.353553,0.0,0.5,0.0,0.0,0.150756
3LP9NDQHZ,0.288675,0.0,1.0,0.0,0.166667,0.188982,0.0,0.353553,0.57735,0.0,...,0.0,0.223607,0.0,0.0,0.353553,0.0,0.0,0.0,0.353553,0.0
6XY5VVVRE,0.0,0.0,0.0,1.0,0.0,0.377964,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74F6CQWEW,0.0,0.333333,0.166667,0.0,1.0,0.125988,0.0,0.235702,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.201008


In [19]:
# Adicionando uma coluna 'sku' ao DataFrame de similaridade para fácil referência.
df_similaridade['sku'] = df_similaridade.index

In [20]:
# Reordenando as colunas para que a coluna 'sku' seja a primeira.
cols = ['sku'] + [col for col in df_similaridade if col != 'sku']
df_similaridade = df_similaridade[cols]

In [21]:
df_similaridade.head()

sku,sku,2QALTPNGA,2XZUH896Q,3LP9NDQHZ,6XY5VVVRE,74F6CQWEW,84WFBE5GN,9M294TUTK,D56A2X2M7,DAGRBZZ8K,...,JRM3YWN95,JWDBL2NB2,LSRE3YWF5,M7ZQ5L9JL,P7HXNQNUN,PPW5U8MVC,T5LJ2R3L6,T76T2L3YK,THDCJT4YR,TMXCUUW38
sku,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2QALTPNGA,2QALTPNGA,1.0,0.0,0.288675,0.0,0.0,0.218218,0.0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2XZUH896Q,2XZUH896Q,0.0,1.0,0.0,0.0,0.333333,0.188982,0.0,0.0,0.0,...,0.0,0.223607,0.0,0.0,0.353553,0.0,0.5,0.0,0.0,0.150756
3LP9NDQHZ,3LP9NDQHZ,0.288675,0.0,1.0,0.0,0.166667,0.188982,0.0,0.353553,0.57735,...,0.0,0.223607,0.0,0.0,0.353553,0.0,0.0,0.0,0.353553,0.0
6XY5VVVRE,6XY5VVVRE,0.0,0.0,0.0,1.0,0.0,0.377964,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74F6CQWEW,74F6CQWEW,0.0,0.333333,0.166667,0.0,1.0,0.125988,0.0,0.235702,0.0,...,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.201008


In [22]:
# Despivotando a tabela
df_unpivot = df_similaridade.melt(id_vars=['sku'], var_name='sku_casado', value_name='similaridade')\
                             .sort_values(by=['similaridade'], ascending=False)

#Filtrando produtos diferentes
df_produtos_semelhantes = df_unpivot[df_unpivot['sku'] != df_unpivot['sku_casado']]

# Calculando o valor do 75º percentil (último quartil) da coluna 'valor_venda'
ultimo_quartil = df_produtos_semelhantes['similaridade'].quantile(0.75)

# Filtrando os dados que estão no último quartil
df_produtos_semelhantes = df_produtos_semelhantes[df_produtos_semelhantes['similaridade'] >= ultimo_quartil]

In [23]:
print(ultimo_quartil)

0.0


In [24]:
display(df_produtos_semelhantes)

Unnamed: 0,sku,sku_casado,similaridade
370,9M294TUTK,F8ZW4347R,0.707107
490,FZ7P6U3PP,HRPKRPNYD,0.707107
409,HRPKRPNYD,FZ7P6U3PP,0.707107
181,F8ZW4347R,9M294TUTK,0.707107
747,JWDBL2NB2,THDCJT4YR,0.632456
...,...,...,...
300,LSRE3YWF5,E7SLSQLCC,0.000000
301,M7ZQ5L9JL,E7SLSQLCC,0.000000
302,P7HXNQNUN,E7SLSQLCC,0.000000
303,PPW5U8MVC,E7SLSQLCC,0.000000


### Resultados

#### Produtos comprados em conjunto:

In [25]:
#Produto e descrição
produtos = pedidos_df[['sku', 'produto']].drop_duplicates(subset=['sku', 'produto'])


produtos.head()

Unnamed: 0,sku,produto
0,DFREFSP8P,BRACELETE AJUSTÁVEL O COM COROA
8,M7ZQ5L9JL,BRACELETE AJUSTÁVEL O SIGNATURE
9,JRM3YWN95,BRINCO ARGOLA SOL E LUA
11,TMXCUUW38,BRINCO PINGENTE SIGNATURE
13,74F6CQWEW,BRINCO PEQUENO SIGNATURE


In [26]:
df_resultado_prod_conj = pd.merge(df_resultado, produtos, on='sku', how='left')

df_resultado_prod_conj = df_resultado_prod_conj.merge(produtos, left_on='sku_casado', right_on='sku', how='left')

df_resultado_prod_conj = df_resultado_prod_conj[['sku_x', 'sku_casado', 'contagem', 'produto_x','produto_y']].rename(columns={
    'sku_x': 'sku',
    'sku_casado': 'sku_casado',
    'contagem': 'contagem',
    'produto_x': 'produto',
    'produto_y': 'produto_casado'
})

df_resultado_prod_conj.head()

Unnamed: 0,sku,sku_casado,contagem,produto,produto_casado
0,LSRE3YWF5,84WFBE5GN,2,BRINCO DE ARGOLA PINGENTE CORAÇÃO,PINGENTE LIFE INFINITO FOREVER
1,84WFBE5GN,HL3J67375,2,PINGENTE LIFE INFINITO FOREVER,BRINCO LIVRE PARA AMAR PEQUENO
2,74F6CQWEW,TMXCUUW38,2,BRINCO PEQUENO SIGNATURE,BRINCO PINGENTE SIGNATURE
3,74F6CQWEW,2XZUH896Q,2,BRINCO PEQUENO SIGNATURE,ARGOLA CRAVEJADA COM TARRAXA
4,THDCJT4YR,JWDBL2NB2,2,BRACELETE MALEÁVEL 18,CHARM PROTEÇÃO


#### Produtos Semelhantes

In [27]:
df_resultado_prod_semel = pd.merge(df_produtos_semelhantes, produtos, on='sku', how='left')

df_resultado_prod_semel = df_resultado_prod_semel.merge(produtos, left_on='sku_casado', right_on='sku', how='left')

df_resultado_prod_semel = df_resultado_prod_semel[['sku_x', 'sku_casado', 'similaridade', 'produto_x','produto_y']].rename(columns={
    'sku_x': 'sku',
    'sku_casado': 'sku_casado',
    'similaridade': 'similaridade',
    'produto_x': 'produto',
    'produto_y': 'produto_casado'
})

df_resultado_prod_semel.head()

Unnamed: 0,sku,sku_casado,similaridade,produto,produto_casado
0,9M294TUTK,F8ZW4347R,0.707107,BRACELETE MALEÁVEL 19,CHARM DIMOND
1,FZ7P6U3PP,HRPKRPNYD,0.707107,CHARM PENDENTE DIVISÍVEL CORAÇÃO DA AMIZADE,CHARM MOM
2,HRPKRPNYD,FZ7P6U3PP,0.707107,CHARM MOM,CHARM PENDENTE DIVISÍVEL CORAÇÃO DA AMIZADE
3,F8ZW4347R,9M294TUTK,0.707107,CHARM DIMOND,BRACELETE MALEÁVEL 19
4,JWDBL2NB2,THDCJT4YR,0.632456,CHARM PROTEÇÃO,BRACELETE MALEÁVEL 18
