# Articles achetés ensemble
Dans ce notebook, nous cherchons à identifier les articles qui sont fréquemment achetés l'un après l'autre : les cas où quand un client achète l'article A, il y a une probabilité élevée qu'il achète l'article B.

In [1]:
import pandas as pd
import numpy as np
import pickle
import swifter
import gc

In [2]:
# Création d'un index pour les IDs d'article.
articles = pd.read_pickle('pickles/articles_clean.pkl')
articles = articles[['article_id']].reset_index()

In [3]:
# Chargement des transactions et optimisation.
transactions = pd.read_pickle('pickles/transactions_clean.pkl')
transactions = transactions.merge(articles, on = 'article_id', how = 'left').drop(columns = 'article_id', axis = 1)

In [4]:
# Réduction à la moitié des utilisateurs.
customers = transactions['customer_id'].unique()
np.random.shuffle(customers)

customers = customers[0: len(customers) // 4]

transactions = transactions[transactions['customer_id'].isin(customers)]

In [5]:
transactions = transactions.groupby(['customer_id', 'index'], as_index = False, sort = False).agg(
    sales = ('index', 'count'),
    week_number = ('week_number', 'max')
).sort_values('week_number', ascending = False)

In [6]:
sales = transactions.groupby('index', as_index = False, sort = False).agg(
    sales = ('customer_id', 'count')
)

In [7]:
purchase_lists = transactions.groupby(['customer_id'], as_index = False, sort = False).agg(
    purchases = ('index', lambda x: list(x)),
    weeks = ('week_number', lambda x: list(x))
)

In [8]:
pairs_dictionnary = {}

for i in range(len(articles)):
    pairs_dictionnary[i] = {}

total_lists = len(purchase_lists)
current_line = 0


def process_purchase_list(purchase_list):
    global total_lists
    global current_line

    #print(f"\r Insertion des paires pour la ligne {current_line} / {total_lists}", end="")
    
    length = len(purchase_list)
    
    for i in range(0, length):
        for j in range(i + 1, length):
                
            article_source = purchase_list[i]
            article_dest = purchase_list[j]
            
            if article_dest not in pairs_dictionnary[article_source]:
                pairs_dictionnary[article_source][article_dest] = 0
                
            pairs_dictionnary[article_source][article_dest] += 1
            
    current_line += 1
            
blank = purchase_lists['purchases'].swifter.apply(process_purchase_list)

Pandas Apply: 100%|██████████| 340570/340570 [01:16<00:00, 4425.02it/s] 


In [9]:
sales_by_index = articles[['index']].merge(sales, on = 'index', how = 'left').fillna(0)['sales'].values

In [18]:
sales_by_index

array([1.698e+03, 1.376e+03, 5.100e+01, ..., 1.000e+00, 6.000e+00,
       7.000e+00])

In [23]:
### Mise au format liste pour insertion dans un DataFrame

pairs_dataframes = []
articles_by_chunks = 10000
current_index = 0

# Traitement par paquets de 10 000 
while current_index < len(pairs_dictionnary):
    pairs_list = []
    # Boucle sur les articles sources correspondants au paquet
    for article_source in range(current_index, min(current_index + articles_by_chunks, len(pairs_dictionnary))):
        
        print(f"\rProcessing article {article_source}", end="")
        
        for article_dest in pairs_dictionnary[article_source].keys():
            ratio = pairs_dictionnary[article_source][article_dest] / sales_by_index[article_source]
            
            if(ratio < 0.05):
                continue 
            
            pairs_list.append([
                article_source,
                article_dest,
                ratio
            ])
            
    # Création d'un dataframe
    pairs = pd.DataFrame(pairs_list, columns = ['index_source', 'index_dest', 'ratio'])
    pairs['index_source'] = pairs['index_source'].astype('int32')
    pairs['index_dest'] = pairs['index_dest'].astype('int32')
    pairs['ratio'] = pairs['ratio'].astype('float16')
    
    pairs_dataframes.append(pairs)
    
    current_index += articles_by_chunks

Processing article 104546

In [24]:
pairs = pd.concat(pairs_dataframes)

In [29]:
pairs.to_pickle('pickles/articles_pairs.pkl')
pairs.head()

Unnamed: 0,index_source,index_dest,ratio
0,0,53832,0.058899
1,0,1711,0.065369
2,0,1,0.137207
3,0,72378,0.060669
4,0,24808,0.050049


### Création des listings par Client
#### Pour soumission Kaggle

In [50]:
### CHECKPOINT
import pandas as pd
import gc

articles = pd.read_pickle('pickles/articles_clean.pkl')
articles = articles[['article_id']].reset_index()

pairs = pd.read_pickle('pickles/articles_pairs.pkl')

In [51]:
# Chargement de la liste des transactions 
transactions = pd.read_pickle('pickles/transactions_clean.pkl')
transactions = transactions[transactions['week_number'] < 54]

transactions = transactions.merge(articles, on = 'article_id', how = 'left').drop(columns = 'article_id', axis = 1)

transactions = transactions.groupby(['customer_id', 'index'], as_index = False).agg(
    week_number = ('week_number', 'min')
)

In [57]:
### Élaboration des listes par paquets

customer_ids = transactions['customer_id'].unique()

current_index = 0
customers_by_chunks = 20000

dataframes = []

while current_index < len(customer_ids):
    print(f"\rTraitement des utilisateur {current_index} - {current_index + customers_by_chunks}", end = "")
    
    # Sélection d'un nombre restreint d'utilisateurs
    customers_in_chunk = customer_ids[current_index: current_index + customers_by_chunks]
    transactions_in_chunk = transactions[transactions['customer_id'].isin(customers_in_chunk)]

    # Détermination des paires.
    pairs_in_chunk = transactions_in_chunk.merge(pairs, left_on='index', right_on = 'index_source', how='inner')
    
    # Suppression des lignes où l'article appairé a déjà été acheté.
    pairs_in_chunk = pairs_in_chunk.merge(transactions_in_chunk, left_on=['customer_id', 'index_dest'], right_on=['customer_id', 'index'], how = 'left', suffixes = ('__source', '__dest'))

    pairs_in_chunk['week_number__dest'].fillna(100, inplace = True)

    pairs_in_chunk = pairs_in_chunk[pairs_in_chunk['week_number__source'] < pairs_in_chunk['week_number__dest']]
    
    # Ajout de l'ID article originel.
    pairs_in_chunk = pairs_in_chunk.merge(articles, left_on = 'index_dest', right_on = 'index')
    
    pairs_in_chunk = pairs_in_chunk.sort_values('ratio', ascending = False)
    
    # Finalisation de la liste.
    pairs_list = pairs_in_chunk.groupby('customer_id', as_index = False, sort = False).agg(
        list = ('article_id', lambda x: list(x))
    )
    pairs_list['list'] = pairs_list['list'].apply(lambda x: x[0:101])
    
    dataframes.append(pairs_list)
    
    # Passage au paquet suivant.
    current_index += customers_by_chunks


Traitement des utilisateur 1000000 - 1020000

In [58]:
pairs_list = pd.concat(dataframes)

In [59]:
pairs_list.to_pickle("pickles/articles_pairs_list.pkl")
pairs_list.head()

Unnamed: 0,customer_id,list
0,02054367e574c7287a453fb3ae3174b8d5fa968ab1c418...,"[0832482001, 0820484001, 0837351001, 083441201..."
1,04e2301dc0581c5b443429e1614cf21ffe9b23d8d3fe05...,"[0706104010, 0783969004, 0834799001, 068581600..."
2,0456515f18cdd341dbb0077191bcd67e9e7738d81ed4f9...,"[0847959002, 0803592003, 0798827001, 077290200..."
3,0444412a01c8d761fa55b19bd29cf07bd80bfcbe82275a...,"[0568601029, 0448515033, 0671444001, 079621000..."
4,00336f6224d49f63d4c1f18315f0e82789f31be24506fa...,"[0489435015, 0448509014, 0850259001, 064002101..."


#### Soumission Kaggle

In [64]:
pairs_list['prediction'] = pairs_list['list'].apply(lambda x: ' '.join(x[0:12]))

customers = pd.read_pickle('pickles/customers_clean.pkl')

submission = customers[['customer_id']].merge(pairs_list[['customer_id', 'prediction']], on = "customer_id", how = 'left').fillna('')

submission.to_csv('submissions/submission_items_purchased_together.csv', index = False)
submission

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0805000001 0877278002 0673396002 0825579001 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0805406003 0573085042 0542402001 0792469001 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0727808002 0858883001 0699080001 0685813005 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0732206001 0896152001 0791587001 0927530006 08...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0822311003 0872820004 0827957001 0822311012 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0813406005 0470789019 0892558004 0918292001 08...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0835561002 0795975003 0706016003 05...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0817166001


: 

**Score MAP@12 de la soumission Kaggle : 0.00361**

#### Pour entraînement
On enlève alors les données de la dernière semaine, et on n'élabore la liste que pour les clients qui ont des transactions à la dernière semaine.

En principe il faudrait aussi regénérer entièrement la liste des paires. Nous avons choisi de ne pas le faire à ce stade, pour des raisons de temps de calcul.

In [None]:
# Chargement de la liste des transactions 
transactions = pd.read_pickle('pickles/transactions_clean.pkl')

customer_ids = transactions[transactions['week_number'] == 0]['customer_id'].unique()
transactions = transactions[(transactions['week_number'] < 54) & (transactions['week_number'] > 0) & (transactions['customer_id'].isin(customer_ids))]

transactions = transactions.merge(articles, on = 'article_id', how = 'left').drop(columns = 'article_id', axis = 1)

transactions = transactions.groupby(['customer_id', 'index'], as_index = False).agg(
    week_number = ('week_number', 'min')
)

In [47]:
pairs = pd.read_pickle('pickles/articles_pairs.pkl')

# Détermination des paires.
pairs = transactions.merge(pairs, left_on='index', right_on = 'index_source', how='inner')

# Suppression des lignes où l'article appairé a déjà été acheté.
pairs = pairs.merge(transactions, left_on=['customer_id', 'index_dest'], right_on=['customer_id', 'index'], how = 'left', suffixes = ('__source', '__dest'))

pairs['week_number__dest'].fillna(100, inplace = True)

pairs = pairs[pairs['week_number__source'] < pairs['week_number__dest']]

# Ajout de l'ID article originel.
pairs = pairs.merge(articles, left_on = 'index_dest', right_on = 'index')

# Finalisation de la liste.
pairs_list = pairs.groupby('customer_id', as_index = False, sort = False).agg(
    list = ('article_id', lambda x: list(x))
)
pairs_list['list'] = pairs_list['list'].apply(lambda x: x[0:101])
    


In [48]:
pairs_list.to_pickle("pickles/articles_pairs_list_training.pkl")
pairs_list.head()

Unnamed: 0,customer_id,list
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,"[0111593001, 0111593001, 0123173001, 011158600..."
1,000fb6e772c5d0023892065e659963da90b1866035558e...,"[0111593001, 0111593001, 0111593001, 011159300..."
2,0073569a706784581f7916cbc61e6af44c9fa52eae38e4...,"[0111593001, 0111593001, 0123173001, 012317300..."
3,0127e534d8e740d7edfd0a32147a5606b82c9fd2bcf043...,"[0111593001, 0123173001, 0111586001, 015834000..."
4,017aea02140c5a2efed108966b7c9dca8719caf028e924...,"[0111593001, 0111593001, 0111586001, 011158600..."
