In [1]:
import pandas as pd
import numpy as np
import pickle

In [3]:
# Load the transactions DataFrame from its pickle checkpoint.
transactions = pd.read_pickle('pickles/transactions_clean.pkl')

In [3]:
# Keep only the rows whose week_number is below 20.
# NOTE(review): a comment further down this notebook says week numbers count
# backwards from the end of the dataset, which would make this the 20 most
# recent weeks — confirm.
recent_weeks_mask = transactions['week_number'].lt(20)
transactions = transactions.loc[recent_weeks_mask]

In [4]:
# Downcast article ids to 32-bit integers.
# `astype` with a column mapping returns a new frame, so nothing is assigned
# onto the boolean-filtered slice produced by the previous cell (that pattern
# raises SettingWithCopyWarning).
transactions = transactions.astype({'article_id': 'int32'})

In [5]:
# Collapse the transactions to one row per (customer, article):
#   sales       -> number of transaction rows for that pair
#   week_number -> largest week_number observed for that pair
customer_article_groups = transactions.groupby(['customer_id', 'article_id'], as_index=False)
transactions = customer_article_groups.agg(
    sales=('article_id', 'count'),
    week_number=('week_number', 'max'),
)

# Order rows by decreasing week_number; the per-customer lists built later
# inherit this row order.
transactions = transactions.sort_values('week_number', ascending=False)

In [6]:
# Total number of sales per article, summed over all customers.
sales = transactions.groupby('article_id', as_index=False)['sales'].sum()

In [23]:
# One row per customer holding two parallel lists: the articles bought and the
# week_number attached to each of them (same position in both lists).
purchases_by_customer = transactions.groupby(['customer_id'], as_index=False)
purchase_lists = purchases_by_customer.agg(
    purchases=('article_id', list),
    weeks=('week_number', list),
)

In [24]:
# Module-level state shared with process_purchase_list:
#   pairs_dictionnary[source][dest] -> [pair count, summed week interval]
pairs_dictionnary = dict()

# Progress counters used only for the progress line printed per customer.
current_line = 0
total_lists = purchase_lists.shape[0]

def process_purchase_list(row):
    """Accumulate ordered article-pair statistics for one customer.

    For every pair of positions ``i < j`` in the customer's purchase list whose
    week numbers differ, the entry
    ``pairs_dictionnary[purchases[i]][purchases[j]]`` (a two-item list
    ``[count, interval_sum]``) gets its count incremented by one and its
    interval sum increased by ``weeks[i] - weeks[j]``.

    Args:
        row: Mapping-like object exposing ``row['purchases']`` (article ids)
            and ``row['weeks']`` (week numbers). Both are indexed with the same
            positions, so they are expected to have the same length.

    Returns:
        None. The module-level ``pairs_dictionnary`` and ``current_line`` are
        updated in place and a progress line is printed.
    """
    global total_lists
    global current_line

    print(f"\r Insertion des paires pour la ligne {current_line} / {total_lists}", end="")

    # Fetch both sequences once: looking them up through `row` inside the
    # quadratic loop below repeats the same lookup for every pair.
    purchases = row['purchases']
    weeks = row['weeks']
    length = len(purchases)

    for i in range(length):
        article_source = purchases[i]
        week_source = weeks[i]

        for j in range(i + 1, length):
            # Week numbers are reversed and counted from the end of the dataset.
            interval = week_source - weeks[j]

            # Purchases sharing the same week number are not counted as a pair.
            if interval == 0:
                continue

            # Create the [count, interval_sum] accumulator on first sight of the pair.
            destinations = pairs_dictionnary.setdefault(article_source, {})
            stats = destinations.setdefault(purchases[j], [0, 0])
            stats[0] += 1
            stats[1] += interval

    current_line += 1
            
# Run the accumulator once per customer row. The call is made for its side
# effects on pairs_dictionnary; process_purchase_list returns nothing, so
# `blank` only holds the discarded per-row results.
blank = purchase_lists.apply(process_purchase_list, axis = 1)

 Insertion des paires pour la ligne 659007 / 659008

In [25]:
# Flatten the nested dictionary into rows of
# [article_source, article_dest, pair count, summed interval]
# so it can be loaded into a DataFrame.
pairs_list = [
    [source_id, dest_id, stats[0], stats[1]]
    for source_id, destinations in pairs_dictionnary.items()
    for dest_id, stats in destinations.items()
]

In [26]:
# Checkpoint the raw pair rows. The context manager closes the file as soon as
# the dump finishes instead of leaving the handle open until garbage collection.
with open("pickles/articles_pairs_list.pkl", 'wb') as pairs_file:
    pickle.dump(pairs_list, pairs_file)

In [7]:

# Reload the raw pair rows from the checkpoint; the context manager closes the
# file handle once loading is done.
# NOTE(review): a later cell of this notebook saves the per-customer listing to
# this same path, so after a full run this file no longer contains the raw pair
# rows expected here — consider using distinct file names.
with open("pickles/articles_pairs_list.pkl", 'rb') as pairs_file:
    pairs_list = pickle.load(pairs_file)

In [23]:
# Build the DataFrame: one row per (source article, destination article) with
# the pair count and the summed week interval.
pairs = pd.DataFrame(pairs_list, columns = ['article_id', 'article_dest', 'count', 'intervals'])

In [24]:
# Attach the total sales of the source article to every pair row
# (left join: pair rows are kept even without a match in `sales`).
pairs = pd.merge(pairs, sales, how='left', on='article_id')

In [25]:
# Keep only the pairs whose source article has a significant number of sales
# (strictly more than 5).
has_enough_sales = pairs['sales'].gt(5)
pairs = pairs.loc[has_enough_sales]

In [26]:
# Finalisation, written as a single non-mutating chain so that no column is
# assigned or dropped in place on a boolean-filtered slice (which raises
# SettingWithCopyWarning):
#   mean_interval -> average week gap between the two purchases of the pair
#   ratio         -> share of the source article's sales followed by the destination
# Only pairs with ratio > 0.05 are kept; the intermediate columns are dropped.
pairs = (
    pairs
    .assign(
        mean_interval=lambda df: df['intervals'] / df['count'],
        ratio=lambda df: df['count'] / df['sales'],
    )
    .loc[lambda df: df['ratio'] > 0.05]
    .drop(columns=['intervals', 'count', 'sales'])
)

In [31]:
# Persist the filtered pair table; the "CHECKPOINT" cell further down reloads it.
pairs.to_pickle('pickles/article_pairs.pkl')

In [29]:
# Preview the resulting pair table.
pairs

Unnamed: 0,article_id,article_dest,mean_interval,ratio
22843,903428001,921226007,2.0,0.062500
22844,903428001,889652001,2.0,0.062500
22845,903428001,568597007,2.0,0.062500
22846,903428001,516859008,2.0,0.062500
22847,903428001,562245099,2.0,0.062500
...,...,...,...,...
28763518,888140002,608776020,3.0,0.166667
28763519,888140002,892624003,3.0,0.166667
28763884,697920067,685813042,1.0,0.083333
28764296,679948010,782555003,5.0,0.142857


In [4]:
# Summary statistics of the pair table (row count, interval and ratio distribution).
pairs.describe()

Unnamed: 0,article_id,article_dest,mean_interval,ratio
count,873146.0,873146.0,873146.0,873146.0
mean,748410200.0,798558800.0,6.684284,0.091234
std,114526000.0,121195600.0,4.43824,0.034677
min,110065000.0,108775000.0,1.0,0.050209
25%,701265000.0,757971000.0,3.0,0.0625
50%,772762000.0,834898000.0,6.0,0.083333
75%,825063000.0,875767000.0,10.0,0.111111
max,956217000.0,956217000.0,19.0,0.5


### Création des listings par Client

In [1]:
### CHECKPOINT
# Restart point: pandas is re-imported and the pair table is reloaded from disk,
# so the cells below do not need the cells above to have been run in this kernel.
import pandas as pd

pairs = pd.read_pickle('pickles/article_pairs.pkl')

In [2]:
# Load the transactions and keep the rows whose week_number is below 27.
# NOTE(review): the pair-building part of this notebook reads
# 'pickles/transactions_clean.pkl' and uses a threshold of 20 — confirm that the
# different file and window are intentional here.
transactions = (
    pd.read_pickle('pickles/transactions.pkl')
    .loc[lambda df: df['week_number'] < 27]
)

In [3]:
# One row per (customer, article), keeping the smallest week_number observed
# for that pair.
transactions = (
    transactions
    .groupby(['customer_id', 'article_id'], as_index=False)['week_number']
    .min()
)

In [4]:
# Convert article ids to 32-bit integers before they are used as a merge key.
transactions = transactions.astype({'article_id': 'int32'})

In [5]:
# Determine the candidate pairs: every purchased article is matched with the
# destination articles recorded for it in the pair table.
# From here on `pairs` holds one row per (customer, source article, destination article).
pairs = pd.merge(transactions, pairs, how='inner', on='article_id')


In [6]:
# Remove the rows where the paired article has already been bought.
# First look up, for each (customer, destination article), the week_number of
# the customer's own purchase of that destination article, if any. Overlapping
# column names get the '_source' / '_dest' suffixes.
pairs = pairs.merge(
    transactions,
    left_on=['customer_id', 'article_dest'],
    right_on=['customer_id', 'article_id'],
    how='left',
    suffixes=('_source', '_dest'),
)

# Destination never bought by the customer -> sentinel week 100.
# The result is assigned back: calling fillna(inplace=True) on a column
# selection is chained mutation, which leaves `pairs` unchanged under pandas
# Copy-on-Write.
pairs['week_number_dest'] = pairs['week_number_dest'].fillna(100)

# NOTE(review): with this `>` comparison the sentinel 100 can never pass while
# source week numbers stay below 100, so destinations the customer never bought
# are dropped and only rows whose destination has a smaller week_number than
# the source are kept. This looks like the opposite of the intent stated at the
# top of this cell — confirm the comparison direction.
bought_after_source = pairs['week_number_source'] > pairs['week_number_dest']

# Prefix the destination id with '0' to obtain its string form. `assign`
# returns a new frame, so no column is set on the filtered slice
# (avoids SettingWithCopyWarning).
pairs = pairs.loc[bought_after_source].assign(
    article_dest=lambda df: '0' + df['article_dest'].astype(str)
)

In [9]:
# Final lists: gather the destination articles of each customer, in row order
# and with customers in order of first appearance (sort=False).
pairs_by_customer = pairs.groupby('customer_id', as_index=False, sort=False)
pairs_list = pairs_by_customer.agg(list=('article_dest', list))

# Truncate every list to its first 51 entries.
# NOTE(review): 51 is unexplained — confirm it is not meant to be 50.
pairs_list['list'] = pairs_list['list'].map(lambda articles: articles[:51])

In [10]:
# Persist the per-customer listing.
# NOTE(review): this is the same path the pair-building part of this notebook
# uses for its intermediate pickle of raw pair rows, so this write overwrites
# that checkpoint with a different object — consider a distinct file name.
pairs_list.to_pickle("pickles/articles_pairs_list.pkl")