In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

df_trans = pd.read_csv('/content/drive/MyDrive/transactions_train.csv', 
                       usecols = ['t_dat', 'customer_id', 'article_id'],
                       dtype={'article_id': str})
df_art = pd.read_csv('/content/drive/MyDrive/articles.csv')
df_cust = pd.read_csv('/content/drive/MyDrive/customers.csv')

#Sélection de samples des fichiers
df_trans_sample = df_trans.sample(frac=0.05, random_state=123)
df_art_sample = df_art.sample(frac=0.05, random_state=123)
df_cust_sample = df_cust.sample(frac=0.05, random_state=123)

In [2]:
df_trans_sample

Unnamed: 0,t_dat,customer_id,article_id
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,0707100001
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,0568601007
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,0750423002
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,0664319014
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,0763285003
...,...,...,...
14002052,2019-07-17,9bb1f2e9a240de5184ef2390559ae40e8e66c976e3b226...,0763284004
14269524,2019-07-22,a1dc0a2a8dd36b20a5c48274cca508a6cfb3af317598f7...,0698324001
23285471,2020-03-10,d72b95974c47945bac766eee18dcbe68797992c6a8a7d9...,0864380002
13710986,2019-07-11,f2e860566e9688f132a05d83d7ebfd5751c9f3c7c57cb6...,0739144012


In [3]:
df_trans_sample['t_dat'] = pd.to_datetime(df_trans_sample['t_dat'])

# Choisir le dernier timestamp
last_ts = df_trans_sample['t_dat'].max()

In [4]:
#Ajouter le dernier jour de la "billing week"
df_trans_sample['ldbw'] = df_trans_sample['t_dat'].apply(lambda d: last_ts - (last_ts - d).floor('7D'))

# Compter le nombre de transactions par semaine
weekly_sales = df_trans_sample.drop('customer_id', axis=1).groupby(['ldbw', 'article_id']).count()
weekly_sales = weekly_sales.rename(columns={'t_dat': 'count'})
df_trans_sample = df_trans_sample.join(weekly_sales, on=['ldbw', 'article_id'])

df_trans_sample


Unnamed: 0,t_dat,customer_id,article_id,ldbw,count
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,0707100001,2019-05-21,1
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,0568601007,2020-03-10,6
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,0750423002,2019-09-17,3
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,0664319014,2019-08-13,10
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,0763285003,2019-05-28,1
...,...,...,...,...,...
14002052,2019-07-17,9bb1f2e9a240de5184ef2390559ae40e8e66c976e3b226...,0763284004,2019-07-23,3
14269524,2019-07-22,a1dc0a2a8dd36b20a5c48274cca508a6cfb3af317598f7...,0698324001,2019-07-23,3
23285471,2020-03-10,d72b95974c47945bac766eee18dcbe68797992c6a8a7d9...,0864380002,2020-03-10,2
13710986,2019-07-11,f2e860566e9688f132a05d83d7ebfd5751c9f3c7c57cb6...,0739144012,2019-07-16,5


In [5]:
# On part de l'hypothèse que les ventes de la semaine 'target' seront similaires aux ventes de la dernière 
# semaine du data training

weekly_sales = weekly_sales.reset_index().set_index('article_id')
last_day = last_ts.strftime('%Y-%m-%d')

df_trans_sample = df_trans_sample.join(
    weekly_sales.loc[weekly_sales['ldbw']==last_day, ['count']],
    on='article_id', rsuffix="_targ")  # À éclaircir 

df_trans_sample['count_targ'].fillna(0, inplace=True)
del weekly_sales

df_trans_sample

Unnamed: 0,t_dat,customer_id,article_id,ldbw,count,count_targ
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,0707100001,2019-05-21,1,0.0
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,0568601007,2020-03-10,6,6.0
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,0750423002,2019-09-17,3,0.0
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,0664319014,2019-08-13,10,0.0
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,0763285003,2019-05-28,1,0.0
...,...,...,...,...,...,...
14002052,2019-07-17,9bb1f2e9a240de5184ef2390559ae40e8e66c976e3b226...,0763284004,2019-07-23,3,0.0
14269524,2019-07-22,a1dc0a2a8dd36b20a5c48274cca508a6cfb3af317598f7...,0698324001,2019-07-23,3,0.0
23285471,2020-03-10,d72b95974c47945bac766eee18dcbe68797992c6a8a7d9...,0864380002,2020-03-10,2,0.0
13710986,2019-07-11,f2e860566e9688f132a05d83d7ebfd5751c9f3c7c57cb6...,0739144012,2019-07-16,5,0.0


In [17]:
# Calcul du quotient de ventes durant la derniere semaine (semaine du 22 /09) 
## sur les ventes / semaine pour chaque article (note: je travaille sur un sample -> donc pas tous les articles_id)

df_trans_sample['quotient'] = df_trans_sample['count_targ'] / df_trans_sample['count']
df_trans_sample[df_trans_sample['t_dat']=='2018-09-20']

Unnamed: 0,t_dat,customer_id,article_id,ldbw,count,count_targ,quotient
19510,2018-09-20,65b94c6d0f6def8ef692483722c347bc21a9f50453bf21...,0607663001,2018-09-25,1,0.0,0.0
7012,2018-09-20,260de395f367962f10095eda4d5a7327cc96ce54661307...,0552471003,2018-09-25,4,0.0,0.0
33033,2018-09-20,ace9986a4579ad3b3d952c97fd1e54da0e8c62fa3e9d50...,0673630004,2018-09-25,1,0.0,0.0
20417,2018-09-20,6a3b15af6d81aed32a59b27ca812380f5c3ce53d5c37a8...,0680810002,2018-09-25,3,0.0,0.0
17588,2018-09-20,5c534c074b3d0b04105d54a2603ad560eca43b5901bff3...,0551336008,2018-09-25,2,0.0,0.0
...,...,...,...,...,...,...,...
20260,2018-09-20,69a6668405e8dfaa42750eba59d18af47ea38c9efaec0d...,0611415005,2018-09-25,12,0.0,0.0
35561,2018-09-20,bb4675763bfa5290719de3475d004bf17e267ca5be34a6...,0500780008,2018-09-25,1,0.0,0.0
27872,2018-09-20,91c7e6c043dfad273b4dd82930c8b2d4462ad18b76ab0a...,0630316009,2018-09-25,2,0.0,0.0
30335,2018-09-20,9e78417360f6f9070f086ed1b40e55287ff9240e77cd9c...,0673643001,2018-09-25,4,0.0,0.0


In [22]:
#Choix des 12 produits les plus populaires
N = 12
target_sales = df_trans_sample.drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()
general_pred = target_sales.nlargest(N).index.tolist()

general_pred

['0448509014',
 '0573085028',
 '0706016001',
 '0751471001',
 '0673677002',
 '0715624001',
 '0706016002',
 '0685814001',
 '0678942001',
 '0706016003',
 '0611415001',
 '0568601006']

In [24]:
# Créer une dictionnaire achats    # À éclaircir 
from tqdm import tqdm
tqdm.pandas()

purchase_dict = {}
for i in tqdm(df_trans_sample.index):
    cust_id = df_trans_sample.at[i, 'customer_id']
    art_id = df_trans_sample.at[i, 'article_id']
    t_dat = df_trans_sample.at[i, 't_dat']

    if cust_id not in purchase_dict:
        purchase_dict[cust_id] = {}

    if art_id not in purchase_dict[cust_id]:
        purchase_dict[cust_id][art_id] = 0
    
    x = max(1, (last_ts - t_dat).days)

    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    y = a / np.sqrt(x) + b * np.exp(-c*x) - d

    value = df_trans_sample.at[i, 'quotient'] * max(0, y)
    purchase_dict[cust_id][art_id] += value

100%|██████████| 1589416/1589416 [01:29<00:00, 17673.22it/s]
