In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

df_trans = pd.read_csv('/content/drive/MyDrive/transactions_train.csv', 
                       usecols = ['t_dat', 'customer_id', 'article_id'],
                       dtype={'article_id': str})

#Sélection de samples des fichiers
df_trans_sample = df_trans.sample(frac=0.05, random_state=123)


In [2]:
df_trans_sample.head()

Unnamed: 0,t_dat,customer_id,article_id
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,707100001
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,568601007
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,750423002
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,664319014
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,763285003


In [4]:
df_trans_sample['t_dat'] = pd.to_datetime(df_trans_sample['t_dat'])

# Choisir le dernier timestamp
last_ts = df_trans_sample['t_dat'].max()
last_ts

Timestamp('2020-09-22 00:00:00')

In [5]:
# Ajouter une colonne 'last day of the billing week"

df_trans_sample['last_day_billing_w'] = df_trans_sample['t_dat'].apply(lambda x: last_ts - (last_ts - x).floor('7D')) 
df_trans_sample

Unnamed: 0,t_dat,customer_id,article_id,last_day_billing_w
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,0707100001,2019-05-21
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,0568601007,2020-03-10
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,0750423002,2019-09-17
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,0664319014,2019-08-13
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,0763285003,2019-05-28
...,...,...,...,...
14002052,2019-07-17,9bb1f2e9a240de5184ef2390559ae40e8e66c976e3b226...,0763284004,2019-07-23
14269524,2019-07-22,a1dc0a2a8dd36b20a5c48274cca508a6cfb3af317598f7...,0698324001,2019-07-23
23285471,2020-03-10,d72b95974c47945bac766eee18dcbe68797992c6a8a7d9...,0864380002,2020-03-10
13710986,2019-07-11,f2e860566e9688f132a05d83d7ebfd5751c9f3c7c57cb6...,0739144012,2019-07-16


In [6]:
# Compter le nombre de transactions par semaine
sales_per_w = df_trans_sample.drop('customer_id', axis= 1).groupby(['last_day_billing_w', 'article_id']).count()
sales_per_w = sales_per_w.rename(columns={'t_dat':'count'})
sales_per_w.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
last_day_billing_w,article_id,Unnamed: 2_level_1
2018-09-25,108775015,14
2018-09-25,108775044,4
2018-09-25,108775051,1
2018-09-25,110065001,1
2018-09-25,110065011,3


In [7]:
# Tri (décroissant) des articles par nombre de ventes 
best_sales = sales_per_w.groupby('article_id').count().sort_values('count', ascending=False)
best_sales = best_sales.reset_index(level=['article_id'])
best_sales

Unnamed: 0,article_id,count
0,0610776002,105
1,0372860001,105
2,0372860002,104
3,0572797001,104
4,0562245046,104
...,...,...
78244,0747085001,1
78245,0747057006,1
78246,0747057001,1
78247,0747051001,1


In [9]:
#Création d'une liste des meilleurs articles (format nécessaire pour submission)
best_articles_list = best_sales['article_id'].head(12).astype(str).to_list()
best_articles_list

['0610776002',
 '0372860001',
 '0372860002',
 '0572797001',
 '0562245046',
 '0160442007',
 '0579541001',
 '0536139006',
 '0572797002',
 '0464297007',
 '0610776001',
 '0507909001']

In [13]:
# Création d'une chaine de caractères (format nécessaire pour submission)
best_articles_string = ' '.join(best_articles_list)
best_articles_string

'0610776002 0372860001 0372860002 0572797001 0562245046 0160442007 0579541001 0536139006 0572797002 0464297007 0610776001 0507909001'

In [17]:
# Soumission Kaggle -> mais uniquement sur la base d'un sample de 5%
df_sample_sub = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')
df_sample_sub['prediction'] = best_articles_string 
df_sample_sub

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0610776002 0372860001 0372860002 0572797001 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0610776002 0372860001 0372860002 0572797001 05...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0610776002 0372860001 0372860002 0572797001 05...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0610776002 0372860001 0372860002 0572797001 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0610776002 0372860001 0372860002 0572797001 05...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0610776002 0372860001 0372860002 0572797001 05...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0610776002 0372860001 0372860002 0572797001 05...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0610776002 0372860001 0372860002 0572797001 05...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0610776002 0372860001 0372860002 0572797001 05...


In [18]:
#Commmentaire
## @Jean, @Julien: changer le code pour inclure toute la base de données (je ne peux pas le faire --> ordi pas assez puissant) et soumettez le kaggle