In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Folder holding the input CSV files (a mounted Google Drive directory).
DATA_DIR = '/content/drive/MyDrive'
# Fraction of each table kept for this run, and the seed that makes the draw
# reproducible.
SAMPLE_FRAC = 0.05
SAMPLE_SEED = 123

# Only the three columns used by the rest of the notebook are loaded from the
# transactions file. article_id is read as a string so that its leading zero
# is preserved.
df_trans = pd.read_csv(DATA_DIR + '/transactions_train.csv',
                       usecols=['t_dat', 'customer_id', 'article_id'],
                       dtype={'article_id': str})
# article_id is read as a string here too, so ids have the same form in both tables.
df_art = pd.read_csv(DATA_DIR + '/articles.csv', dtype={'article_id': str})
df_cust = pd.read_csv(DATA_DIR + '/customers.csv')

# Draw a random sample from each table.
df_trans_sample = df_trans.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
df_art_sample = df_art.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
df_cust_sample = df_cust.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

In [3]:
# Preview the first five sampled transactions.
df_trans_sample.head(n=5)

Unnamed: 0,t_dat,customer_id,article_id
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,707100001
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,568601007
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,750423002
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,664319014
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,763285003


In [4]:
# Preview the first five sampled articles.
df_art_sample.head(n=5)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
48623,692839003,692839,TP Visby 5-pkt,272,Trousers,Garment Lower body,1010016,Solid,19,Greenish Khaki,...,Kids Boy Trouser,H,Children Sizes 92-140,4,Baby/Children,46,Kids Boy,1009,Trousers,5-pocket stretch cotton twill trousers in a re...
101639,898192001,898192,Homa mule,89,Other shoe,Shoes,1010016,Solid,13,Beige,...,Heels,C,Ladies Accessories,1,Ladieswear,64,Womens Shoes,1020,Shoes,Mules in imitation leather with pointed toes. ...
102508,904357001,904357,Lupine dress,265,Dress,Garment Full body,1010016,Solid,9,Black,...,Jersey fancy,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1005,Jersey Fancy,"Short, fitted dress in jersey with a V-neck, w..."
99951,887681003,887681,Calypso 2p wide side thong,286,Underwear bottom,Underwear,1010016,Solid,9,Black,...,Expressive Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear",Thong briefs in lace and microfibre with a low...
8366,522678003,522678,Art leather sneaker PQ,94,Sneakers,Shoes,1010016,Solid,10,White,...,Shoes,F,Menswear,3,Menswear,27,Men Shoes,1020,Shoes,"Trainers with lacing at the front, leather lin..."


In [5]:
# Convert the 't_dat' column to pandas datetimes.
df_trans_sample = df_trans_sample.assign(
    t_dat=pd.to_datetime(df_trans_sample['t_dat'])
)

# Most recent transaction date in the sample.
last_ts = df_trans_sample['t_dat'].max()
last_ts

Timestamp('2020-09-22 00:00:00')

In [6]:

# Add a 'last_day_billing_w' column: the last day of the 7-day billing window
# each purchase falls into. Windows are counted backwards from last_ts, so the
# most recent window ends on last_ts itself.
#
# How it works: the age of a purchase (last_ts - t_dat) is floored to a whole
# number of weeks, and that many weeks are subtracted from last_ts. The
# computation runs on the whole column at once instead of once per row.
purchase_age = last_ts - df_trans_sample['t_dat']
df_trans_sample['last_day_billing_w'] = last_ts - purchase_age.dt.floor('7D')
df_trans_sample

Unnamed: 0,t_dat,customer_id,article_id,last_day_billing_w
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,0707100001,2019-05-21
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,0568601007,2020-03-10
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,0750423002,2019-09-17
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,0664319014,2019-08-13
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,0763285003,2019-05-28
...,...,...,...,...
14002052,2019-07-17,9bb1f2e9a240de5184ef2390559ae40e8e66c976e3b226...,0763284004,2019-07-23
14269524,2019-07-22,a1dc0a2a8dd36b20a5c48274cca508a6cfb3af317598f7...,0698324001,2019-07-23
23285471,2020-03-10,d72b95974c47945bac766eee18dcbe68797992c6a8a7d9...,0864380002,2020-03-10
13710986,2019-07-11,f2e860566e9688f132a05d83d7ebfd5751c9f3c7c57cb6...,0739144012,2019-07-16


In [7]:
# Count the sampled transactions per (billing week, article) pair.
# size() counts the rows of each group directly, so the result always has
# exactly one 'count' column, whatever other columns the frame carries.
sales_per_w = (
    df_trans_sample
    .groupby(['last_day_billing_w', 'article_id'])
    .size()
    .to_frame('count')
)
sales_per_w

Unnamed: 0_level_0,Unnamed: 1_level_0,count
last_day_billing_w,article_id,Unnamed: 2_level_1
2018-09-25,0108775015,14
2018-09-25,0108775044,4
2018-09-25,0108775051,1
2018-09-25,0110065001,1
2018-09-25,0110065011,3
...,...,...
2020-09-22,0946795001,1
2020-09-22,0947060001,1
2020-09-22,0949198001,8
2020-09-22,0949594001,1


In [8]:
# Attach to every transaction the number of sales its article had in the
# same billing week (left join of df_trans_sample with sales_per_w).
df_trans_sample = df_trans_sample.join(
    sales_per_w,
    on=['last_day_billing_w', 'article_id'],
    how='left',
)
df_trans_sample

Unnamed: 0,t_dat,customer_id,article_id,last_day_billing_w,count
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,0707100001,2019-05-21,1
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,0568601007,2020-03-10,6
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,0750423002,2019-09-17,3
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,0664319014,2019-08-13,10
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,0763285003,2019-05-28,1
...,...,...,...,...,...
14002052,2019-07-17,9bb1f2e9a240de5184ef2390559ae40e8e66c976e3b226...,0763284004,2019-07-23,3
14269524,2019-07-22,a1dc0a2a8dd36b20a5c48274cca508a6cfb3af317598f7...,0698324001,2019-07-23,3
23285471,2020-03-10,d72b95974c47945bac766eee18dcbe68797992c6a8a7d9...,0864380002,2020-03-10,2
13710986,2019-07-11,f2e860566e9688f132a05d83d7ebfd5751c9f3c7c57cb6...,0739144012,2019-07-16,5


In [9]:
# Working assumption: sales in the target week will be similar to sales in
# the last week of the training data.

# Re-index the weekly counts by article_id so they can be joined on that key alone.
sales_per_w = sales_per_w.reset_index().set_index('article_id')

# Keep only the counts of the final billing week, and name the column
# explicitly instead of relying on a join suffix.
last_day = last_ts.strftime('%Y-%m-%d')
is_last_week = sales_per_w['last_day_billing_w'] == last_day
last_week_sales = (
    sales_per_w
    .loc[is_last_week, ['count']]
    .rename(columns={'count': 'count_targ'})
)

# Left join on article_id: articles with no sale in the final week get NaN
# in 'count_targ'.
df_trans_sample = df_trans_sample.join(last_week_sales, on='article_id')

df_trans_sample

Unnamed: 0,t_dat,customer_id,article_id,last_day_billing_w,count,count_targ
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,0707100001,2019-05-21,1,
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,0568601007,2020-03-10,6,6.0
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,0750423002,2019-09-17,3,
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,0664319014,2019-08-13,10,
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,0763285003,2019-05-28,1,
...,...,...,...,...,...,...
14002052,2019-07-17,9bb1f2e9a240de5184ef2390559ae40e8e66c976e3b226...,0763284004,2019-07-23,3,
14269524,2019-07-22,a1dc0a2a8dd36b20a5c48274cca508a6cfb3af317598f7...,0698324001,2019-07-23,3,
23285471,2020-03-10,d72b95974c47945bac766eee18dcbe68797992c6a8a7d9...,0864380002,2020-03-10,2,
13710986,2019-07-11,f2e860566e9688f132a05d83d7ebfd5751c9f3c7c57cb6...,0739144012,2019-07-16,5,


In [10]:
# Replace the NaN values of the 'count_targ' column by 0.
# The filled column is assigned back explicitly: fillna(inplace=True) on a
# column selected with df[...] is a chained assignment, which recent pandas
# versions warn about and which leaves the frame unchanged under copy-on-write.
df_trans_sample['count_targ'] = df_trans_sample['count_targ'].fillna(0)
df_trans_sample

Unnamed: 0,t_dat,customer_id,article_id,last_day_billing_w,count,count_targ
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,0707100001,2019-05-21,1,0.0
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,0568601007,2020-03-10,6,6.0
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,0750423002,2019-09-17,3,0.0
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,0664319014,2019-08-13,10,0.0
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,0763285003,2019-05-28,1,0.0
...,...,...,...,...,...,...
14002052,2019-07-17,9bb1f2e9a240de5184ef2390559ae40e8e66c976e3b226...,0763284004,2019-07-23,3,0.0
14269524,2019-07-22,a1dc0a2a8dd36b20a5c48274cca508a6cfb3af317598f7...,0698324001,2019-07-23,3,0.0
23285471,2020-03-10,d72b95974c47945bac766eee18dcbe68797992c6a8a7d9...,0864380002,2020-03-10,2,0.0
13710986,2019-07-11,f2e860566e9688f132a05d83d7ebfd5751c9f3c7c57cb6...,0739144012,2019-07-16,5,0.0


In [11]:
# Sales quotient for every transaction row: the article's sales in the last
# week (the week of 22/09) divided by the article's sales in the week of the
# purchase.
# Note: this runs on a sample, so not every article_id is present.
df_trans_sample['quotient'] = df_trans_sample['count_targ'].div(df_trans_sample['count'])

# Preview of the frame restricted to the relevant columns.
df_trans_sample[['t_dat', 'article_id', 'quotient']].head(25)

Unnamed: 0,t_dat,article_id,quotient
10370163,2019-05-16,707100001,0.0
23100032,2020-03-06,568601007,1.0
16663564,2019-09-16,750423002,0.0
15238141,2019-08-09,664319014,0.0
10863699,2019-05-27,763285003,0.0
13013763,2019-06-30,759323001,0.0
3946986,2018-12-20,453239037,0.0
116470,2018-09-23,574120001,0.0
2091115,2018-11-03,673426004,0.0
1681422,2018-10-25,666448004,0.0


In [12]:
# Preview the first 30 rows with all the columns added so far.
df_trans_sample.head(n=30)

Unnamed: 0,t_dat,customer_id,article_id,last_day_billing_w,count,count_targ,quotient
10370163,2019-05-16,e4f9fb11b0a48dc8c619cfed65b180d49726397c0e15c5...,707100001,2019-05-21,1,0.0,0.0
23100032,2020-03-06,27342044b4092abc539b151b79c0bb06c1d8c2c489ee41...,568601007,2020-03-10,6,6.0,1.0
16663564,2019-09-16,2adc8a75429bccf2061e37f45f13d916d9d94faa36f538...,750423002,2019-09-17,3,0.0,0.0
15238141,2019-08-09,832c35ecfb3cbedc30cae594bfb0efcd944f376243b5f4...,664319014,2019-08-13,10,0.0,0.0
10863699,2019-05-27,010afb223ecd6785aa2944003f2166eb32873a2df40e7c...,763285003,2019-05-28,1,0.0,0.0
13013763,2019-06-30,44a8911f9995281b4c1348534fd4424297303b94231fa9...,759323001,2019-07-02,7,0.0,0.0
3946986,2018-12-20,87e44a2f6058a26b581d84558a1b6d1dc693d1a4cc3f81...,453239037,2018-12-25,3,0.0,0.0
116470,2018-09-23,0e9936f946e0e79672c73b5bf81f30dd360f1fb17881a7...,574120001,2018-09-25,1,0.0,0.0
2091115,2018-11-03,8ea2ae60d7e9d328d47420fa344289e2942110960b9dcf...,673426004,2018-11-06,2,0.0,0.0
1681422,2018-10-25,9dbbaafbfa26112e61266750add5f6f6baadfddb280609...,666448004,2018-10-30,6,0.0,0.0


In [13]:
# Pick the 12 most popular products, ranked by the sum of the quotients
# computed earlier.
N = 12
# Sum the per-transaction quotients for each article.
target_sales = df_trans_sample.groupby('article_id')['quotient'].sum()
# Article ids of the N largest sums, best first.
general_pred = list(target_sales.nlargest(N).index)

general_pred

['0448509014',
 '0573085028',
 '0706016001',
 '0751471001',
 '0673677002',
 '0715624001',
 '0706016002',
 '0685814001',
 '0678942001',
 '0706016003',
 '0611415001',
 '0568601006']

In [14]:
# Build a nested purchase dictionary:
#     purchase_dict[customer_id][article_id] -> accumulated score.
# Every sampled transaction adds quotient * max(0, y) to the score of its
# (customer, article) pair, where y is a time-decay weight computed from the
# number of days between the purchase and last_ts.
from tqdm import tqdm
tqdm.pandas()

# Coefficients of the decay curve y(x) = a / sqrt(x) + b * exp(-c * x) - d.
# They never change, so they are set once, outside the loop.
a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3

purchase_dict = {}
# Iterate over the four needed columns in lockstep; this avoids one label
# lookup per cell per row.
rows = zip(df_trans_sample['customer_id'], df_trans_sample['article_id'],
           df_trans_sample['t_dat'], df_trans_sample['quotient'])
for cust_id, art_id, t_dat, quotient in tqdm(rows, total=len(df_trans_sample)):
    # Days elapsed since the purchase, with a floor of 1.
    x = max(1, (last_ts - t_dat).days)

    # The probability that the customer buys the same product again is
    # modelled as depending on the number of days elapsed: y decreases as x
    # grows, and negative values are clipped to 0 below.
    y = a / np.sqrt(x) + b * np.exp(-c*x) - d

    value = quotient * max(0, y)
    cust_scores = purchase_dict.setdefault(cust_id, {})
    cust_scores[art_id] = cust_scores.get(art_id, 0) + value

100%|██████████| 1589416/1589416 [01:25<00:00, 18636.78it/s]


In [15]:
# Load the sample submission file; its 'customer_id' column lists the
# customers a prediction is produced for.
sample_submission_path = '/content/drive/MyDrive/sample_submission.csv'
df_sample_sub = pd.read_csv(sample_submission_path)
df_sample_sub

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0706016002 0372860001 0610776002 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0706016001 0706016002 0372860001 0610776002 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0706016002 0372860001 0610776002 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0706016002 0372860001 0610776002 07...


In [16]:
# Kaggle submission: build one space-separated string of article ids per customer.
# TODO (Jean, Julien): submit the resulting file on the Kaggle site.
#
# Customers present in purchase_dict get their own positively scored articles
# first, best score first, capped at N. Remaining slots are filled with the
# general prediction, skipping ids already selected so that no slot is wasted
# on a duplicate. Customers absent from purchase_dict get the general
# prediction as is.
pred_list = []
for cust_id in tqdm(df_sample_sub['customer_id']):
    scores = purchase_dict.get(cust_id)
    if scores is None:
        preds = general_pred
    else:
        # sorted() is stable, so articles with equal scores keep their
        # insertion order. A plain sort of the small per-customer dict avoids
        # building a pandas Series for every customer.
        positive = [art_id for art_id, score in scores.items() if score > 0]
        preds = sorted(positive, key=scores.get, reverse=True)[:N]
        if len(preds) < N:
            already_chosen = set(preds)
            fillers = [art_id for art_id in general_pred if art_id not in already_chosen]
            preds = preds + fillers[:N - len(preds)]
    pred_list.append(' '.join(preds))



100%|██████████| 1371980/1371980 [08:40<00:00, 2637.58it/s]


In [17]:
# Overwrite the 'prediction' column with the generated strings, then write
# the frame to 'submission.csv' without the row index.
df_sample_sub['prediction'] = pred_list
df_sample_sub.to_csv('submission.csv', index=False)

In [18]:
# Display the submission frame.
df_sample_sub

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0448509014 0573085028 0706016001 0751471001 06...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0559630026 0448509014 0573085028 0706016001 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0448509014 0573085028 0706016001 0751471001 06...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0448509014 0573085028 0706016001 0751471001 06...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0448509014 0573085028 0706016001 0751471001 06...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0448509014 0573085028 0706016001 0751471001 06...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0448509014 0573085028 0706016001 0751471001 06...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0448509014 0573085028 0706016001 0751471001 06...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0448509014 0573085028 0706016001 0751471001 06...


In [18]:
# Comment
## TODO (@Jean, @Julien): change the code to run on the whole dataset instead of the 5% samples (I cannot do it myself: my computer is not powerful enough), then submit the result on Kaggle.