In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

df_trans_sample = pd.read_pickle('pickles/transactions.pkl')

#Sélection de samples des fichiers
#df_trans_sample = df_trans.sample(frac=0.05, random_state=123)


In [2]:
df_trans_sample = df_trans_sample[['t_dat', 'customer_id', 'article_id']]

In [3]:
days_to_last = (df_trans_sample['t_dat'].max() - df_trans_sample['t_dat']).dt.days
df_trans_sample['week_number'] = np.floor(days_to_last / 7).astype(int)

In [4]:
# Compter le nombre de transactions par semaine
sales_per_w = df_trans_sample.drop('customer_id', axis= 1).groupby(['week_number', 'article_id']).count()
sales_per_w = sales_per_w.rename(columns={'t_dat':'count'})
sales_per_w.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
week_number,article_id,Unnamed: 2_level_1
0,108775044,2
0,111565001,25
0,111586001,39
0,111593001,31
0,111609001,12


### Soumission des 12 articles les plus populaires sur l'ensemble du dataset

In [5]:
# Tri (décroissant) des articles par nombre de ventes 
best_sales = sales_per_w.groupby('article_id').count().sort_values('count', ascending=False)
best_sales = best_sales.reset_index(level=['article_id'])
best_sales

Unnamed: 0,article_id,count
0,0473954008,105
1,0554479001,105
2,0294008005,105
3,0294008002,105
4,0337991001,105
...,...,...
104542,0381038032,1
104543,0609947001,1
104544,0552743031,1
104545,0552743020,1


In [9]:
#Création d'une liste des meilleurs articles (format nécessaire pour submission)
best_articles_list = best_sales['article_id'].head(12).astype(str).to_list()
best_articles_list

['473954008',
 '554479001',
 '294008005',
 '294008002',
 '337991001',
 '400285006',
 '568597006',
 '492897001',
 '598755002',
 '598755001',
 '469039019',
 '568597007']

In [10]:
# Création d'une chaine de caractères (format nécessaire pour submission)
best_articles_string = ' '.join(best_articles_list)
best_articles_string

'473954008 554479001 294008005 294008002 337991001 400285006 568597006 492897001 598755002 598755001 469039019 568597007'

In [14]:
df_sub = pd.read_csv('sample_submission.csv')
df_sub['prediction'] = best_articles_string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['prediction'] = best_articles_string


In [15]:
df_sub.to_csv("submission_12_most_popular_on_dataset.csv", index = False)
df_sub.shape

(1371980, 2)

In [18]:
#Commmentaire
## @Jean, @Julien: changer le code pour inclure toute la base de données (je ne peux pas le faire --> ordi pas assez puissant) et soumettez le kaggle

### Soumission des 12 articles les plus populaires sur la dernière semaine du dataset

In [6]:
# Tri (décroissant) des articles par nombre de ventes 
last_week_sales = df_trans_sample[df_trans_sample['week_number'] == 0]
last_week_sales

Unnamed: 0,t_dat,customer_id,article_id,week_number
13550886,2020-09-16,7853fcaaad1e91c729ae9f4f31882f413ee9618cbe3077...,0871527003,0
3209980,2020-09-16,1c68ff81ce18c176f6d5133da4e706308cb0297d6e6648...,0922037002,0
6156333,2020-09-16,36823912d53c2116d564fc0ea05a086b419eda1b1fcbab...,0536139067,0
1401809,2020-09-16,0c62dfa1f9913e2f0b761c95fa9adf00524523314983ab...,0810557013,0
8713058,2020-09-16,4d82a3ebf1d3810af5b23f6719f9118f28d8d439b15c0d...,0893432002,0
...,...,...,...,...
811613,2020-09-22,071f973932060cf55286b1b96b26b3bba1886df7ead999...,0792515001,0
21106060,2020-09-22,bb57de0fdce35adcbb7e40a00c9f83d2509bcf494df737...,0938804001,0
19072806,2020-09-22,a9297e9d1cc7a2fce5ab805360ba91f5dbe5ac564aea08...,0736923010,0
17367798,2020-09-22,9a0b86bdb4e902d1fd0ac920e8a6a12793138672bf764e...,0568597006,0


In [7]:
best_sales = last_week_sales[['article_id']].groupby('article_id').agg(count = ('article_id', 'count'))
best_sales = best_sales.nlargest(n = 12, columns = 'count')
best_sales

Unnamed: 0_level_0,count
article_id,Unnamed: 1_level_1
924243001,779
918522001,581
924243002,546
923758001,528
866731001,487
909370001,478
915529003,461
751471001,434
915529005,433
762846027,416


In [11]:
#Création d'une liste des meilleurs articles (format nécessaire pour submission)
best_articles_list = best_sales.index.astype(str).to_list()
best_articles_string = ' '.join(best_articles_list)
df_sub = pd.read_csv('sample_submission.csv')
df_sub['prediction'] = best_articles_string
df_sub.to_csv("submission_12_most_popular_on_last_week.csv", index = False)
df_sub.shape

(1371980, 2)

In [12]:
best_articles_string

'0924243001 0918522001 0924243002 0923758001 0866731001 0909370001 0915529003 0751471001 0915529005 0762846027 0918292001 0448509014'

In [16]:
customers = pd.read_pickle('pickles/customers.pkl')
customers.head()


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,average_cart_articles,average_cart_price,total_carts,total_articles,total_price,average_cart_interval
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,1.9,0.054393,10.0,19.0,0.543932,68.666667
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,3.25,0.10051,24.0,78.0,2.412237,29.818182
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,2.142857,0.086646,7.0,15.0,0.606525,121.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,2.0,0.060983,1.0,2.0,0.060983,0.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...,2.166667,0.078282,6.0,13.0,0.469695,134.0


In [18]:
customers = customers.nsmallest(n = 136228, columns=['total_carts'])

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,average_cart_articles,average_cart_price,total_carts,total_articles,total_price,average_cart_interval
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,2.0,0.060983,1.0,2.0,0.060983,0.0
5,000064249685c11552da43ef22a5030f35a147f723d5b0...,0.0,0.0,NON ACTIVE,NONE,NONE,2c29ae653a9282cce4151bd87643c907644e09541abc28...,3.0,0.101644,1.0,3.0,0.101644,0.0
8,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,0.0,0.0,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,2.0,0.053356,1.0,2.0,0.053356,0.0
9,00008469a21b50b3d147c97135e25b4201a8c58997f787...,0.0,0.0,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,4.0,0.078068,1.0,4.0,0.078068,0.0
11,000097d91384a0c14893c09ed047a963c4fc6a5c021044...,0.0,0.0,ACTIVE,NONE,31.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,15.0,0.222492,1.0,15.0,0.222492,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
418328,4e38a56619f7a5680e404234575fba8bf3440145f32b53...,0.0,0.0,NON ACTIVE,NONE,32.0,cfac8dd49d54fca22d2648aa569c89fcdc0286c101cd10...,8.0,0.223593,1.0,8.0,0.223593,0.0
418332,4e38e32993e122c96b7453deec37e9c05f2a8be5f54c6c...,0.0,0.0,ACTIVE,NONE,20.0,bd47e22a292abbe9a8d3325f05a1831cb51bce2417c2a6...,2.0,0.042339,1.0,2.0,0.042339,0.0
418334,4e38fa491a09318f8bcc8c0642805a308d785450a0deb5...,0.0,0.0,ACTIVE,NONE,44.0,22bd4b81366809b0a6a9241fa256a6856a6aa7a705055a...,5.0,0.072797,1.0,5.0,0.072797,0.0
418343,4e393ca2541503d6070ce32a877799b63bd1a258e2e5ca...,0.0,0.0,ACTIVE,NONE,22.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...,2.0,0.054203,1.0,2.0,0.054203,0.0


In [21]:
customers = customers[['customer_id']]
df_sub = df_sub.drop(['prediction'], axis = 1)
customers['prediction'] = best_articles_string
df_sub = df_sub.merge(customers, on = 'customer_id', how = 'left')

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0918522001 0924243002 0923758001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,


In [24]:
df_sub = df_sub.fillna('')
df_sub['prediction'].value_counts()

                                                                                                                                       1235752
0924243001 0918522001 0924243002 0923758001 0866731001 0909370001 0915529003 0751471001 0915529005 0762846027 0918292001 0448509014     136228
Name: prediction, dtype: int64

In [26]:
df_sub.to_csv("submission_12_most_popular_partial.csv", index = False)