In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
transactions = pd.read_pickle("pickles/transactions.pkl")

In [3]:
# Filtrage en ne sélectionnant que la dernière semaine
last_day = transactions['t_dat'].max()
transactions = transactions[(last_day - transactions['t_dat']).dt.days < 365]

In [40]:
# Récupération d'un DataFrame avec les items les plus populaires.
best_sales = transactions[['article_id']].groupby('article_id', as_index = False).agg(count = ('article_id', 'count'))
best_sales = best_sales.sort_values('count', ascending = False)
best_sales.head(12)




Unnamed: 0,article_id,count
53832,706016001,42672
53833,706016002,30862
1711,372860001,29337
24808,610776002,25234
70124,759871002,23799
1712,372860002,22472
3706,464297007,21782
2233,399223001,19604
58427,720125001,18975
24807,610776001,18777


In [41]:
# Formattage de la liste.
best_articles_list = best_sales['article_id'].head(12).astype(str).to_list()
best_articles_string = ' '.join(best_articles_list)

In [42]:
# Création du dataframe final
customers = pd.read_pickle("pickles/customers.pkl")
submission = customers[['customer_id']]
submission['prediction'] = best_articles_string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['prediction'] = best_articles_string


In [43]:
submission.head()
submission.export_csv('submission_12_most_popular.csv')

AttributeError: 'DataFrame' object has no attribute 'export_csv'

### Ajout de la notion de tendance
#### On cherche à savoir si le produit est en hausse ou non

In [3]:
# Ajout d'un numéro de semaine aux transactions

last_day = transactions['t_dat'].max()
transactions['week_number'] = (last_day - transactions['t_dat']).dt.days // 7

weekly_sales = transactions[['week_number', 'article_id']].groupby(['week_number', 'article_id'], as_index = False).agg(
    sales = ('article_id', 'count')
)
weekly_sales

Unnamed: 0,week_number,article_id,sales
0,0,0108775044,2
1,0,0111565001,25
2,0,0111586001,39
3,0,0111593001,31
4,0,0111609001,12
...,...,...,...
2201226,104,0728111001,6
2201227,104,0728146001,3
2201228,104,0728162001,35
2201229,104,0728162002,32


In [4]:
# Approximation de la tendance par moyenne glissante
last_week_sales = weekly_sales[weekly_sales['week_number'] == 0][['article_id', 'sales']].rename({
    'sales': 'last_week_sales',
},
axis = 1)

mean_sales = weekly_sales[weekly_sales['week_number'] < 6].drop('week_number', 1).groupby(['article_id'], as_index = False).agg(
    mean_sales = ('sales', 'mean')
)

trends = last_week_sales.merge(mean_sales, on = 'article_id', how = 'left')

  mean_sales = weekly_sales[weekly_sales['week_number'] < 6].drop('week_number', 1).groupby(['article_id'], as_index = False).agg(


In [18]:
# Get the scoring of the articles, based on the Last week sales and mean sales with some coefficient.

alpha = 0.15
trends['score'] = trends['last_week_sales'] + (trends['last_week_sales'] - trends['mean_sales']) * alpha

In [19]:

trends.nlargest(n = 12, columns=["score"])

Unnamed: 0,article_id,last_week_sales,mean_sales,score
17308,924243001,779,391.0,837.2
16961,918522001,581,522.666667,589.75
17309,924243002,546,286.0,585.0
17287,923758001,528,308.0,561.0
11827,866731001,487,301.166667,514.875
16264,909370001,478,536.666667,469.2
16774,915529005,433,193.25,468.9625
16773,915529003,461,412.833333,468.225
5030,762846027,416,284.25,435.7625
4496,751471001,434,520.166667,421.075


In [21]:
# Formattage de la liste.
best_articles_list = trends.nlargest(n = 12, columns=["score"])['article_id'].head(12).astype(str).to_list()
best_articles_string = ' '.join(best_articles_list)

submission = pd.read_csv("customers.csv")
submission = submission[['customer_id']]
submission['prediction'] = best_articles_string

submission.to_csv('submissions/submission_12_most_popular_with_trends_0.1.csv', index=False)
