## Assemblage du jeu de données pour les modèles uniclasse & multiclasses

#### 1. Assemblage des achats réels et shortlists 
    - Données d'entraînement : élaboration de la shortlist sur les semaines [2, 56], liste d'achat sur la semaine 1.
    - Données de validation : élaboration de la shortlist sur les semaines [1, 55], liste d'achat sur la semaine 0.
    - Données de test : élaboration de la shortlist sur les semaines [0, 54].
#### 2. Création d'un dataset pour entraînement uniclasse.
#### 3. Création d'un dataset pour entraînement multiclasse.

### Assemblage des achats réels et shortlists

In [18]:
import pandas as pd
import numpy as np

ages_centers = [15, 25, 35, 45, 55, 65]

In [3]:
transactions = pd.read_pickle('pickles/transactions.pkl')

In [4]:
customers = pd.read_pickle('pickles/customers_second_iteration.pkl')
articles = pd.read_pickle('pickles/articles_second_iteration.pkl')

In [19]:


def top_sales_cross_lists(week_sales):
    """Création des listings nécessaires à l'élaboration de la liste croisée âge / catégorie / groupe.

    Args:
        week_sales (pd.DataFrame): Les transactions pour la semaine choisie

    Returns:
        dictionnary: Listes par groupe et par catégorie, avec les pondérations par âge.
    """
    global articles
    global customers
    
    top_sales = week_sales.merge(customers, on = 'customer_id', how = 'left')
    
    top_sales = top_sales.groupby(['article_id', 'group'], as_index = False).agg(
        sales = ('article_id', 'count'),
        age_around_15 = ('age_around_15', 'sum'),
        age_around_25 = ('age_around_25', 'sum'),
        age_around_35 = ('age_around_35', 'sum'),
        age_around_45 = ('age_around_45', 'sum'),
        age_around_55 = ('age_around_55', 'sum'),
        age_around_65 = ('age_around_65', 'sum'),
    )
    
    top_sales = top_sales.merge(articles[['article_id', 'index_group_name']], on = 'article_id')
    
    top_sales['index_group_name'] = top_sales['index_group_name'].apply(lambda x: x.split('/')[0].lower())
    
    # Préparation des listes.

    groups = top_sales['group'].unique()
    categories = top_sales['index_group_name'].unique()

    cross_lists = {}

    for group in groups :
        cross_lists[group] = {}
        
        categories = top_sales['index_group_name'].unique()

        for category in categories:
            
            cross_lists[group][category] = top_sales[(top_sales['group'] == group) & (top_sales['index_group_name'] == category)]
            cross_lists[group][category]['shares'] = cross_lists[group][category]['sales'] / cross_lists[group][category]['sales'].sum()
            cross_lists[group][category]['index_group_name'] = category
            cross_lists[group][category] = cross_lists[group][category].nlargest(n = 100, columns = 'shares')
        
    
    return cross_lists

def get_cross_list(x):
    global cross_lists
    
    top_sales_weighted = []
    group = x['group']
    
    for category in cross_lists[group].keys():
    
        if x[category] == 0:
            continue
        
        cross_lists[group][category]['shares_weighted'] = cross_lists[group][category]['shares'] * x[category]
        cross_lists[group][category]['sales_cross_weighted'] = 0
            
        for age in ages_centers:
            column = f"age_around_{age}"
            
            if x[column] == 0:
                continue
            
            cross_lists[group][category]['sales_cross_weighted'] += cross_lists[group][category][column] * cross_lists[group][category]['shares_weighted']
            
            top_sales_weighted.append(cross_lists[group][category])
    
    if len(top_sales_weighted) == 0:
        return []
    
    return pd.concat(top_sales_weighted).nlargest(n = 50, columns = ['sales_cross_weighted'])['article_id'].tolist()

### Données d'entraînement
#### Liste des achats effectués

In [6]:
# Liste des achats effectués

data_train = transactions[transactions['week_number'] == 1].groupby('customer_id', as_index = False).agg(
    list_purchased = ('article_id', lambda x: list(x))
)

data_valid = transactions[transactions['week_number'] == 0].groupby('customer_id', as_index = False).agg(
    list_purchased = ('article_id', lambda x: list(x))
)

In [10]:
# Ajout des features Client nécessaires à l'élaboration des listes.
customers_cross_data = customers.drop(['FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code', 'postal_code_group', 'average_cart_articles',
       'average_cart_price', 'total_carts', 'total_articles', 'total_price',
       'average_cart_interval', 'repurchases', 'repurchases_interval'], 1)

  customers_cross_data = customers.drop(['FN', 'Active', 'club_member_status',


In [11]:
data_train = data_train.merge(customers_cross_data, on = 'customer_id', how = 'left')
data_valid = data_valid.merge(customers_cross_data, on = 'customer_id', how = 'left')

#### Liste croisée groupe / catégorie / âge

In [8]:
transactions_train = transactions[(transactions['week_number'] >= 2) & (transactions['week_number'] <= 56)]

transaction_train = transactions_train.merge(
    data_train[['customer_id']], 
    on = 'customer_id',
    how = 'inner'
)

In [9]:
transactions_valid = transactions[(transactions['week_number'] >= 1) & (transactions['week_number'] <= 55)]

transaction_valid = transactions_valid.merge(
    data_valid[['customer_id']], 
    on = 'customer_id',
    how = 'inner'
)

In [20]:
cross_lists = top_sales_cross_lists(transactions[transactions['week_number'] == 2])
data_train['cross_list'] = data_train.apply(lambda x: get_cross_list(x), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cross_lists[group][category]['shares'] = cross_lists[group][category]['sales'] / cross_lists[group][category]['sales'].sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cross_lists[group][category]['index_group_name'] = category


In [None]:
cross_lists = top_sales_cross_lists(transactions[transactions['week_number'] == 1])
data_valid['cross_list'] = data_valid.apply(lambda x: get_cross_list(x), axis = 1)

#### Articles appairés