## Assemblage du jeu de données pour les modèles uniclasse & multilabel

#### 1. Assemblage des achats réels et shortlists 
- Données d'entraînement / validation : élaboration de la shortlist sur les semaines [1, 56], liste d'achat sur la semaine 0.
- Données de test : élaboration de la shortlist à partir des sélections calculées précédemment.
#### 2. Création d'un chunk pour entraînement uniclasse.
#### 3. Création d'un chunk pour entraînement multilabel.

### Assemblage des achats réels et shortlists

In [1]:
import pandas as pd
import numpy as np
import swifter

%load_ext autoreload
%autoreload 2

# Centers of the six age buckets; they match the numeric part of the
# `age_around_<center>` column names used by the feature cells below.
ages_centers = [15, 25, 35, 45, 55, 65]

In [3]:
# Load the pickled training article lists and preview the first rows.
data_train = pd.read_pickle('pickles/article_lists_train.pkl')
data_train.head()

Unnamed: 0,customer_id,purchase_list,cross_list,pair_list,repurchase_list,length,shortlist
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[0624486001],"[0909370001, 0924243001, 0865799006, 091852200...","[0111593001, 0111593001, 0123173001, 011158600...",,100,"[0111593001, 0123173001, 0111586001, 015834000..."
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[0827487003],"[0805947001, 0809238005, 0809238001, 071479002...","[0377277002, 0759871002, 0759871002, 075987100...","[0621381012, 0880017001, 0640021012]",100,"[0621381012, 0880017001, 0640021012, 037727700..."
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[0640021019, 0757926001, 0788575004]","[0448509014, 0706016002, 0715624001, 070601600...","[0759871002, 0759871002, 0759871002, 070601600...","[0556255001, 0399136061, 0732842021, 073284201...",100,"[0556255001, 0399136061, 0732842021, 073284201..."
3,000525e3fe01600d717da8423643a8303390a055c578ed...,[0874110016],"[0809238005, 0809238001, 0448509014, 080594700...",,,100,"[0809238005, 0809238001, 0448509014, 080594700..."
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[0915529005, 0158340001, 0448509014, 048663900...","[0909370001, 0918522001, 0924243001, 086579900...","[0158340001, 0759871002, 0759871001, 070601600...","[0806388001, 0933989002, 0456163060, 092974400...",100,"[0806388001, 0933989002, 0456163060, 092974400..."


In [4]:
# Build a dataset with one row per customer-article pair.
from utils.list_to_uniclass import ListToUniclass

# ListToUniclass is a project transformer (utils/list_to_uniclass.py, not shown here).
list_to_uniclass = ListToUniclass()

data_train_uniclass = list_to_uniclass.fit_transform(data_train)

Pandas Apply: 100%|██████████| 68439/68439 [01:18<00:00, 874.52it/s]


In [5]:
# Reduce the number of samples: keep every positive row and a random subset of
# the negatives, at most NEGATIVES_PER_POSITIVE negatives per positive.
NEGATIVES_PER_POSITIVE = 50
SAMPLING_SEED = 42  # fixed seed so that the saved dataset can be rebuilt identically

data_train_neg = data_train_uniclass[data_train_uniclass['label'] == 0]
data_train_pos = data_train_uniclass[data_train_uniclass['label'] == 1]

# Cap the request at the number of available negatives: DataFrame.sample raises
# when asked for more rows than exist (replace=False is the default).
n_negatives = min(len(data_train_neg), len(data_train_pos) * NEGATIVES_PER_POSITIVE)
data_train_neg = data_train_neg.sample(n = n_negatives, random_state = SAMPLING_SEED)

data_train_uniclass = pd.concat([data_train_neg, data_train_pos])

In [3]:
# Add the features.
from utils.add_uniclass_features import AddUniclassFeatures

# Article and customer tables, read from the project's pickles.
articles = pd.read_pickle('pickles/articles_second_iteration.pkl')
customers = pd.read_pickle('pickles/customers_second_iteration.pkl')


In [8]:
# AddUniclassFeatures is a project transformer (utils/add_uniclass_features.py, not shown here).
# NOTE(review): presumably it joins the article and customer columns onto each
# customer-article row — confirm against the helper's source.
add_uniclass_features = AddUniclassFeatures(articles, customers)
data_train_uniclass = add_uniclass_features.fit_transform(data_train_uniclass)

Ajout de features 


In [12]:
# age_ratio: sum over the age buckets of (customer value * article value), i.e.
# the dot product of the `age_around_<center>_customer` and
# `age_around_<center>_article` columns.
# Computed column-wise instead of with a row-wise apply: the values are the same
# (same terms, added in the same order) but no Python call is made per row.
# `ages_centers` is the bucket list defined in the first cell of the notebook.
data_train_uniclass['age_ratio'] = sum(
    data_train_uniclass[f'age_around_{center}_customer']
    * data_train_uniclass[f'age_around_{center}_article']
    for center in ages_centers
)

In [14]:
def _index_group_value(row):
    """Return the row's value in the column named after its article's index group.

    The column name is the lower-cased `index_group_name`, cut at the first '/'.
    """
    group_column = row['index_group_name'].lower().split('/')[0]
    return row[group_column]


data_train_uniclass['index_ratio'] = data_train_uniclass.swifter.apply(_index_group_value, axis = 1)

Pandas Apply: 100%|██████████| 1843701/1843701 [00:14<00:00, 124966.48it/s]


In [16]:
# Check the class balance of the uniclass dataset (0 = negative, 1 = positive).
data_train_uniclass['label'].value_counts()

0    1807550
1      36151
Name: label, dtype: int64

-> Même après le sous-échantillonnage des négatifs (50 négatifs pour 1 positif), le jeu de données reste déséquilibré : il faudra certainement rééquilibrer les données d'entraînement pour obtenir un résultat optimal.

In [17]:
# Persist the uniclass training set to disk.
data_train_uniclass.to_pickle('pickles/second_iteration_data_train_uniclass.pkl')

In [18]:
# Preview the first rows of the uniclass training frame.
data_train_uniclass.head(5)

Unnamed: 0,customer_id,article_id,label,in_pair_list,in_repurchase_list,in_cross_list,product_code,prod_name,product_type_name,product_group_name,...,age_around_15_customer,age_around_25_customer,age_around_35_customer,age_around_45_customer,age_around_55_customer,age_around_65_customer,postal_code_group,group,age_ratio,index_ratio
0,ed99cc34c86d4871bfddddc012f0ce2bac9ad14c7fc61f...,714790020,0,100,100,27,714790,Mom Fit Ultra HW,Trousers,Garment Lower body,...,0.0,0.7,0.3,0.0,0.0,0.0,5,9,0.389226,0.3
1,55b4916d32a641e3626a4da567bead084c62ca4ab97d58...,871710001,0,100,100,46,871710,SPEED BRUNO SHIRT,Shirt,Garment Upper body,...,0.2,0.8,0.0,0.0,0.0,0.0,3,9,0.45637,0.222222
2,c9b428898d8353d2ae7470ef420b708825c13be2fecd41...,935892001,0,100,100,72,935892,LW (J) Conc PRICE SWEATSHIRT,Bra,Underwear,...,0.5,0.5,0.0,0.0,0.0,0.0,5,0,0.192645,0.933333
3,67852e732eda81a5f8e5661fe114e5ca17b870ba2df063...,797710006,0,100,100,63,797710,Teddy turtleneck,Top,Garment Upper body,...,0.0,0.0,0.0,0.4,0.6,0.0,4,5,0.232025,0.846827
4,0b423caa0ad781da963730635e7eac1c7653662ef8f7ee...,923758001,0,100,100,14,923758,Vanessa,Unknown,Unknown,...,0.0,0.0,0.0,0.3,0.7,0.0,4,4,0.156613,0.717949


### Création du dataset pour entraînement multilabel

In [2]:
from sklearn.base import TransformerMixin

class ListToMultiLabel(TransformerMixin):
    """Turn per-customer article lists into one wide row per customer.

    For every shortlist rank ``i`` in ``range(n_items)`` the transformer appends:

    * ``article_id_{i}``: the article at that rank, as a "0"-prefixed string;
    * ``{i}_label``, ``{i}_in_pair_list``, ``{i}_in_repurchase_list`` and
      ``{i}_in_cross_list``: 0/1 flags telling whether that article is in the
      customer's ``purchase_list`` / ``pair_list`` / ``repurchase_list`` /
      ``cross_list``;
    * the columns of ``articles`` for that article (a column whose name already
      exists on the customer side receives the suffix ``_{i}``);
    * ``age_ratio_{i}`` and ``index_ratio_{i}``.

    Parameters
    ----------
    articles : pandas.DataFrame
        Article table. Must hold ``article_id`` (castable to int32),
        ``index_group_name`` and the ``age_around_<center>`` columns.
    customers : pandas.DataFrame
        Customer table keyed by ``customer_id``. Must hold the
        ``age_around_<center>`` columns and one column per lower-cased index
        group name (text before the first '/').
    n_items : int, default 100
        Number of shortlist ranks to expand. Every shortlist must hold at least
        this many articles, otherwise ``transform`` raises ``IndexError``.
    """

    # Centers of the age buckets, i.e. the numeric part of `age_around_<center>`.
    AGE_CENTERS = (15, 25, 35, 45, 55, 65)

    # List-valued and bookkeeping columns of the input that are not features.
    LIST_COLUMNS = ['purchase_list', 'cross_list', 'pair_list',
                    'repurchase_list', 'length', 'shortlist', 'shortlist_length']

    def __init__(self, articles, customers, n_items = 100):
        # Keep a private copy with an int32 key: the original called astype()
        # without storing its result, so the cast never happened. Using assign()
        # also leaves the caller's frame untouched.
        self.articles = articles.assign(article_id = articles['article_id'].astype('int32'))
        self.customers = customers
        self.n_items = n_items

    def fit(self, dataset, y = None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    @staticmethod
    def _membership_flag(article, candidates):
        """Return 1 when `article` is in `candidates`, else 0.

        A missing list (None or a float such as NaN) counts as empty instead of
        raising TypeError.
        """
        if candidates is None or isinstance(candidates, float):
            return 0
        return 1 if article in candidates else 0

    def _rank_chunk(self, dataset, i, common_columns):
        """Build the block of columns describing the article at shortlist rank `i`."""
        id_column = f"article_id_{i}"
        chunk = dataset.copy()

        ranked_articles = [shortlist[i] for shortlist in chunk['shortlist']]
        chunk[id_column] = ranked_articles

        # Membership flags are computed on the raw ids, before the int32 cast,
        # so they are compared with the list elements in their original type.
        flag_sources = (
            (f"{i}_label", 'purchase_list'),
            (f"{i}_in_pair_list", 'pair_list'),
            (f"{i}_in_repurchase_list", 'repurchase_list'),
            (f"{i}_in_cross_list", 'cross_list'),
        )
        for flag_column, list_column in flag_sources:
            chunk[flag_column] = [
                self._membership_flag(article, candidates)
                for article, candidates in zip(ranked_articles, chunk[list_column])
            ]

        # Cast the key so that both sides of the merge are int32 (the original
        # discarded the result of this astype()).
        chunk[id_column] = chunk[id_column].astype('int32')

        # validate: the final concat pairs rows by position, so an article id
        # matching several rows of `articles` must fail loudly instead of
        # silently adding rows.
        chunk = chunk.merge(self.articles, left_on = id_column, right_on = 'article_id',
                            how = 'left', suffixes = ('', f'_{i}'), validate = 'many_to_one')

        # Back to the "0"-prefixed string form of the id.
        chunk[id_column] = "0" + chunk[id_column].astype(str)

        # Sum over the age buckets of customer value * article value; the article
        # columns carry the `_{i}` suffix because they collide with the customer ones.
        chunk[f'age_ratio_{i}'] = sum(
            chunk[f'age_around_{center}'] * chunk[f'age_around_{center}_{i}']
            for center in self.AGE_CENTERS
        )

        # Value of the customer column named after the article's index group
        # (lower-cased, cut at the first '/'). Done per distinct group rather
        # than per row; rows without an index group stay NaN.
        group_columns = chunk['index_group_name'].str.lower().str.split('/').str[0]
        index_ratio = pd.Series(np.nan, index = chunk.index)
        for group_column in group_columns.dropna().unique():
            index_ratio = index_ratio.mask(group_columns == group_column, chunk[group_column])
        chunk[f'index_ratio_{i}'] = index_ratio

        return chunk.drop(columns = common_columns)

    def transform(self, dataset):
        """Return one row per input row: customer features plus `n_items` article blocks.

        `dataset` must hold `customer_id` and every column of `LIST_COLUMNS`.
        """
        # Use the tables given to the constructor, not notebook-level globals.
        dataset = dataset.merge(self.customers, on = 'customer_id', how = 'left')

        # Columns shared by every chunk; they are kept only once, through `dataset`.
        common_columns = [*self.LIST_COLUMNS, *self.customers.columns]

        chunks = []
        for i in range(self.n_items):
            print(f"\r Process item n°{i + 1}", end = "")
            chunks.append(self._rank_chunk(dataset, i, common_columns))

        # NOTE(review): article columns whose name does not collide with a
        # customer column keep their bare name in every chunk, so they appear
        # once per rank under the same label in the result. Left unchanged to
        # preserve the existing output schema.
        return pd.concat([dataset, *chunks], axis = 1).drop(columns = self.LIST_COLUMNS)
            
            
            

In [4]:
# Reload the article lists from the same pickle as at the top of the notebook.
data_train = pd.read_pickle('pickles/article_lists_train.pkl')

In [5]:
# Cleaning: record each shortlist's size and keep the rows holding at least 100 articles.
data_train['shortlist_length'] = data_train['shortlist'].map(len)
data_train = data_train[data_train['shortlist_length'].ge(100)]

In [6]:
# Expand every customer's shortlist into per-rank article columns.
list_to_multilabel = ListToMultiLabel(articles, customers)

data_train_multiclass = list_to_multilabel.fit_transform(data_train)

 Process item n°100

: 

: 

In [19]:
# Attach the customer columns to the filtered training rows.
dataset = data_train.merge(customers, on = 'customer_id', how = 'left')

In [21]:
# Remove the list-valued and bookkeeping columns, keeping only the customer features.
dataset = dataset.drop(columns = [
    'purchase_list', 'cross_list', 'pair_list',
    'repurchase_list', 'length', 'shortlist', 'shortlist_length',
])

In [22]:
# Put the customer columns side by side with the per-rank article columns.
# NOTE(review): ListToMultiLabel.transform, as defined in this notebook, already
# returns the customer columns (it concatenates the customer-merged dataset with
# the per-rank chunks). Concatenating `dataset` again would duplicate those
# columns — confirm this cell is still needed.
data_train_multiclass = pd.concat([dataset, data_train_multiclass], axis = 1)

In [23]:
# Persist the multilabel training set to disk and preview its first rows.
data_train_multiclass.to_pickle('pickles/second_iteration_data_train_multilabel.pkl')
data_train_multiclass.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,average_cart_articles,average_cart_price,total_carts,...,age_around_15_99,age_around_25_99,age_around_35_99,age_around_45_99,age_around_55_99,age_around_65_99,repurchases_99,repurchase_interval,age_ratio_99,index_ratio_{i}
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,0,0,ACTIVE,,27,43cbf97df3d118b937551fb21a08d513bfb2e58223315f...,3.875,0.112502,8,...,0.077955,0.420791,0.240617,0.126628,0.098456,0.02465,0.039074,0.069786,0.384756,0.645161
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,1,1,ACTIVE,Regularly,33,d647e4ede3d0eb4ce0750440a110350b5f4c758165d89d...,2.25,0.067388,8,...,0.089544,0.487208,0.208454,0.118799,0.077642,0.012236,0.066741,0.033185,0.264205,0.944444
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,0,0,ACTIVE,,29,72afbb92c0200628bfa8f983c241eb0dc14e107f87d95b...,4.571429,0.140893,7,...,0.066545,0.302182,0.202909,0.231091,0.164,0.027636,0.045455,0.021818,0.262473,0.875
3,000525e3fe01600d717da8423643a8303390a055c578ed...,1,1,ACTIVE,Regularly,25,ed323346483de9f9b9ac7d73d34e0c87b5946d09da3b07...,1.0,0.025407,1,...,0.133696,0.3707,0.152262,0.170551,0.140461,0.02405,0.054204,0.502668,0.3707,1.0
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0,0,ACTIVE,,23,3119ea10ffe5ac3419b9127589a61b33e1ae38ecbb997b...,3.272727,0.108072,22,...,0.15026,0.432102,0.079084,0.108897,0.163736,0.049584,0.058273,0.04145,0.375734,0.694444
