## Entraînement de modèles uniclasse et multilabels

Sur la base des données assemblées sur le notebook `second_iteration_dataset.ipynb`

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

### 1. Modèle uniclasse
#### XGBoost

In [3]:
data_uniclass = pd.read_pickle('pickles/second_iteration_data_train_uniclass.pkl')
data_uniclass.head(5)

Unnamed: 0,customer_id,article_id,label,in_pair_list,in_repurchase_list,in_cross_list,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,...,repurchases_customer,repurchases_interval,age_around_15_customer,age_around_25_customer,age_around_35_customer,age_around_45_customer,age_around_55_customer,age_around_65_customer,postal_code_group,group
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,748269009,0,100,0,100,Top,Garment Upper body,Solid,Light Beige,...,2,1.0,0.0,0.8,0.2,0.0,0.0,0.0,1,1
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,881919001,0,100,1,100,Sweater,Garment Upper body,Solid,Off White,...,2,1.0,0.0,0.8,0.2,0.0,0.0,0.0,1,1
2,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,895610005,0,100,2,100,Trousers,Garment Lower body,Solid,Light Beige,...,2,1.0,0.0,0.8,0.2,0.0,0.0,0.0,1,1
3,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,902229003,0,100,3,100,Top,Garment Upper body,Solid,Black,...,2,1.0,0.0,0.8,0.2,0.0,0.0,0.0,1,1
4,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,559601019,0,100,4,100,Swimwear bottom,Swimwear,Other structure,White,...,2,1.0,0.0,0.8,0.2,0.0,0.0,0.0,1,1


In [4]:
# Categorical fields
categories = ["product_type_name", "product_group_name", 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'club_member_status', 'fashion_news_frequency']

for category in categories:
    data_uniclass[category] = data_uniclass[category].astype('category')

In [5]:
# Division du dataset en entraînement / validation
# Doit être fait par client afin de préserver le scoring MAP@12.
customers = pd.Series(data_uniclass['customer_id'].unique()).rename('customer_id')

customers_train, customers_valid = train_test_split(customers, test_size=0.2)

In [6]:
data_train = data_uniclass.merge(customers_train, on='customer_id', how='inner')
data_valid = data_uniclass.merge(customers_valid, on='customer_id', how='inner')

X_valid = data_valid.drop(['label'], axis = 1)
Y_valid = data_valid['label']

In [7]:
# Rééquilibrage du jeu de données d'entraînement
data_train_pos = data_train[data_train['label'] == 1]
data_train_neg = data_train[data_train['label'] == 0]

In [8]:
# On garde quand même un nombre supérieur de labels négatifs afin de ne pas être trop éloigné de la réalité des données de test. 
ratio = 2
 
data_train_neg = data_train_neg.sample(n = len(data_train_pos) * ratio)

In [9]:
data_train_balanced = pd.concat([data_train_pos, data_train_neg])
data_train_balanced.reset_index(drop = True, inplace = True)

In [10]:
X_train = data_train_balanced.drop(['label'], axis = 1)
Y_train = data_train_balanced['label']

In [11]:
### CHECKPOINT
X_train.to_pickle('pickles/data_uniclass_train_X.pkl')
X_valid.to_pickle('pickles/data_uniclass_valid_X.pkl')
Y_train.to_pickle('pickles/data_uniclass_train_Y.pkl')
Y_valid.to_pickle('pickles/data_uniclass_valid_Y.pkl')

In [38]:
X_train = pd.read_pickle('pickles/data_uniclass_train_X.pkl')
X_valid = pd.read_pickle('pickles/data_uniclass_valid_X.pkl')
Y_train = pd.read_pickle('pickles/data_uniclass_train_Y.pkl')
Y_valid = pd.read_pickle('pickles/data_uniclass_valid_Y.pkl')

In [12]:
train = xgb.DMatrix(data=X_train.drop(['customer_id', 'article_id'], axis = 1), label=Y_train, enable_categorical = True)
valid = xgb.DMatrix(data=X_valid.drop(['customer_id', 'article_id'], axis = 1), label=Y_valid, enable_categorical = True)

In [13]:
params = {'booster': 'gbtree', 'learning_rate': 0.05, 'objective': 'binary:logistic'}
xgb1 = xgb.train(params = params, dtrain = train,num_boost_round=3200, evals=[(train, 'train'), (valid, 'eval')])

[0]	train-logloss:0.68233	eval-logloss:0.67335
[1]	train-logloss:0.67256	eval-logloss:0.65490
[2]	train-logloss:0.66375	eval-logloss:0.63798
[3]	train-logloss:0.65553	eval-logloss:0.62198
[4]	train-logloss:0.64801	eval-logloss:0.60738
[5]	train-logloss:0.64111	eval-logloss:0.59362
[6]	train-logloss:0.63469	eval-logloss:0.58073
[7]	train-logloss:0.62875	eval-logloss:0.56869
[8]	train-logloss:0.62324	eval-logloss:0.55749
[9]	train-logloss:0.61828	eval-logloss:0.54703
[10]	train-logloss:0.61355	eval-logloss:0.53707
[11]	train-logloss:0.60923	eval-logloss:0.52782
[12]	train-logloss:0.60515	eval-logloss:0.51905
[13]	train-logloss:0.60133	eval-logloss:0.51083
[14]	train-logloss:0.59778	eval-logloss:0.50312
[15]	train-logloss:0.59443	eval-logloss:0.49582
[16]	train-logloss:0.59133	eval-logloss:0.48888
[17]	train-logloss:0.58849	eval-logloss:0.48246
[18]	train-logloss:0.58576	eval-logloss:0.47636
[19]	train-logloss:0.58324	eval-logloss:0.47058
[20]	train-logloss:0.58086	eval-logloss:0.46516
[2

In [14]:
preds = xgb1.predict(valid)

xgbpreds = pd.Series(np.where(preds > 0.5, 1, 0))

pd.crosstab(Y_valid.reset_index(drop = True), xgbpreds, rownames=['Classe réelle'], colnames=['Classe prédite'])

Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,634704,55721
1,2433,948


**Score MAP@12**

In [15]:
score_table = pd.concat([
    X_valid['customer_id'].reset_index(drop = True), 
    Y_valid.rename('label').reset_index(drop = True), 
    pd.Series(preds).rename('prediction')
], axis = 1).sort_values(
    ['customer_id', 'prediction'], ascending = False
)

In [16]:
from utils.map12_from_score_table import map12_from_score_table

map12 = map12_from_score_table(score_table)

print("Score MAP@12: ", map12['map12'].sum() / len(map12))

  lambda x: np.sum(


Score MAP@12:  0.022188921654841473


### Entraînement sur jeu de données multilabel

In [3]:
data_multiclass = pd.read_pickle('pickles/second_iteration_data_train_multilabel.pkl')

In [4]:
categories = ['club_member_status', 'fashion_news_frequency', 'club_member_status', 'FN']

categories_article = ["product_type_name", "product_group_name", 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name']

for category in categories_article:
    categories += [f"{category}"]
    
    for i in range(1, 50):
        categories += [f"{category}_{i}"]


#for category in categories:
#    data_multiclass[category] = data_multiclass[category].astype('category')
    
data_multiclass.drop(columns = categories, axis = 1, inplace = True)

In [7]:
data_train, data_valid = train_test_split(data_multiclass, test_size=0.2)

data_train.reset_index(drop = True, inplace = True)
data_valid.reset_index(drop = True, inplace = True)

In [8]:
not_features_columns = ['customer_id', 'article_id', 'purchased_list', 'cross_list', 'pair_list', 'repurchase_list', 'shortlist', 'shortlist_length', 'postal_code'] + [f"article_id_{i}" for i in range(0, 50)] + [f"{i}_label" for i in range(0, 50)]
labels_columns = [f"{i}_label" for i in range(0, 50)]

X_train = data_train.drop(not_features_columns, axis = 1)
Y_train = data_train[labels_columns]
X_valid = data_valid.drop(not_features_columns, axis = 1)
Y_valid = data_valid[labels_columns]

In [9]:
### CHECKPOINT
X_train.to_pickle('pickles/data_multilabel_train_X.pkl')
X_valid.to_pickle('pickles/data_multilabel_valid_X.pkl')
Y_train.to_pickle('pickles/data_multilabel_train_Y.pkl')
Y_valid.to_pickle('pickles/data_multilabel_valid_Y.pkl')
data_train.to_pickle('pickles/data_multilabel_train.pkl')
data_valid.to_pickle('pickles/data_multilabel_valid.pkl')

In [2]:
import pandas as pd 
import xgboost as xgb 

X_train = pd.read_pickle('pickles/data_multilabel_train_X.pkl')
X_valid = pd.read_pickle('pickles/data_multilabel_valid_X.pkl')
Y_train = pd.read_pickle('pickles/data_multilabel_train_Y.pkl')
Y_valid = pd.read_pickle('pickles/data_multilabel_valid_Y.pkl')
data_train = pd.read_pickle('pickles/data_multilabel_train.pkl')
data_valid = pd.read_pickle('pickles/data_multilabel_valid.pkl')

In [7]:
score_chunks = []
import time

for i in range(0, 50):
    print(f"Processing label {i}")
    
    train = xgb.DMatrix(data=X_train, label=Y_train[f"{i}_label"], enable_categorical = True)
    valid = xgb.DMatrix(data=X_valid, label=Y_valid[f"{i}_label"], enable_categorical = True)
    
    params = {'booster': 'gbtree', 'learning_rate': 0.05, 'objective': 'binary:logistic'}
    xgb1 = xgb.train(params = params, dtrain = train, num_boost_round=200, evals=[(train, 'train'), (valid, 'eval')])
    
    prediction = pd.Series(xgb1.predict(valid)).rename('prediction')
    labels = Y_valid[f"{i}_label"].rename('label')
    
    score_chunks.append(
        pd.concat([data_valid['customer_id'], labels, prediction], axis = 1)
    )
    
    time.sleep(2)


  transformed[data.columns[i]] = data[data.columns[i]]


[0]	train-logloss:0.64542	eval-logloss:0.64547
[1]	train-logloss:0.60228	eval-logloss:0.60233
[2]	train-logloss:0.56302	eval-logloss:0.56311
[3]	train-logloss:0.52720	eval-logloss:0.52732
[4]	train-logloss:0.49439	eval-logloss:0.49453
[0]	train-logloss:0.64548	eval-logloss:0.64554
[1]	train-logloss:0.60233	eval-logloss:0.60247
[2]	train-logloss:0.56314	eval-logloss:0.56329
[3]	train-logloss:0.52732	eval-logloss:0.52753
[4]	train-logloss:0.49453	eval-logloss:0.49479
[0]	train-logloss:0.64549	eval-logloss:0.64548
[1]	train-logloss:0.60239	eval-logloss:0.60236
[2]	train-logloss:0.56317	eval-logloss:0.56315
[3]	train-logloss:0.52740	eval-logloss:0.52737
[4]	train-logloss:0.49463	eval-logloss:0.49460


In [15]:
from sklearn.multioutput import MultiOutputClassifier
classifier = MultiOutputClassifier(xgb.XGBClassifier(tree_method='approx', learning_rate = 0.05, enable_categorical=True))

classifier.fit(X_train, Y_train)



ValueError: Experimental support for categorical data is not implemented for current tree method yet.

In [67]:
pred_proba = classifier.predict_proba(X_valid)


In [70]:
# Mise en forme du résultat
prediction = []

rows = pred_proba[0].shape[0]
labels = len(pred_proba)

for i in range(0, rows) :
    row = []
    for j in range (0, labels) :
        row.append(pred_proba[j][i][1])
    
    prediction.append(row)
    
prediction_dataframe = pd.DataFrame(prediction)
    

In [73]:
score_list = []

def populate_score_list(row):
    global score_list
    
    for i in range(0, 50):
        score_list.append([
            row['customer_id'],
            row[f'{i}_label'],
            row[i]
        ])

blank = pd.concat([data_valid, prediction_dataframe], axis = 1).apply(lambda x: populate_score_list(x), axis = 1)

In [74]:
score_table = pd.DataFrame(score_list, columns = ['customer_id', 'label', 'prediction'])

In [75]:
from utils.map12_from_score_table import map12_from_score_table

map12 = map12_from_score_table(score_table)

print("Score MAP@12: ", map12['map12'].sum() / len(map12))


  lambda x: np.sum(


Score MAP@12:  0.0984729760051194
