In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import dgl
import torch as th

from sklearn.preprocessing import LabelEncoder

### Préparation des interactions Client - Article

- Extraction de N identifiants utilisateurs 
- Split entre le jeu d'entraînement et de test
- Compilation des features

In [2]:
transactions = pd.read_pickle('pickles/transactions.pkl')
transactions = transactions.sample(frac = 0.01)

In [10]:
customers = pd.read_pickle('pickles/customers_second_iteration.pkl')
articles = pd.read_pickle('pickles/articles_second_iteration.pkl')

In [40]:
customers['fashion_news_frequency'].replace({'None': 'NONE'}, inplace = True)

customers.drop(columns = ['baby', 'divided', 'ladieswear', 'menswear', 'age', 'average_cart_articles',
       'average_cart_price', 'total_carts', 'total_articles', 'total_price', 'sport', 'repurchases'], axis = 1, inplace = True)

# Typer les champs de catégorie
customers[['fashion_news_frequency','club_member_status']] = customers[['fashion_news_frequency','club_member_status']].astype('category')

In [41]:
from utils.dummify import Dummify

for column in customers.columns:
    if not isinstance(customers[column].dtype, pd.CategoricalDtype):
        continue

    dummies = pd.get_dummies(
        customers[column], prefix=column, prefix_sep=":")
    
    customers = pd.concat([customers, dummies], axis=1)
    customers.drop(columns=[column], axis=1, inplace=True)


# Nan values for average_cart_interval / repurchases_interval
customers = customers.fillna(0)


In [9]:
customers.to_pickle('pickles/customers_gnn_full.pkl')

In [11]:
from utils.dummify import Dummify
categories = ['product_type_name', 'product_group_name',
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name', 'perceived_colour_master_name',
       'department_name', 'index_name', 'index_group_name', 'section_name',
       'garment_group_name']

columns_to_drop = ['total_purchases', 'average_quantity',
       'average_price', 'age_around_15', 'age_around_25',
       'age_around_35', 'age_around_45', 'age_around_55', 'age_around_65',
       'repurchases', 'repurchase_interval']

articles.drop(columns = columns_to_drop, axis = 1, inplace = True)

articles[categories] = articles[categories].astype('category')

dummify = Dummify()

articles = dummify.transform(articles)

articles['has_image'] = articles['has_image'].astype(int)

In [13]:
articles.to_pickle('pickles/articles_gnn_full.pkl')
articles.head(1)

Unnamed: 0,article_id,has_image,product_type_name:Accessories set,product_type_name:Alice band,product_type_name:Baby Bib,product_type_name:Backpack,product_type_name:Bag,product_type_name:Ballerinas,product_type_name:Beanie,product_type_name:Belt,...,garment_group_name:Shorts,garment_group_name:Skirts,garment_group_name:Socks and Tights,garment_group_name:Special Offers,garment_group_name:Swimwear,garment_group_name:Trousers,garment_group_name:Trousers Denim,"garment_group_name:Under-, Nightwear",garment_group_name:Unknown,garment_group_name:Woven/Jersey/Knitted mix Baby
0,108775015,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(5)

neg_sampler = dgl.dataloading.as_edge_prediction_sampler(
    sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(1))

dataloader = dgl.dataloading.DataLoader(
    graph, 
    {'customer': th.tensor([0,1,2,3,4,5,6,7,8,9,10], dtype = th.int32)}, 
    sampler,
    batch_size=1024,
    shuffle=True,
    drop_last=False)

negative_dataloader = dgl.dataloading.DataLoader(
    graph, 
    {
        'will-buy': th.tensor(purchases_to_predict[purchases_to_predict['validation_set'] == True].index.values, dtype = th.int32)
    },
    neg_sampler,
    batch_size=10,
    shuffle=True,
    drop_last=False,
    pin_memory=True)



### Essai sur la métrique

In [53]:
import torch as th

customers_index = th.tensor([11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0])
articles_index = th.tensor([18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

customers = th.randn((12, 32))
articles = th.randn((18, 32))
cos = th.nn.CosineSimilarity(dim=1, eps=1e-6)


output = cos(
    customers.reshape(-1, 32, 1), 
    articles.reshape(1, 32, -1)
)
sorted = th.argsort(output, dim = 1, descending = True)
sorted = sorted[:, 0:12]

print(sorted)
print(articles_index[sorted[:]])

reindexed = articles_index[sorted[:]]

reindexed[customers_index]

tensor([[ 9, 11,  6,  4, 14, 13, 16, 10, 17,  0,  3,  8],
        [12,  2, 15, 16, 17,  9, 11,  4,  8,  6,  1,  7],
        [17,  2,  6, 16, 14, 12,  1,  9, 13, 15, 11,  4],
        [ 4, 10, 17,  2, 13,  9,  3, 12,  8,  0, 11,  5],
        [ 4,  3,  9,  0,  7,  8, 17,  6, 14,  5, 10, 13],
        [ 0, 15,  1,  2, 17, 11,  9, 14, 16, 12,  3,  6],
        [ 7,  6, 12, 10, 15,  8, 14,  1,  4,  0,  9, 11],
        [ 9,  8,  2, 16,  6, 15, 12,  1, 11,  0,  5,  7],
        [ 0, 11,  2, 12,  1, 15, 14,  5,  9, 16,  3,  6],
        [ 4,  1, 10,  0, 11,  5, 12,  7,  8,  6, 13,  9],
        [12,  8,  7,  4,  9, 10,  0, 14,  1,  3,  2, 11],
        [ 4,  3,  5, 16, 17, 10, 12,  7,  0, 13, 15,  2]])
tensor([[ 9,  7, 12, 14,  4,  5,  2,  8,  1, 18, 15, 10],
        [ 6, 16,  3,  2,  1,  9,  7, 14, 10, 12, 17, 11],
        [ 1, 16, 12,  2,  4,  6, 17,  9,  5,  3,  7, 14],
        [14,  8,  1, 16,  5,  9, 15,  6, 10, 18,  7, 13],
        [14, 15,  9, 18, 11, 10,  1, 12,  4, 13,  8,  5],
        [18, 

tensor([[14, 15, 13,  2,  1,  8,  6, 11, 18,  5,  3, 16],
        [ 6, 10, 11, 14,  9,  8, 18,  4, 17, 15, 16,  7],
        [14, 17,  8, 18,  7, 13,  6, 11, 10, 12,  5,  9],
        [18,  7, 16,  6, 17,  3,  4, 13,  9,  2, 15, 12],
        [ 9, 10, 16,  2, 12,  3,  6, 17,  7, 18, 13, 11],
        [11, 12,  6,  8,  3, 10,  4, 17, 14, 18,  9,  7],
        [18,  3, 17, 16,  1,  7,  9,  4,  2,  6, 15, 12],
        [14, 15,  9, 18, 11, 10,  1, 12,  4, 13,  8,  5],
        [14,  8,  1, 16,  5,  9, 15,  6, 10, 18,  7, 13],
        [ 1, 16, 12,  2,  4,  6, 17,  9,  5,  3,  7, 14],
        [ 6, 16,  3,  2,  1,  9,  7, 14, 10, 12, 17, 11],
        [ 9,  7, 12, 14,  4,  5,  2,  8,  1, 18, 15, 10]])

In [69]:
df = pd.Series(reindexed[customers_index].tolist()).rename('prediction')
pd.concat([df, df], axis = 1)

Unnamed: 0,prediction,prediction.1
0,"[14, 15, 13, 2, 1, 8, 6, 11, 18, 5, 3, 16]","[14, 15, 13, 2, 1, 8, 6, 11, 18, 5, 3, 16]"
1,"[6, 10, 11, 14, 9, 8, 18, 4, 17, 15, 16, 7]","[6, 10, 11, 14, 9, 8, 18, 4, 17, 15, 16, 7]"
2,"[14, 17, 8, 18, 7, 13, 6, 11, 10, 12, 5, 9]","[14, 17, 8, 18, 7, 13, 6, 11, 10, 12, 5, 9]"
3,"[18, 7, 16, 6, 17, 3, 4, 13, 9, 2, 15, 12]","[18, 7, 16, 6, 17, 3, 4, 13, 9, 2, 15, 12]"
4,"[9, 10, 16, 2, 12, 3, 6, 17, 7, 18, 13, 11]","[9, 10, 16, 2, 12, 3, 6, 17, 7, 18, 13, 11]"
5,"[11, 12, 6, 8, 3, 10, 4, 17, 14, 18, 9, 7]","[11, 12, 6, 8, 3, 10, 4, 17, 14, 18, 9, 7]"
6,"[18, 3, 17, 16, 1, 7, 9, 4, 2, 6, 15, 12]","[18, 3, 17, 16, 1, 7, 9, 4, 2, 6, 15, 12]"
7,"[14, 15, 9, 18, 11, 10, 1, 12, 4, 13, 8, 5]","[14, 15, 9, 18, 11, 10, 1, 12, 4, 13, 8, 5]"
8,"[14, 8, 1, 16, 5, 9, 15, 6, 10, 18, 7, 13]","[14, 8, 1, 16, 5, 9, 15, 6, 10, 18, 7, 13]"
9,"[1, 16, 12, 2, 4, 6, 17, 9, 5, 3, 7, 14]","[1, 16, 12, 2, 4, 6, 17, 9, 5, 3, 7, 14]"


In [2]:
import pandas as pd
articles = pd.read_pickle('pickles/articles_gnn_full.pkl')
customers = pd.read_pickle('pickles/customers_gnn_full.pkl')

In [4]:
customers.isna().sum()

customer_id                              0
FN                                       0
Active                                   0
average_cart_interval               457672
repurchases_interval                  9699
age_around_15                            0
age_around_25                            0
age_around_35                            0
age_around_45                            0
age_around_55                            0
age_around_65                            0
postal_code_group                        0
group                                    0
club_member_status:ACTIVE                0
club_member_status:LEFT CLUB             0
club_member_status:NON ACTIVE            0
club_member_status:PRE-CREATE            0
fashion_news_frequency:Monthly           0
fashion_news_frequency:NONE              0
fashion_news_frequency:Regularly         0
dtype: int64