In [1]:
import math
import datetime

import click
import numpy as np
import torch
from dgl.data.utils import save_graphs
from environment import Environment
from parameters import Parameters
from src.classes.dataset import Dataset
from src.get_embeddings import get_embeddings
from src.max_margin_loss import max_margin_loss
from src.model.conv_model import ConvModel
from src.train_loop import train_loop

from src.utils_data import assign_graph_features
from src.utils import read_data, save_txt, save_outputs
from src.utils_vizualization import plot_train_loss
from src.metrics import (create_already_bought, create_ground_truth,
                         get_metrics_at_k, get_recommendation_tensor, precision_at_k)
from src.evaluation import explore_recs, explore_sports, check_coverage
from presplit import presplit_data

from logging_config import get_logger

# Logger for this notebook, obtained from the project's logging configuration.
log = get_logger(__name__)
# Load IPython's autoreload extension in mode 2: modules are reloaded before
# each cell runs, so edits to the project's source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

### Préparation des données et du modèle

In [2]:
# Runtime environment for this session (later cells read `environment.device`).
environment = Environment()

# Hyperparameters of this run, in the exact form handed to `Parameters`.
hyperparameters = {
    "aggregator_hetero": "mean",
    "aggregator_type": "mean",
    "clicks_sample": 0.3,
    "delta": 0.266,
    "dropout": 0.01,
    "hidden_dim": 256,
    "out_dim": 128,
    "embedding_layer": True,
    "edge_batch_size": 2048,
    "embedding_batch_size": 2048,
    "lr": 0.00017985194246308484,
    "n_layers": 4,
    "neg_sample_size": 10,
    "norm": True,
    "use_popularity": True,
    "weight_popularity": 0.5,
    "days_popularity": 7,
    "purchases_sample": 0.5,
    "prediction_layer": "cos",
    "use_recency": True,
    "num_workers": 1,
    "partial_sampling_num_neighbors": 5,
    "num_epochs": 20,
}
parameters = Parameters(hyperparameters)


In [3]:
# Create the full training set from the environment and hyperparameters.
dataset = Dataset(environment, parameters)

In [4]:
from src.classes.graphs import Graphs

# Build the project's graph container from the dataset and hyperparameters.
# Later cells read `graphs.history_graph` and `graphs.prediction_graph` from it.
graphs = Graphs(dataset, parameters)

In [5]:
from src.get_dimension_dictionnary import get_dimension_dictionnary

# Dimension lookup derived from the graphs and hyperparameters; it is the first
# argument given to ConvModel further down. The bare name displays its content.
dim_dict = get_dimension_dictionnary(graphs, parameters)
dim_dict

{'customer': 19, 'article': 600, 'edge': 5, 'out': 128, 'hidden': 256}

In [7]:
from src.classes.dataloaders import DataLoaders

# Container exposing the dataloaders used below (e.g. `dataloader_train_loss`).
dataloaders = DataLoaders(graphs, dataset, parameters, environment)

In [8]:
# Instantiate the convolutional model from the dimension lookup and hyperparameters.
model = ConvModel(dim_dict, parameters)


In [9]:
# Run the project's training loop with the max-margin loss and metric
# computation enabled; its three results are unpacked for later inspection.
trained_model, viz, best_metrics = train_loop(
    model=model, graphs=graphs, dataset=dataset, dataloaders=dataloaders,
    loss_fn=max_margin_loss, get_metrics=True,
    parameters=parameters, environment=environment,
)

0 20
Starting training.
Epoch 00000 || TRAINING Loss 0.04450 | Precision at k 0.013% || VALIDATION Loss 0.26514 | Precision at k 0.014% 
Epoch took 0:00:47.045556 

 Process valid batches...              Epoch 00001 | Training Loss 0.04375 | Validation Loss 0.26514 | 
Epoch took 0:00:06.289339 

Process valid batch 22 / 27             Epoch 00002 | Training Loss 0.04312 | Validation Loss 0.25957 | 
Epoch took 0:00:07.133943 

Train batch 1 / 27 : Get embeddings...                   

KeyboardInterrupt: 

In [15]:
                
# Precision on the validation customers. Recommendations are produced in small
# batches and scored in larger chunks, so only one chunk of recommendations is
# held at a time.
customers_per_batch = 200
# Number of customers accumulated before precision is computed on them.
customers_per_evaluation = 5000
length = len(dataset.customers_nid_valid)

# The article embeddings are the same for every batch: move them to the device once.
article_embeddings = graphs.prediction_graph.nodes['article'].data['h'].to(environment.device)

precision_list = np.array([])
recommendation_chunks = []
chunk_start = 0  # position (in customers_nid_valid) of the first customer not yet scored
current_index = 0

while current_index < length:

    customer_nids = dataset.customers_nid_valid[current_index: current_index + customers_per_batch]

    print(f"\rProcessing valid recommendations for customers {current_index} - {current_index + customers_per_batch}", end = "")
    new_recommendations = get_recommendation_tensor({
        'article': article_embeddings,
        'customer': graphs.prediction_graph.nodes['customer'].data['h'][customer_nids].to(environment.device),
    }, parameters, environment)

    recommendation_chunks.append(new_recommendations)
    current_index += customers_per_batch

    # Score once enough customers are pending, and always on the final batch so
    # the trailing customers are not silently dropped.
    if current_index - chunk_start >= customers_per_evaluation or current_index >= length:
        recommendations = torch.cat(recommendation_chunks, dim=0)

        # The ids handed to precision_at_k must cover every row of
        # `recommendations`, not just the last batch. Slicing clamps at the
        # end of the sequence, so the final chunk may be shorter.
        chunk_customer_nids = dataset.customers_nid_valid[chunk_start:current_index]

        precision = precision_at_k(recommendations, chunk_customer_nids, dataset)
        precision_list = np.append(precision_list, precision)

        recommendation_chunks = []
        chunk_start = current_index

Processing valid recommendations for customers 43200 - 43400

In [17]:
# Size of `dataset.customers_nid_valid` (presumably the validation customers'
# node ids — confirm against the Dataset class).
len(dataset.customers_nid_valid)

13797

In [11]:
# Probe the recommendations for one 200-customer window of the validation set.
# NOTE(review): if the validation set holds fewer than 20000 customers this
# window is empty and so is the resulting tensor.
window_start, window_size = 20000, 200
customer_nids = dataset.customers_nid_valid[window_start:window_start + window_size]

prediction_nodes = graphs.prediction_graph.nodes
new_recommendations = get_recommendation_tensor(
    {
        'article': prediction_nodes['article'].data['h'].to(environment.device),
        'customer': prediction_nodes['customer'].data['h'][customer_nids].to(environment.device),
    },
    parameters,
    environment,
)
new_recommendations

tensor([], device='cuda:0', size=(0, 12), dtype=torch.int32)

In [1]:
# Build the optimizer over the model's weights with the configured learning rate.
# Requires `parameters` and `model` to exist in the kernel already.
opt = parameters.optimizer(model.parameters(), lr=parameters.lr)


NameError: name 'parameters' is not defined

In [46]:
# Compute embeddings on the history graph, then copy the leading rows of each
# node type onto the prediction graph as its 'h' feature.
history_graph = graphs.history_graph
prediction_graph = graphs.prediction_graph

node_features = {
    node_type: history_graph.nodes[node_type].data['features']
    for node_type in ('article', 'customer')
}
embeddings = model.get_embeddings(history_graph, node_features)

n_articles = prediction_graph.num_nodes('article')
prediction_graph.nodes['article'].data['h'] = embeddings['article'][:n_articles]

n_customers = prediction_graph.num_nodes('customer')
prediction_graph.nodes['customer'].data['h'] = embeddings['customer'][:n_customers]


In [10]:
# Debugging aid: enable autograd anomaly detection so a failing backward pass
# reports the forward operation that produced the offending value. This slows
# execution and is meant to be switched off once the problem is found.
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7faf05afe6a0>

In [13]:
# One optimisation step on the first batch of the training-loss dataloader.
_, pos_g, neg_g, blocks = next(iter(dataloaders.dataloader_train_loss))

# Clear gradients left over from any previous backward pass; without this,
# re-running the cell would accumulate gradients across runs.
opt.zero_grad()

pos_score, neg_score = model(pos_g, neg_g, graphs.history_graph, {
    'article': graphs.history_graph.nodes['article'].data['features'],
    'customer': graphs.history_graph.nodes['customer'].data['features'],
})

# Replace NaN scores with 0 before they reach the loss.
pos_score = torch.nan_to_num(pos_score, nan=0)
neg_score = torch.nan_to_num(neg_score, nan=0)

loss = max_margin_loss(pos_score,
                       neg_score,
                       parameters=parameters,
                       environment=environment
                       )

# Back-propagate and apply the update.
loss.backward()
opt.step()
                    

Embeddings:  tensor([[0.0000, 0.0000, 0.0382,  ..., 0.0000, 0.0061, 0.0831],
        [0.0000, 0.0833, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0164, 0.0000, 0.0855,  ..., 0.0000, 0.0138, 0.1343],
        ...,
        [0.0000, 0.1210, 0.0000,  ..., 0.0711, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0666,  ..., 0.0253, 0.0000, 0.0834],
        [0.0000, 0.0000, 0.0502,  ..., 0.0000, 0.0074, 0.0970]],
       grad_fn=<SliceBackward0>)


In [9]:
# Reset the history lists stored on the model: one list per (split, metric) pair.
for history_split in ('train', 'val'):
    for history_metric in ('loss', 'precision', 'recall', 'coverage'):
        setattr(model, f'{history_split}_{history_metric}_list', [])

# Bookkeeping for the training loop.
best_metrics = {}  # For visualization
max_metric = -0.1
patience_counter = 0  # For early stopping
min_loss = 1.1

# Optimizer over the model's weights with the configured learning rate.
opt = parameters.optimizer(model.parameters(), lr=parameters.lr)


### Entraînement (1 itération)

In [10]:
# Move the model's prediction component to the configured device.
model.predict.to(environment.device)

CosinePrediction()

In [18]:
# Switch to training mode and clear stale gradients before the forward pass.
model.train()
opt.zero_grad()

# Forward pass over the whole history graph using its stored node features.
history_graph = graphs.history_graph
node_features = {
    'article': history_graph.nodes['article'].data['features'],
    'customer': history_graph.nodes['customer'].data['features'],
}
embeddings = model(history_graph, node_features)

In [19]:
# Copy the leading rows of each node type's embeddings onto the prediction
# graph as its 'h' feature.
prediction_graph = graphs.prediction_graph
for node_type in ('article', 'customer'):
    node_count = prediction_graph.num_nodes(node_type)
    prediction_graph.nodes[node_type].data['h'] = embeddings[node_type][:node_count]

In [20]:
# Start from two distinct empty score tensors on the configured device.
pos_score, neg_score = (torch.tensor([]).to(environment.device) for _ in range(2))

In [42]:
# Move the model's prediction component to the configured device.
model.predict.to(environment.device)

CosinePrediction()

In [23]:
# Fetch the first batch of the training-loss dataloader and show its blocks.
# The scoring statements that previously followed the first `break` could
# never execute and have been removed. This cell therefore leaves `pos_score`
# and `neg_score` exactly as they were.
for _, pos_g, neg_g, blocks in dataloaders.dataloader_train_loss:
    print(blocks)
    break


[Block(num_src_nodes={'article': 2937, 'customer': 3691},
      num_dst_nodes={'article': 2937, 'customer': 3691},
      num_edges={('customer', 'buys', 'article'): 0},
      metagraph=[('customer', 'article', 'buys')])]


In [44]:
# Replace NaN entries in both score tensors with 0.
pos_score, neg_score = (
    torch.nan_to_num(scores, nan=0) for scores in (pos_score, neg_score)
)

In [45]:
# Max-margin loss between the positive and negative scores.
loss = max_margin_loss(pos_score, neg_score, parameters=parameters, environment=environment)


In [46]:
# Back-propagate the loss, then apply one optimizer update.
loss.backward()
opt.step()

### Métrique

In [72]:
# Precision over every customer of the prediction graph. Recommendations are
# produced in small batches and scored in larger chunks, so only one chunk of
# recommendations is held at a time.
customers_per_batch = 200
# Number of customers accumulated before precision is computed on them.
customers_per_evaluation = 5000
length = graphs.prediction_graph.num_nodes('customer')

# The article embeddings are the same for every batch: move them to the device once.
article_embeddings = graphs.prediction_graph.nodes['article'].data['h'].to(environment.device)

precision_list = np.array([])
recommendation_chunks = []
chunk_start = 0  # first customer id whose recommendations are not yet scored
current_index = 0

while current_index < length:

    print(f"\rProcessing recommendations for customers {current_index} - {current_index + customers_per_batch}", end = "")
    new_recommendations = get_recommendation_tensor({
        'article': article_embeddings,
        'customer': graphs.prediction_graph.nodes['customer'].data['h'][current_index: current_index + customers_per_batch].to(environment.device),
    }, parameters, environment)

    recommendation_chunks.append(new_recommendations)
    current_index += customers_per_batch

    # The last batch may be short: never report ids beyond the node count.
    chunk_end = min(current_index, length)

    # Score once enough customers are pending, and always on the final batch so
    # the trailing customers are not silently dropped.
    if chunk_end - chunk_start >= customers_per_evaluation or chunk_end == length:
        recommendations = torch.cat(recommendation_chunks, dim=0)

        # The ids handed to precision_at_k must cover every row of
        # `recommendations`, not just the last batch.
        customer_nids = range(chunk_start, chunk_end)

        precision = precision_at_k(recommendations, customer_nids, dataset)
        precision_list = np.append(precision_list, precision)

        recommendation_chunks = []
        chunk_start = chunk_end

# Average over the chunks (all of equal size except possibly the last one).
precision = np.mean(precision_list)


Processing recommendations for customers 25400 - 25600

KeyboardInterrupt: 

### Compilation des recommandations

In [4]:
# Set the `embedding_on_full_set` flag on the hyperparameters before the
# dataset, graphs and dataloaders are rebuilt from them.
parameters.embedding_on_full_set = True

In [5]:
# Rebuild the dataset so it reflects the current hyperparameters.
dataset = Dataset(environment, parameters)

In [7]:
from src.classes.graphs import Graphs

# Rebuild the graphs from the regenerated dataset. The hyperparameters are
# passed as well, matching the `Graphs(dataset, parameters)` call used in the
# data-preparation section of this notebook.
# NOTE(review): confirm against the current Graphs signature.
graphs = Graphs(dataset, parameters)

In [15]:
from src.classes.dataloaders import DataLoaders

# Rebuild the dataloaders on top of the regenerated graphs and dataset.
dataloaders = DataLoaders(graphs, dataset, parameters, environment)

In [31]:
# Inference only: use evaluation mode (dropout disabled) and no autograd
# tracking, so the embeddings written into the graph carry no computation graph.
model.eval()
with torch.no_grad():
    for input_nodes, output_nodes, blocks in dataloaders.dataloader_embedding:
        # Call the module itself rather than `.forward` so registered hooks run.
        embeddings = model(blocks, blocks[0].srcdata['features'])

        # Store the batch's embeddings at the rows of its output nodes.
        if 'customer' in output_nodes:
            graphs.history_graph.nodes['customer'].data['h'][output_nodes['customer'].long()] = embeddings['customer']

        if 'article' in output_nodes:
            graphs.history_graph.nodes['article'].data['h'][output_nodes['article'].long()] = embeddings['article']

        # NOTE(review): only the first batch is embedded because of this break.
        # Remove it to fill 'h' for every node served by the dataloader.
        break

layer 0
torch.Size([1000, 256])
torch.Size([3195, 256])
layer 1
torch.Size([1000, 256])
torch.Size([0, 256])
layer 2
torch.Size([1000, 256])
torch.Size([0, 256])
layer 3
torch.Size([1000, 128])
torch.Size([0, 128])
