In [1]:
import os
GPU_id = 1
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


In [2]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel

import cudf

from preproc import *
from batchloader import *
from helpers import get_mean_reciprocal_rank, roc_auc_score

- In this notebook we want to benchmark the processing and training time for three diffrent models: 

- The two first models are using our CuDF processing workflow <a href=#cudf_workflow> section I </a>:
     1.  <a href=#first_model> Model 1 </a> : CuDF processing with CPU a copy
     2.  <a href=#second_model> Model 2 </a> : CuDF processing in-memory without copy    

           
 - In the second <a href=#fastai_workflow> section II </a>, we are using the Fastai processing workflow to get the scores of the best model found in the section I.  We directly process and creat databunch from data_pair_all.pkl dataframe 

In [3]:
%load_ext snakeviz
# load snakeviz if you want to run profiling 

<h1> <center> <a id=batchdatabunch>New Data Bunch </a></center> </h1> 

### Define a custom databunch fastai that takes a TensorBatchDataLoader instead of the usual torch DataLoader 

In [4]:
class BatchDataBunch(DataBunch):
    
    @classmethod
    def remove_tfm(cls, tfm:Callable)->None:
        "Remove `tfm` from `self.tfms`."
        if tfm in cls.tfms: cls.tfms.remove(tfm)
            
    @classmethod
    def add_tfm(cls,tfm:Callable)->None:
        "Add `tfm` to `self.tfms`."
        cls.tfms.append(tfm)

    
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs=None, 
                      num_workers:int=defaults.cpus, device:torch.device=None,
                      collate_fn:Callable=data_collate, tfms: List[Callable]=None, 
                       size:int=None, **kwargs)->'BatchDataBunch':
        
        
        cls.tfms = listify(tfms)
        
        
        val_bs = ifnone(val_bs, bs)
        
        datasets = [TensorBatchDataset(train_ds, batch_size=bs), 
                    TensorBatchDataset(valid_ds, batch_size=bs)]
        
        if valid_ds is not None:
            cls.empty_val = False
        else:
            cls.empty_val = True
            
        if test_ds is not None:
            datasets.append(TensorBatchDataset(test_ds, batch_size=bs))
        else: 
            datasets.append(test_ds)
        
        cls.device = defaults.device if device is None else device
        
        dls = [BatchDataLoader(d, shuffle=s, pin_memory=False, drop_last=False, device=cls.device) for d,s in
               zip(datasets,(True,False,False)) if d is not None]

        cls.path = path 
        
        cls.dls = dls
    
        
        
        assert not isinstance(dls[0],DeviceDataLoader)
        
        
        # load batch in device 
        
        if test_ds is not None:
            cls.train_dl, cls.valid_dl, cls.test_dl = dls
        else: 
            cls.train_dl, cls.valid_dl = dls
            
            
        cls.path = Path(path)
        return cls
    


- To use the new BatchDatabunch class, we have to build the following processed tensors ( using cudf)  : 
    - train : cat_tensor, cont_tensor, label_tensor 
    
    - valid : cat_tensor, cont_tensor, label_tensor 
    
    - test : cat_tensor, cont_tensor, label_tensor 
    
- The size of vocaublary of each categorical variable need to be known 

- The two first models are using our CuDF processing workflow <a href=#cudf_workflow> section II </a>:
     1.  <a href=#first_model> Model 1 </a> : CuDF processing with CPU a copy
     2.  <a href=#second_model> Model 2 </a> : CuDF processing in-memory without copy    

           
 - The <a href=#fastai_workflow> third model </a> will use the Fastai processing workflow: Directly process and creat databunch from data_pair_all.pkl dataframe 

<h1> <center>  <a id=cudf_workflow> Test of Tabular Learner with CuDF workflow </a></center> </h1>

**N.B:** For this section, you need to define the new custom BatchDataBunch class, if not go back to <a href=#batchdatabunch> section 1 </a>

<h2> 1. <a id=first_model> First model: Tabular Data copied to cpu </a> </h2>

In [4]:
to_cpu = True 

<h3> <a id=cudf_proc> Processing: Definition of train, validation and test tensors </a></h3>

In [5]:
# %%snakeviz 
# uncomment the line above to generate the snakeviz profile of preprocessing 

data_path = './parquet_data/data_pair_all'
TEST = 'test'
VALID = 'valid'
TRAIN = 'train'

start0 = time()
data = {}

############################
#                          #
# Fit processing train set #
#                          #
############################
start = time()
path = os.path.join(data_path,TRAIN+'.parquet' )
ds = cudf.read_parquet(path)
print(f"read {TRAIN} used {time()-start:.2f} seconds.")

# get variable names 
start = time()
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in ds.columns if i.startswith('is_')]
cont_names = ['price','candidate_order','item_count'] + [i for i in ds.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]
print(f"get variables names used {time()-start:.2f} seconds.")

# init the processing class 
proc = PreprocessDF(cat_names=cat_names, cont_names=cont_names, label_name='target', to_cpu=to_cpu)

# Fit training 
start = time()
x, y = proc.preproc_dataframe(ds, mode=TRAIN)
print(f"processing {TRAIN} used {time()-start:.2f} seconds.")
del ds
data[TRAIN] = (x, y)

############################
#                          #
# Transform test and valid #
#                          #
############################  
ds_name = [TEST, VALID]
for name in ds_name:
    path = os.path.join(data_path,name+'.parquet' )
    ds = cudf.read_parquet(path)

    print(f"read {name} used {time()-start:.2f} seconds.")
    start = time()
    x, y = proc.preproc_dataframe(ds, mode=name)
    print(f"processing {name} used {time()-start:.2f} seconds.")
    data[name] = (x, y)
    del ds

print(f"The whole processing used {time()-start0:.2f} seconds.")

read train used 8.57 seconds.
get variables names used 0.00 seconds.
processing train used 15.00 seconds.
read test used 15.70 seconds.
processing test used 10.14 seconds.
read valid used 11.06 seconds.
processing valid used 12.97 seconds.
The whole processing used 48.30 seconds.


<h3> Benchmark : Get the best (batch size, learning rate)</h3> 

- Fine tune the best couple (batch_size, lr) : The criterion used is the CrossEntropy loss function 
    - The range of batch sizes is : 4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200
    - The range of max learning rate was set w.r.t to the plot of the results of the Fastai method find_lr : [6e-2, 9e-2, 2e-1] 
    
    
- **N.B:** Some of the batch_sizes require more than one epoch to get the best score (numbers shown in the paper). However, to the complexity of the notebook, we'll run all the training with 1 epoch as our best model (fastest training time) converges in 1 epoch. 

In [6]:
batch_sizes = [4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200]
lrs = [6e-2, 9e-2, 2e-1] 

In [15]:
# Define batch databunch 
benchmark_results = [] 

for batch_size in batch_sizes: 
    train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
    validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
    test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
    databunch = BatchDataBunch.create(train, validation, device='cuda', bs=batch_size)   
    
    for learning_rate in lrs: 
        print('Launch training for the couple: lr: %s, bs: %s ' %(learning_rate, batch_size))
        #define the model 
        emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
                  (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]    

        model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])

        learn =  Learner(databunch, model, metrics=None)
        
        learn.loss_func = torch.nn.CrossEntropyLoss()
        
        # launch training 
        start = time()
        learn.fit_one_cycle(1, learning_rate)
        t_final = time() - start 
        benchmark_results.append([batch_size, learning_rate, learn.recorder.val_losses[0], 1, t_final] ) 
        del databunch 
        del learn 

Launch training for the couple: lr: 0.06, bs: 4096 


epoch,train_loss,valid_loss,time
0,0.113909,0.115567,02:03


Launch training for the couple: lr: 0.09, bs: 4096 


epoch,train_loss,valid_loss,time
0,0.114736,0.115734,02:02


Launch training for the couple: lr: 0.2, bs: 4096 


epoch,train_loss,valid_loss,time
0,0.115347,0.116804,02:04


Launch training for the couple: lr: 0.06, bs: 8192 


epoch,train_loss,valid_loss,time
0,0.111473,0.117125,01:24


Launch training for the couple: lr: 0.09, bs: 8192 


epoch,train_loss,valid_loss,time
0,0.111246,0.116392,01:21


Launch training for the couple: lr: 0.2, bs: 8192 


epoch,train_loss,valid_loss,time
0,0.113675,0.120475,01:24


Launch training for the couple: lr: 0.06, bs: 20480 


epoch,train_loss,valid_loss,time
0,0.110198,0.118517,01:08


Launch training for the couple: lr: 0.09, bs: 20480 


epoch,train_loss,valid_loss,time
0,0.1102,0.116293,01:07


Launch training for the couple: lr: 0.2, bs: 20480 


epoch,train_loss,valid_loss,time
0,0.111194,0.117983,01:07


Launch training for the couple: lr: 0.06, bs: 40960 


epoch,train_loss,valid_loss,time
0,0.110399,0.115086,01:04


Launch training for the couple: lr: 0.09, bs: 40960 


epoch,train_loss,valid_loss,time
0,0.110559,0.115627,01:04


Launch training for the couple: lr: 0.2, bs: 40960 


epoch,train_loss,valid_loss,time
0,0.110687,0.115821,01:04


Launch training for the couple: lr: 0.06, bs: 81920 


epoch,train_loss,valid_loss,time
0,0.113079,0.114969,01:02


Launch training for the couple: lr: 0.09, bs: 81920 


epoch,train_loss,valid_loss,time
0,0.112071,0.115964,01:01


Launch training for the couple: lr: 0.2, bs: 81920 


epoch,train_loss,valid_loss,time
0,0.112602,0.115869,01:02


Launch training for the couple: lr: 0.06, bs: 204800 


epoch,train_loss,valid_loss,time
0,0.127111,0.118641,00:18


Launch training for the couple: lr: 0.09, bs: 204800 


epoch,train_loss,valid_loss,time
0,0.125603,0.117659,00:17


Launch training for the couple: lr: 0.2, bs: 204800 


epoch,train_loss,valid_loss,time
0,0.124287,0.118379,00:18


Launch training for the couple: lr: 0.06, bs: 409600 


epoch,train_loss,valid_loss,time
0,0.153374,0.12298,00:17


Launch training for the couple: lr: 0.09, bs: 409600 


epoch,train_loss,valid_loss,time
0,0.167194,0.125484,00:17


Launch training for the couple: lr: 0.2, bs: 409600 


epoch,train_loss,valid_loss,time
0,0.15642,0.125905,00:17


Launch training for the couple: lr: 0.06, bs: 819200 


epoch,train_loss,valid_loss,time
0,0.206699,0.133658,00:18


Launch training for the couple: lr: 0.09, bs: 819200 


epoch,train_loss,valid_loss,time
0,0.206986,0.135217,00:17


Launch training for the couple: lr: 0.2, bs: 819200 


epoch,train_loss,valid_loss,time
0,0.188497,0.133362,00:17


In [19]:
results = pd.DataFrame(benchmark_results)
results.columns = ['batch size', 'learning rate', 'validation loss', 'N epochs', 'training time']

In [None]:
results.sort_values(by=['validation loss', 'training time'], ascending=True).head(10)

- **Conclusion** The best trade-off between training time and validation loss is reached for the couple **(204800, 0.09)**

<h3> Compute average validation scores of the best model </h3>

In [7]:
ds = pd.read_parquet("./parquet_data/data_pair_all/valid.parquet")

In [None]:
## Mean / std of scores : 5 runs 
aucs = []
mrrs = []
times = []
best_bs = 204800*50
best_lr = 9e-2

emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
          (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]    

train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
databunch = BatchDataBunch.create(train, validation, device='cuda', bs=best_bs)   

for i in range(5): 
    # define the model
    model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])
    model = model.cuda()
    learn =  Learner(databunch, model, metrics=None)
    learn.loss_func = torch.nn.CrossEntropyLoss()
    
    # train the model 
    start = time()
    learn.fit_one_cycle(1, best_lr)
    tf = time()-start
    
    # get validation metrics 
    yp,y_valid = learn.get_preds(databunch)
    cv = ds[['row_id','reference','item_id', 'target']].copy()
    cv['prob'] = yp.numpy()[:,1]
    cv = cv.sort_values(by=['row_id','prob'],ascending=False)
    auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])
    mean_reciprocal_rank = get_mean_reciprocal_rank(cv)
    
    aucs.append(auc)
    mrrs.append(mean_reciprocal_rank)
    times.append(tf)
    

In [None]:
print("the mrr of the best mdodel is: %s +/- %s" %(np.mean(mrrs), np.std(mrrs)))

print("the auc of the best mdodel is: %s +/- %s" %(np.mean(aucs), np.std(aucs)))

print("the best mdodel's training time is %s +/- %s" %(np.mean(times), np.std(times)))

<h2> 2. <a id=second_model> Second model: Tabular Data in-memory  </a> </h2>

In [5]:
to_cpu = False 

<h3> Processing: Definition of train, validation and test tensors </h3>

In [6]:
# %%snakeviz 
# uncomment the line above to generate the snakeviz profile of preprocessing 

data_path = './parquet_data/data_pair_all'
TEST = 'test'
VALID = 'valid'
TRAIN = 'train'

start0 = time()
data = {}

############################
#                          #
# Fit processing train set #
#                          #
############################
start = time()
path = os.path.join(data_path,TRAIN+'.parquet' )
ds = cudf.read_parquet(path)
print(f"read {TRAIN} used {time()-start:.2f} seconds.")

# get variable names 
start = time()
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in ds.columns if i.startswith('is_')]
cont_names = ['price','candidate_order','item_count'] + [i for i in ds.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]
print(f"get variables names used {time()-start:.2f} seconds.")

# init the processing class 
proc = PreprocessDF(cat_names=cat_names, cont_names=cont_names, label_name='target', to_cpu=to_cpu)

# Fit training 
start = time()
x, y = proc.preproc_dataframe(ds, mode=TRAIN)
print(f"processing {TRAIN} used {time()-start:.2f} seconds.")
del ds
data[TRAIN] = (x, y)

############################
#                          #
# Transform test and valid #
#                          #
############################  
ds_name = [TEST, VALID]
for name in ds_name:
    path = os.path.join(data_path,name+'.parquet' )
    ds = cudf.read_parquet(path)

    print(f"read {name} used {time()-start:.2f} seconds.")
    start = time()
    x, y = proc.preproc_dataframe(ds, mode=name)
    print(f"processing {name} used {time()-start:.2f} seconds.")
    data[name] = (x, y)
    del ds

print(f"The whole processing used {time()-start0:.2f} seconds.")

read train used 7.95 seconds.
get variables names used 0.00 seconds.
processing train used 10.31 seconds.
read test used 10.84 seconds.
processing test used 9.87 seconds.
read valid used 10.77 seconds.
processing valid used 11.31 seconds.
The whole processing used 40.87 seconds.


<h3> Benchmark : Get the best (batch size, learning rate)</h3> 

- Fine tune the best couple (batch_size, lr) : The criterion used is the CrossEntropy loss function 
    - The range of batch sizes is : 4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200
    - The range of max learning rate was set w.r.t to the plot of the results of the Fastai method find_lr : [6e-2, 9e-2, 2e-1] 
    
    
- **N.B:** Some of the batch_sizes require more than one epoch to get the best score (numbers shown in the paper). However, to the complexity of the notebook, we'll run all the training with 1 epoch as our best model (fastest training time) converges in 1 epoch. 

In [12]:
batch_sizes = [4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200]
lrs = [6e-2, 9e-2, 2e-1] 

In [None]:
# Define batch databunch 
from fastai.tabular import TabularModel
from time import time 
benchmark_results = [] 

for batch_size in batch_sizes: 
    train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
    validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
    test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
    databunch = BatchDataBunch.create(train, validation, device='cuda', bs=batch_size)   
    del train 
    del validation 
    del test
    for learning_rate in lrs: 
        #define the model 
        emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
                  (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]    

        model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])

        learn =  Learner(databunch, model, metrics=None)
        
        learn.loss_func = torch.nn.CrossEntropyLoss()
        
        # launch training 
        start = time()
        learn.fit_one_cycle(1, learning_rate)
        t_final = time() - start 
        benchmark_results.append([batch_size, learning_rate, learn.recorder.val_losses[0], 1, t_final] ) 
        print('training for the couple: lr: %s, bs: %s used %.2f' %(learning_rate, batch_size, t_final))

        del learn 
        del model
        torch.cuda.empty_cache()   
        
    del databunch
    torch.cuda.empty_cache()   

In [17]:
results = pd.DataFrame(benchmark_results)
results.columns = ['batch size', 'learning rate', 'validation loss', 'N epochs', 'training time']

In [19]:
results.sort_values(by=['training time', 'validation loss'], ascending=True).head(10)

Unnamed: 0,batch size,learning rate,validation loss,N epochs,training time
20,409600,0.2,0.12463,1,15.040216
19,409600,0.09,0.123395,1,15.07277
23,819200,0.2,0.132649,1,15.116976
18,409600,0.06,0.124982,1,15.312722
22,819200,0.09,0.139202,1,15.316487
21,819200,0.06,0.131654,1,15.42894
17,204800,0.2,0.12075,1,15.503947
16,204800,0.09,0.117699,1,15.556973
15,204800,0.06,0.117534,1,16.00941
13,81920,0.09,0.115615,1,57.661088


**Conclusion** The best trade-off between training time and validation loss is reached for the couple **(204800, 0.09)**

<h3> Compute average validation scores of the best model </h3>

In [7]:
from helpers import get_mean_reciprocal_rank, roc_auc_score
ds = pd.read_parquet("./parquet_data/data_pair_all/valid.parquet")

In [8]:
## Mean / std of scores : 5 runs 
aucs = []
mrrs = []
times = []
best_bs = 4096*50
best_lr = 9e-2

# create databunch 
train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
databunch = BatchDataBunch.create(train, validation, device='cuda', bs=best_bs)   

del train 
del validation 
del test

emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
          (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]  

# run the model 5 times to get 5 scores 

for i in range(5): 
    # define the model
    model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])
    model = model.cuda()
    learn =  Learner(databunch, model, metrics=None)
    learn.loss_func = torch.nn.CrossEntropyLoss()
    
    # train the model 
    start = time()
    learn.fit_one_cycle(1, best_lr)
    tf = time()-start
    
    # get validation metrics 
    yp,y_valid = learn.get_preds(databunch)
    cv = ds[['row_id','reference','item_id', 'target']].copy()
    cv['prob'] = yp.numpy()[:,1]
    cv = cv.sort_values(by=['row_id','prob'],ascending=False)
    auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])
    mean_reciprocal_rank = get_mean_reciprocal_rank(cv)
    
    aucs.append(auc)
    mrrs.append(mean_reciprocal_rank)
    times.append(tf)
    
    del model 
    del learn
    torch.cuda.empty_cache()
    

In [9]:
print("the mrr of the best mdodel is: %s +/- %s" %(np.mean(mrrs), np.std(mrrs)))

print("the auc of the best mdodel is: %s +/- %s" %(np.mean(aucs), np.std(aucs)))

print("the best mdodel's training time is %s +/- %s" %(np.mean(times), np.std(times)))

the mrr of the best mdodel is: 0.6122096267815588 +/- 0.0007342691916551865
the auc of the best mdodel is: 0.8786870469761917 +/- 0.002244973682607096
the best mdodel's training time is 16.11651215553284 +/- 0.1822731968083248


<h1> <center>  <a id=fastai_workflow> Test of Tabular Learner with Fastai workflow </a></center> </h1>

- As the processing time is taking more than 6minutes and our purpose is to benchmark the best model using our proposed workflow against the Fastai workflow. We'll directly compute the scores of the Tabular model with batch size of 204800 and learning rate 0.09 

In [3]:
batch_size = 4096*50

In [4]:
from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from batchloader import *

In [5]:
%%time
import pandas as pd
data_pair = pd.read_pickle('/rapids/notebooks/jperez/recsys/cache/data_pair_all.pkl')

CPU times: user 11.2 s, sys: 16.9 s, total: 28 s
Wall time: 28 s


<h3> Create pre-processed databunch </h3> 

In [6]:
%%time
# split to train / test 
train = data_pair[data_pair.clickout_missing==0]
test = data_pair[data_pair.clickout_missing>0]
print(train.shape,test.shape)

# get categorical and continious variables names 
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in train.columns if i.startswith('is_')]
cont_names = ['price','candidate_order','item_count'] + [i for i in train.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]

# define validation rows
train['is_va'] = train.row_id%5 == 0
del data_pair

(42756036, 46) (5762533, 46)
CPU times: user 8.38 s, sys: 14.2 s, total: 22.6 s
Wall time: 22.5 s


In [7]:
%%time
procs = [FillMissing, Categorify, Normalize]

test_list = TabularList.from_df(test, path='./', cat_names=cat_names, cont_names=cont_names)
data = (TabularList.from_df(train, path='./', cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_from_df('is_va')
                           .label_from_df(cols='target')
                           .add_test(test_list)
                           .databunch(num_workers=8,bs=batch_size, device='cuda'))

CPU times: user 3min 30s, sys: 2min 45s, total: 6min 16s
Wall time: 6min 2s


<h3> Compute average validation scores of the best model  </h3> 

In [None]:
## Mean / std of scores : 5 runs 
aucs = []
mrrs = []
times = []
best_bs = 4096*50
best_lr = 9e-2

emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
          (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]

for i in range(5): 
    # define the model
    model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])
    model = model.cuda()
    learn =  Learner(data, model, metrics=None)
    learn.loss_func = torch.nn.CrossEntropyLoss()
    
    # train the model 
    start = time()
    learn.fit_one_cycle(1, best_lr)
    tf = time()-start
    
    # get validation metrics 
    yp,y_valid = learn.get_preds(data)
    cv = train.loc[train['is_va']>0,['row_id','reference','item_id', 'target']].copy(0)
    cv['prob'] = yp.numpy()[:,1]
    cv = cv.sort_values(by=['row_id','prob'],ascending=False)
    auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])
    mean_reciprocal_rank = get_mean_reciprocal_rank(cv)
    
    aucs.append(auc)
    mrrs.append(mean_reciprocal_rank)
    times.append(tf)
    

epoch,train_loss,valid_loss,time


In [None]:
print("the mrr of the best mdodel is: %s +/- %s" %(np.mean(mrrs), np.std(mrrs)))

print("the auc of the best mdodel is: %s +/- %s" %(np.mean(aucs), np.std(aucs)))

print("the best mdodel's training time is %s +/- %s" %(np.mean(times), np.std(times)))