# Evaluate Models

In [1]:
import leaderbot as lb
import numpy as np
import pickle

In [2]:
# Load the dataset, then split it into a training part and a test part.
# test_ratio=0.1 and seed=20 are passed to the splitter so the split can be
# repeated (presumably 10% of the data is held out -- confirm against the
# leaderbot documentation).
data = lb.data.load()
training_data, test_data = lb.data.split(
    data,
    test_ratio=0.1,
    seed=20,
)

In [3]:
from copy import deepcopy

# Tie-free variants of both datasets, intended for the Bradley-Terry models.
# Deep copies are used so that the original datasets keep their tie outcomes.
# Zeroing the last column of 'Y' is what removes the ties (presumably that
# column holds the tie counts -- confirm against the data layout).

# Training data without ties
training_data_no_tie = deepcopy(training_data)
training_data_no_tie['Y'][:, -1] = 0

# Test data without ties
test_data_no_tie = deepcopy(test_data)
test_data_no_tie['Y'][:, -1] = 0

In [6]:
# Small set of models (one per family, no tie factors)
n_tie = 0

models = [
    # Bradley-Terry is given the tie-free copy of the training data that this
    # notebook prepares for it, consistent with the larger model list.
    lb.models.BradleyTerry(training_data_no_tie),
    lb.models.RaoKupper(training_data, n_tie_factors=n_tie),
    lb.models.Davidson(training_data, n_tie_factors=n_tie)
]

In [13]:
# Small set of models with covariance factors and a tie factor
n_tie = 1
n_cov = 2

models = [
    # Bradley-Terry is given the tie-free copy of the training data that this
    # notebook prepares for it, consistent with the larger model list.
    lb.models.BradleyTerryFactor(training_data_no_tie, n_cov_factors=n_cov),
    lb.models.RaoKupperFactor(training_data, n_cov_factors=n_cov, n_tie_factors=n_tie),
    lb.models.DavidsonFactor(training_data, n_cov_factors=n_cov, n_tie_factors=n_tie)
]

In [5]:
# Larger set of models: the Bradley-Terry, Rao-Kupper, and Davidson families,
# each in its plain, Scaled, ScaledR, and Factor variant. The ScaledRIJ
# variants are kept as commented-out entries and are not built.
n_tie = 0
n_cov = 0

models = [
    # Bradley-Terry family: built on the tie-free copy of the training data
    lb.models.BradleyTerry(training_data_no_tie),
    lb.models.BradleyTerryScaled(training_data_no_tie),
    lb.models.BradleyTerryScaledR(training_data_no_tie),
    # lb.models.BradleyTerryScaledRIJ(training_data, n_tie_factors=n_tie),
    lb.models.BradleyTerryFactor(training_data_no_tie, n_cov_factors=n_cov),
] + [
    # Rao-Kupper family: built on the full training data (ties included)
    lb.models.RaoKupper(training_data, n_tie_factors=n_tie),
    lb.models.RaoKupperScaled(training_data, n_tie_factors=n_tie),
    lb.models.RaoKupperScaledR(training_data, n_tie_factors=n_tie),
    # lb.models.RaoKupperScaledRIJ(training_data, n_tie_factors=n_tie),
    lb.models.RaoKupperFactor(
        training_data, n_cov_factors=n_cov, n_tie_factors=n_tie),
] + [
    # Davidson family: built on the full training data (ties included)
    lb.models.Davidson(training_data, n_tie_factors=n_tie),
    lb.models.DavidsonScaled(training_data, n_tie_factors=n_tie),
    lb.models.DavidsonScaledR(training_data, n_tie_factors=n_tie),
    # lb.models.DavidsonScaledRIJ(training_data, n_tie_factors=n_tie),
    lb.models.DavidsonFactor(
        training_data, n_cov_factors=n_cov, n_tie_factors=n_tie),
]

In [6]:
# Train all models. ScaledRIJ variants are optimized with L-BFGS-B; every
# other model is optimized with BFGS.
for model in models:

    name = model.__class__.__name__
    method = 'L-BFGS-B' if name.endswith('ScaledRIJ') else 'BFGS'

    print(f'training {name:<21} ... ', end='')
    # Hand the selected optimizer to the trainer. It used to be computed and
    # then dropped, so every model silently ran with the trainer's default.
    model.train(method=method)
    print('done.')

training BradleyTerry          ... done.
training BradleyTerryScaled    ... done.
training BradleyTerryScaledR   ... done.
training BradleyTerryFactor    ... done.
training RaoKupper             ... done.
training RaoKupperScaled       ... done.
training RaoKupperScaledR      ... done.
training RaoKupperFactor       ... done.
training Davidson              ... done.
training DavidsonScaled        ... done.
training DavidsonScaledR       ... done.
training DavidsonFactor        ... done.


In [2]:
# Load pre-trained models and their benchmark results from file instead of
# training them in this notebook.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
filename = '../benchmark/models_train_full.pkl'
# filename = '../benchmark/models_train_split.pkl'
with open(filename, 'rb') as f:
    results = pickle.load(f)

# Models and the test datasets stored alongside them
models, test_data, test_data_no_tie = (
    results[key] for key in ('models', 'test_data', 'test_data_no_tie'))

# Timings as arrays, one entry per model
proc_time, wall_time = (
    np.array(results[key]) for key in ('proc_time', 'wall_time'))

# Report the device and the number of processes recorded in the file
print(results['device'], results['num_proc'], sep='\n')

x86_64
128


In [3]:
# Print the wall time of each model, one per line. Iterate over wall_time
# itself rather than indexing it with the length of proc_time, so the loop
# cannot run out of bounds (or stop short) if the two arrays ever differ in
# length.
for wall in wall_time:
    print(f'{wall:>7.1f} ')

    2.3 
    4.5 
   61.0 
    0.0 
    2.7 
   27.4 
    5.8 
    6.4 
  195.0 
  330.2 
   10.5 
    7.8 
  285.3 
  652.2 
   36.0 
   37.9 
  340.6 
  762.6 
    5.9 
    6.5 
   91.5 
  606.5 
    8.0 
   10.5 
  278.7 
  648.4 
   35.4 
   35.3 
  349.1 
  803.8 


In [3]:
# Model selection: compare the models and print the comparison table
# (report=True). train=False presumably means the already-trained models are
# used as they are -- confirm against the leaderbot documentation.
metrics = lb.evaluate.model_selection(
    models,
    train=False,
    tie=False,
    report=True,
)

+----+-----------------------+---------+--------+--------------------------------+---------+---------+
|    |                       |         |        |               CEL              |         |         |
| id | model                 | # param |    NLL |    all     win    loss     tie |     AIC |     BIC |
+----+-----------------------+---------+--------+--------------------------------+---------+---------+
|  1 | BradleyTerry          |     129 | 0.6554 | 0.6553  0.3177  0.3376     inf |   256.7 |  1049.7 |
|  2 | BradleyTerryScaled    |     258 | 0.6552 | 0.6551  0.3180  0.3371     inf |   514.7 |  2100.8 |
|  3 | BradleyTerryFactor    |     645 | 0.6549 | 0.6548  0.3178  0.3370     inf |  1288.7 |  5253.9 |
|  4 | BradleyTerry          |     129 | 0.6351 | 0.6351  0.3056  0.3295     inf |   256.7 |  1049.8 |
|  5 | BradleyTerryScaled    |     258 | 0.6346 | 0.6346  0.3059  0.3287     inf |   514.7 |  2100.8 |
|  6 | BradleyTerryFactor    |     645 | 0.6342 | 0.6342  0.3057  0.3285 

In [4]:
# Goodness of fit: how well each model fits the data it was trained on,
# reported with the RMSE metric. The table is printed because report=True.
metrics = lb.evaluate.goodness_of_fit(
    models,
    train=False,
    tie=False,
    metric='RMSE',
    density=False,
    report=True,
)

+----+-----------------------+----------------------------+------+------+
|    |                       |            RMSE            |      |      |
| id | model                 |   win   loss    tie    all | KLD% | JSD% |
+----+-----------------------+----------------------------+------+------+
|  1 | BradleyTerry          |  29.7   29.7  -----   29.7 | 1.49 | 0.44 |
|  2 | BradleyTerryScaled    |  26.2   26.2  -----   26.2 | 1.42 | 0.42 |
|  3 | BradleyTerryFactor    |  17.4   17.4  -----   17.4 | 1.30 | 0.39 |
|  4 | BradleyTerry          |  35.1   35.1  -----   35.1 | 1.82 | 0.52 |
|  5 | BradleyTerryScaled    |  31.5   31.5  -----   31.5 | 1.71 | 0.49 |
|  6 | BradleyTerryFactor    |  17.3   17.3  -----   17.3 | 1.58 | 0.46 |
|  7 | RaoKupper             |  48.2   69.9  103.5   77.3 | 3.32 | 0.92 |
|  8 | RaoKupper             |  46.4   67.8   99.2   74.3 | 3.45 | 0.91 |
|  9 | RaoKupper             |  34.1   34.2   23.1   30.9 | 2.63 | 0.73 |
| 10 | RaoKupper             |  34.3  

In [9]:
# Generalization: how well each model predicts the held-out test data,
# reported with the RMSE metric. The table is printed because report=True.
metrics = lb.evaluate.generalization(
    models,
    test_data=test_data,
    train=False,
    tie=False,
    metric='RMSE',
    density=False,
    report=True,
)

+----+-----------------------+----------------------------+------+------+
|    |                       |            RMSE            |      |      |
| id | model                 |   win   loss    tie    all | KLD% | JSD% |
+----+-----------------------+----------------------------+------+------+
|  1 | BradleyTerry          |  29.7   29.7  -----   29.7 | 1.49 | 0.44 |
|  2 | BradleyTerryScaled    |  26.2   26.2  -----   26.2 | 1.42 | 0.42 |
|  3 | BradleyTerryFactor    |  17.4   17.4  -----   17.4 | 1.30 | 0.39 |
|  4 | BradleyTerry          |  62.0   62.0  -----   62.0 | 1.72 | 0.49 |
|  5 | BradleyTerryScaled    |  64.5   64.5  -----   64.5 | 1.65 | 0.47 |
|  6 | BradleyTerryFactor    |  74.0   74.0  -----   74.0 | 1.55 | 0.44 |
|  7 | RaoKupper             |  48.2   69.9  103.5   77.3 | 3.32 | 0.92 |
|  8 | RaoKupper             |  46.4   67.8   99.2   74.3 | 3.45 | 0.91 |
|  9 | RaoKupper             |  34.1   34.2   23.1   30.9 | 2.63 | 0.73 |
| 10 | RaoKupper             |  34.3  