In [1]:

import irt_mt_dev.utils as utils
import numpy as np
import os
import subset2evaluate.evaluate
import subset2evaluate.select_subset
import tqdm

# Switch the working directory to the project checkout (presumably so relative
# paths used by the project code resolve — confirm). The location can be
# overridden with the IRT_MT_DEV_ROOT environment variable on other machines;
# without it the original hard-coded path is used.
os.chdir(os.environ.get("IRT_MT_DEV_ROOT", "/home/vilda/irt-mt-dev"))

def benchmark_method(repetitions=10, kwargs_dict=None):
    """Benchmark one subset-selection configuration and print averaged scores.

    Loads the data with ``utils.load_data_wmt_all(normalize=True)`` and keeps the
    first 9 entries. For each entry, ``repetitions`` times, a subset is chosen
    with ``subset2evaluate.select_subset.run_select_subset`` and scored with
    ``subset2evaluate.evaluate.run_evaluate_topk`` (``metric="human"``,
    ``retry_on_error=True``). All collected values are averaged with
    ``np.average`` and printed on one line as ``ACC: ... | CLU: ...``.

    Args:
        repetitions: Number of select-and-evaluate runs per dataset.
        kwargs_dict: Keyword arguments forwarded to ``run_select_subset``
            (e.g. ``{"method": "random"}``). ``None`` forwards no extra
            arguments.

    Returns:
        None. The summary is only printed.
    """
    # Use a None sentinel instead of a mutable `{}` default, so no dict object
    # is shared between calls.
    select_kwargs = {} if kwargs_dict is None else kwargs_dict

    data_old_all = list(utils.load_data_wmt_all(normalize=True).values())[:9]
    points_y_acc = []
    points_y_clu = []

    for data_old in data_old_all:
        # run multiple times to smooth variance
        for _ in range(repetitions):
            data_new = subset2evaluate.select_subset.run_select_subset(data_old, **select_kwargs)
            # Only the second element of the first returned pair ("CLU") and
            # the second returned value ("ACC") are kept.
            (_, clu_new), acc_new = subset2evaluate.evaluate.run_evaluate_topk(
                data_old,
                data_new,
                metric="human",
                retry_on_error=True,
            )
            points_y_acc.append(acc_new)
            points_y_clu.append(clu_new)

        # print(f"- ACC: {np.average(acc_new):.2%} | CLU: {np.average(clu_new):.2f}")
    print(f"ACC: {np.average(points_y_acc):.2%} | CLU: {np.average(points_y_clu):.2f}")

In [2]:
# Baseline configuration: the "random" selection method, 10 repetitions per dataset.
print("Random")
benchmark_method(
    repetitions=10,
    kwargs_dict=dict(method="random"),
)

# Recorded result:
# ACC: 91.65% | CLU: 2.42

Random
ACC: 91.65% | CLU: 2.42


In [3]:
# Same benchmark for the "avg" and "var" selection methods on MetricX-23,
# one repetition each.
for label, method in (("MetricX-23 avg", "avg"), ("MetricX-23 var", "var")):
    print(label)
    benchmark_method(repetitions=1, kwargs_dict={"method": method, "metric": "MetricX-23"})

# Recorded results:
# MetricX-23 avg
# ACC: 92.41% | CLU: 3.32
# MetricX-23 var
# ACC: 92.59% | CLU: 3.43

MetricX-23 avg
ACC: 92.41% | CLU: 3.32
MetricX-23 var
ACC: 92.59% | CLU: 3.43


In [4]:
# "irt_fic" selection on MetricX-23 with the "scalar" model, 3 repetitions per dataset.
print("IRT Fisher Information Content")
benchmark_method(
    repetitions=3,
    kwargs_dict=dict(method="irt_fic", metric="MetricX-23", model="scalar"),
)

# Recorded result:
# ACC: 91.48% | CLU: 2.86

IRT Fisher Information Content
Best validation step was: 0
Best validation step was: 2
Best validation step was: 3
Best validation step was: 0
Best validation step was: 3
Best validation step was: 0
Best validation step was: 0
Best validation step was: 26
Best validation step was: 14
Best validation step was: 7
Best validation step was: 1
Best validation step was: 0
Best validation step was: 0
Best validation step was: 1
Best validation step was: 1
Best validation step was: 22
Best validation step was: 0
Best validation step was: 18
Best validation step was: 1
Best validation step was: 1
Best validation step was: 4
Best validation step was: 1
Best validation step was: 1
Best validation step was: 0
Best validation step was: 0
Best validation step was: 2
Best validation step was: 1
ACC: 91.48% | CLU: 2.86


In [5]:
print("PyIRT Fisher Information Content")

# "pyirt_fic" selection on MetricX-23 with the "4pl" model type, 3 repetitions per dataset.
# NOTE(review): "hiearchical" looks like a misspelling of "hierarchical"; it is
# kept verbatim because this run completed with it — confirm what the backend
# expects before changing it.
benchmark_method(
    repetitions=3,
    kwargs_dict=dict(
        method="pyirt_fic",
        metric="MetricX-23",
        deterministic=True,
        epochs=1000,
        model_type="4pl",
        dropout=0.5,
        priors="hiearchical",
    ),
)

# Recorded result:
# ACC: 93.13% | CLU: 3.04

PyIRT Fisher Information Content


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

ACC: 93.13% | CLU: 3.04


In [6]:
print("PyIRT Fisher Information Content with Scoring Model")

# Same configuration as the plain "4pl" run, but with model type "4pl_score".
# NOTE(review): "hiearchical" looks like a misspelling of "hierarchical"; it is
# kept verbatim because this run completed with it — confirm what the backend
# expects before changing it.
benchmark_method(
    repetitions=3,
    kwargs_dict=dict(
        method="pyirt_fic",
        metric="MetricX-23",
        deterministic=True,
        epochs=1000,
        model_type="4pl_score",
        dropout=0.5,
        priors="hiearchical",
    ),
)

# Recorded results (the second line matches the output shown for this cell;
# the first is presumably from an earlier run):
# ACC: 92.24% | CLU: 3.34
# ACC: 91.92% | CLU: 3.35

PyIRT Fisher Information Content with Scoring Model


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Expected parameter concentration (Tensor of shape ()) of distribution Gamma(concentration: nan, rate: nan) to satisfy the constraint GreaterThan(lower_bound=0.0), but found invalid values:
nan
 Trace Shapes:    
  Param Sites:    
      loc_mu_b    
    scale_mu_b    
  loc_mu_gamma    
scale_mu_gamma    
  loc_mu_theta    
scale_mu_theta    
       alpha_b    
        beta_b    
   alpha_gamma    
    beta_gamma    
     alpha_obs    
      beta_obs    
   alpha_theta    
    beta_theta    
   loc_ability  12
 scale_ability  12
      loc_diff 549
    scale_diff 549
      loc_disc 549
    scale_disc 549
 Sample Sites:    
     mu_b dist   |
         value   |
      u_b dist   |
         value   |
 mu_gamma dist   |
         value   |
  u_gamma dist   |
         value   |
    u_obs dist   |
         value   |
 mu_theta dist   |
         value   |


Output()

Output()

Expected parameter concentration (Tensor of shape ()) of distribution Gamma(concentration: nan, rate: nan) to satisfy the constraint GreaterThan(lower_bound=0.0), but found invalid values:
nan
 Trace Shapes:    
  Param Sites:    
      loc_mu_b    
    scale_mu_b    
  loc_mu_gamma    
scale_mu_gamma    
  loc_mu_theta    
scale_mu_theta    
       alpha_b    
        beta_b    
   alpha_gamma    
    beta_gamma    
     alpha_obs    
      beta_obs    
   alpha_theta    
    beta_theta    
   loc_ability  12
 scale_ability  12
      loc_diff 549
    scale_diff 549
      loc_disc 549
    scale_disc 549
 Sample Sites:    
     mu_b dist   |
         value   |
      u_b dist   |
         value   |
 mu_gamma dist   |
         value   |
  u_gamma dist   |
         value   |
    u_obs dist   |
         value   |
 mu_theta dist   |
         value   |


Output()

Output()

Output()

Output()

Output()

Output()

Expected parameter concentration (Tensor of shape ()) of distribution Gamma(concentration: nan, rate: nan) to satisfy the constraint GreaterThan(lower_bound=0.0), but found invalid values:
nan
 Trace Shapes:     
  Param Sites:     
      loc_mu_b     
    scale_mu_b     
  loc_mu_gamma     
scale_mu_gamma     
  loc_mu_theta     
scale_mu_theta     
       alpha_b     
        beta_b     
   alpha_gamma     
    beta_gamma     
     alpha_obs     
      beta_obs     
   alpha_theta     
    beta_theta     
   loc_ability   15
 scale_ability   15
      loc_diff 1098
    scale_diff 1098
      loc_disc 1098
    scale_disc 1098
 Sample Sites:     
     mu_b dist    |
         value    |
      u_b dist    |
         value    |
 mu_gamma dist    |
         value    |


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

ACC: 91.92% | CLU: 3.35
