In [1]:

import irt_mt_dev.utils as utils
import numpy as np
import os
import subset2evaluate.evaluate
import subset2evaluate.select_subset
import tqdm

# Change the working directory to the project checkout. The location can be
# overridden through the IRT_MT_DEV_ROOT environment variable so the notebook
# is not tied to one machine; when the variable is unset, the original
# hard-coded path is used.
os.chdir(os.environ.get("IRT_MT_DEV_ROOT", "/home/vilda/irt-mt-dev"))


def benchmark_method(repetitions=10, kwargs_dict=None):
	"""Benchmark one subset-selection configuration and print its scores.

	Loads the WMT23 en-cs data (with `normalize=True`) and then, `repetitions`
	times, selects a subset with
	`subset2evaluate.select_subset.run_select_subset` and passes the full data
	together with that subset to
	`subset2evaluate.evaluate.run_evaluate_topk(..., metric="human")`.
	One "- ACC | CLU" line is printed per repetition, followed by a final
	line averaged over all repetitions.

	Args:
		repetitions: How many select-and-evaluate rounds to run.
		kwargs_dict: Keyword arguments forwarded to `run_select_subset`
			(e.g. `{"method": "random"}`). `None` forwards no extra arguments.

	Returns:
		None. The results are only printed.
	"""
	# A `{}` default would be a single dict object shared by every call; use a
	# `None` sentinel and create a fresh dict per call instead.
	if kwargs_dict is None:
		kwargs_dict = {}

	data_old = utils.load_data_wmt("wmt23", "en-cs", normalize=True)
	points_y_acc = []
	points_y_clu = []

	# run multiple times to smooth variance
	for _ in range(repetitions):
		data_new = subset2evaluate.select_subset.run_select_subset(data_old, **kwargs_dict)
		# the first element of the leading pair is not used here
		(_, clu_new), acc_new = subset2evaluate.evaluate.run_evaluate_topk(
			data_old,
			data_new,
			metric="human",
		)
		points_y_acc.append(acc_new)
		points_y_clu.append(clu_new)

		# per-repetition averages
		print(f"- ACC: {np.average(acc_new):.2%} | CLU: {np.average(clu_new):.2f}")
	# averages over all repetitions
	print(f"ACC: {np.average(points_y_acc):.2%} | CLU: {np.average(points_y_clu):.2f}")

In [12]:
# Random subset selection, repeated 50 times.
print("Random")
benchmark_method(
	kwargs_dict={"method": "random"},
	repetitions=50,
)

Random
- ACC: 93.76% | CLU: 4.78
- ACC: 89.63% | CLU: 4.33
- ACC: 93.33% | CLU: 4.67
- ACC: 94.29% | CLU: 4.33
- ACC: 93.76% | CLU: 4.33
- ACC: 92.91% | CLU: 5.00
- ACC: 91.32% | CLU: 5.11
- ACC: 93.12% | CLU: 4.67
- ACC: 95.34% | CLU: 4.56
- ACC: 93.97% | CLU: 4.89
- ACC: 92.49% | CLU: 4.44
- ACC: 90.79% | CLU: 4.67
- ACC: 93.44% | CLU: 4.44
- ACC: 92.80% | CLU: 5.00
- ACC: 94.07% | CLU: 4.67
- ACC: 92.17% | CLU: 4.78
- ACC: 93.86% | CLU: 4.11
- ACC: 95.34% | CLU: 4.67
- ACC: 93.86% | CLU: 4.33
- ACC: 91.01% | CLU: 4.00
- ACC: 91.32% | CLU: 4.67
- ACC: 89.84% | CLU: 4.22
- ACC: 91.64% | CLU: 4.22
- ACC: 92.70% | CLU: 5.22
- ACC: 94.81% | CLU: 4.67
- ACC: 92.28% | CLU: 4.56
- ACC: 92.70% | CLU: 4.11
- ACC: 93.02% | CLU: 4.78
- ACC: 91.32% | CLU: 4.67
- ACC: 92.91% | CLU: 4.56
- ACC: 94.60% | CLU: 4.78
- ACC: 92.17% | CLU: 4.67
- ACC: 93.02% | CLU: 5.11
- ACC: 94.29% | CLU: 4.67
- ACC: 92.28% | CLU: 4.22
- ACC: 91.11% | CLU: 4.78
- ACC: 93.12% | CLU: 4.56
- ACC: 93.12% | CLU: 5.00
- ACC

In [13]:
# Selection driven by MetricX-23, one run each for the "avg" and "var" methods.
print("MetricX-23 avg")
benchmark_method(
	kwargs_dict={"method": "avg", "metric": "MetricX-23"},
	repetitions=1,
)

print("MetricX-23 var")
benchmark_method(
	kwargs_dict={"method": "var", "metric": "MetricX-23"},
	repetitions=1,
)

MetricX-23 avg
- ACC: 92.70% | CLU: 5.67
ACC: 92.70% | CLU: 5.67
MetricX-23 var
- ACC: 93.54% | CLU: 5.44
ACC: 93.54% | CLU: 5.44


In [2]:
# This run was done with early stopping.
print("IRT Fisher Information Content")
benchmark_method(
	kwargs_dict={"method": "irt_fic", "metric": "MetricX-23", "model": "scalar"},
	repetitions=5,
)

IRT Fisher Information Content
- ACC: 93.12% | CLU: 4.33
- ACC: 94.50% | CLU: 5.11
- ACC: 93.54% | CLU: 4.89
- ACC: 93.97% | CLU: 5.67
- ACC: 94.50% | CLU: 5.33
ACC: 93.93% | CLU: 5.07


In [4]:
# This run was done with our own implementation.
# Selection criterion: highest cluster count, measured on the metric.
print("IRT Fisher Information Content")
benchmark_method(
	kwargs_dict={"method": "irt_fic", "metric": "MetricX-23", "model": "scalar"},
	repetitions=15,
)

IRT Fisher Information Content
Best epoch was: 4
- ACC: 94.71% | CLU: 5.56
Best epoch was: 5
- ACC: 92.49% | CLU: 4.89
Best epoch was: 3
- ACC: 93.65% | CLU: 5.11
Best epoch was: 3
- ACC: 91.75% | CLU: 4.67
Best epoch was: 3
- ACC: 93.23% | CLU: 5.11
Best epoch was: 4
- ACC: 91.96% | CLU: 5.11
Best epoch was: 11
- ACC: 95.66% | CLU: 5.22
Best epoch was: 3
- ACC: 93.97% | CLU: 5.22
Best epoch was: 2
- ACC: 93.86% | CLU: 5.22
Best epoch was: 5
- ACC: 95.13% | CLU: 4.78
Best epoch was: 3
- ACC: 90.90% | CLU: 4.89
Best epoch was: 3
- ACC: 94.18% | CLU: 4.89
Best epoch was: 6
- ACC: 92.80% | CLU: 5.89
Best epoch was: 20
- ACC: 93.86% | CLU: 5.00
Best epoch was: 3
- ACC: 95.13% | CLU: 5.33
ACC: 93.55% | CLU: 5.13


In [2]:
# This run was done with our own implementation.
# Selection criterion: highest accuracy, measured on the metric.
print("IRT Fisher Information Content")
benchmark_method(
	kwargs_dict={"method": "irt_fic", "metric": "MetricX-23", "model": "scalar"},
	repetitions=10,
)

IRT Fisher Information Content
Best validation step was: 12
- ACC: 94.18% | CLU: 4.89
Best validation step was: 1
- ACC: 94.07% | CLU: 5.22
Best validation step was: 0
- ACC: 92.80% | CLU: 4.67
Best validation step was: 0
- ACC: 92.91% | CLU: 4.89
Best validation step was: 0
- ACC: 95.34% | CLU: 4.44
Best validation step was: 1
- ACC: 91.96% | CLU: 5.11
Best validation step was: 19
- ACC: 94.07% | CLU: 5.33
Best validation step was: 0
- ACC: 92.17% | CLU: 4.56
Best validation step was: 1
- ACC: 93.65% | CLU: 4.78
Best validation step was: 13
- ACC: 92.80% | CLU: 5.00
ACC: 93.40% | CLU: 4.89


In [2]:
# Fisher Information Content selection using the "pyirt_fic" method.
# NOTE(review): the recorded output below starts with
# "IRT Fisher Information Content", not the label printed here, so it appears
# to predate this label -- re-run the cell to refresh it.
print("PyIRT Fisher Information Content")
benchmark_method(
	kwargs_dict={"method": "pyirt_fic", "metric": "MetricX-23", "model": "scalar"},
	repetitions=3,
)

IRT Fisher Information Content
Median 0.94


Output()

torch.Size([16470]) torch.Size([16470])


- ACC: 94.81% | CLU: 5.22
Median 0.94


Output()

torch.Size([16470]) torch.Size([16470])


- ACC: 95.13% | CLU: 5.33
Median 0.94


Output()

torch.Size([16470]) torch.Size([16470])


- ACC: 95.24% | CLU: 5.11
ACC: 95.06% | CLU: 5.22
