## Win Accuracy Benchmarks
This is a reproducible benchmark that checks the win-prediction accuracy of [openskill.py](https://openskill.me/) against a dataset of Overwatch matches, using TrueSkill as a baseline.

Let's start by installing the necessary libraries:

In [1]:
%pip install rbo rich numpy tqdm pooch jsonlines trueskill scipy ipywidgets openskill

Collecting rbo
  Obtaining dependency information for rbo from https://files.pythonhosted.org/packages/f0/b3/aa1923e0ed19ecf190f7e8d9fe939f9020dd601b64e190b1f58b3692be8e/rbo-0.1.3-py3-none-any.whl.metadata
  Downloading rbo-0.1.3-py3-none-any.whl.metadata (4.6 kB)
Collecting jsonlines
  Obtaining dependency information for jsonlines from https://files.pythonhosted.org/packages/f8/62/d9ba6323b9202dd2fe166beab8a86d29465c41a0288cbe229fac60c1ab8d/jsonlines-4.0.0-py3-none-any.whl.metadata
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting openskill
  Obtaining dependency information for openskill from https://files.pythonhosted.org/packages/93/e3/1db14897e93d38dd9e7ec564d0428959a8ff5522e28dc825f6466a4532f8/openskill-6.0.0-py3-none-any.whl.metadata
  Downloading openskill-6.0.0-py3-none-any.whl.metadata (6.7 kB)
Downloading rbo-0.1.3-py3-none-any.whl (7.8 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Downloading openskill-6.0.0-py3-none-any.whl

## Import Libraries
We are going to import `jsonlines` to load the data and `rich` to display the results in a user-friendly format.
We are also going to import `ipywidgets` for native interactivity within the browser.

In [2]:
import gc
import itertools
import math
import time
from pathlib import Path

import ipywidgets as widgets
import jsonlines
import rich
import trueskill
from pooch import DOIDownloader
from rbo import rbo
from rich.table import Table
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from trueskill import TrueSkill

import openskill
from openskill.models import (
    BradleyTerryFull,
    BradleyTerryPart,
    PlackettLuce,
    ThurstoneMostellerFull,
    ThurstoneMostellerPart,
)

## Check OpenSkill Version

In [3]:
# Show the installed OpenSkill version so the results below can be tied to a release.
print(openskill.__version__)

6.0.0


## Download Data
The data we need is not available locally yet, so let's download it.
It's available [here](https://zenodo.org/records/10342317).


In [4]:
# Fetch the benchmark dataset by DOI into a `data` folder inside the
# current working directory.
downloader = DOIDownloader(progressbar=True)

working_directory = Path.cwd()
data_directory = working_directory / "data"
data_directory.mkdir(exist_ok=True)
dataset_path = data_directory / "overwatch.jsonl"

# Skip the transfer when the file is already on disk so that
# "Restart Kernel -> Run All" does not download the dataset again.
# NOTE: an interrupted download may leave a partial file behind; delete
# `data/overwatch.jsonl` manually to force a fresh download.
if not dataset_path.exists():
    downloader(
        url="doi:10.5281/zenodo.10359600/overwatch.jsonl",
        output_file=dataset_path,
        pooch=None,
    )

100%|█████████████████████████████████████| 63.6M/63.6M [00:00<00:00, 40.5GB/s]


## Select Model
OpenSkill comes with 5 models. Let's pick one for this benchmark.

In [5]:
# Every OpenSkill rating model that this notebook can benchmark.
models = [
    BradleyTerryFull,
    BradleyTerryPart,
    PlackettLuce,
    ThurstoneMostellerFull,
    ThurstoneMostellerPart,
]

# The selector lists the models by class name; PlackettLuce is preselected.
model_names = [model_class.__name__ for model_class in models]
widget = widgets.Select(
    options=model_names,
    value=PlackettLuce.__name__,
    description="Model:",
    disabled=False,
)
display(widget)

Select(description='Model:', index=2, options=('BradleyTerryFull', 'BradleyTerryPart', 'PlackettLuce', 'Thurst…

## Initialize Model
Let's call the constructor on the model.

In [6]:
# Remove the selector from the page; its last value stays readable.
widget.close()
selected_model_name = widget.value

# Resolve the selected name against the known model classes rather than
# passing the widget value to eval(), which would execute arbitrary text.
model_classes = {model_class.__name__: model_class for model_class in models}
m = model_classes[selected_model_name]

# Instantiate the chosen model and show its parameters.
model = m(balance=False)
print(model)

Plackett-Luce Model Parameters: 

mu: 25.0
sigma: 8.333333333333334



## Load Data
We are going to use the `jsonlines` library to read every match from the downloaded JSON Lines file into memory.

In [7]:
# Load Data
# Read every match into memory. The context manager closes the underlying
# file handle as soon as the reader has been consumed.
with jsonlines.open(data_directory / "overwatch.jsonl") as reader:
    data = list(reader)

Let's also hold all rating data in a single variable:

In [8]:
# Data Container
# Ratings keyed by player, one mapping per rating system.
openskill_players, trueskill_players = {}, {}

# Match collections filled in by the processing cells.
verified_matches, verified_test_set = [], []
training_set, test_set = [], []

# How many usable matches each player appears in.
match_count = {}

# Running counters.
available_matches = valid_matches = 0
openskill_correct_predictions = openskill_incorrect_predictions = 0
trueskill_correct_predictions = trueskill_incorrect_predictions = 0

# Training durations in seconds; assigned by the training cells.
openskill_time = trueskill_time = None

## Constants
Let's configure some settings.

In [9]:
# A match is discarded when any of its players appears in fewer than
# MINIMUM_MATCHES usable matches.
MINIMUM_MATCHES = 2
# Passed as `random_state` to train_test_split so the split is repeatable.
SEED = 1

## Process Data for Split
Let's now process the data and split it.

In [10]:
def _teams_if_valid(match):
    """Return ``(blue_team, red_team)`` for a usable match, else ``None``.

    A match is usable when its result is "WIN" or "LOSS", its teams are
    exactly "blue" and "red" (in that order), and neither team is empty.
    """
    if match.get("result") not in ("WIN", "LOSS"):
        return None

    teams = match.get("teams")
    if not teams or list(teams.keys()) != ["blue", "red"]:
        return None

    blue_team = teams.get("blue")
    red_team = teams.get("red")

    # Both sides need at least one player; a match with one empty team
    # cannot be rated, so reject it when *either* team is empty.
    if not blue_team or not red_team:
        return None

    return blue_team, red_team


print("Loading Raw Data into Memory")

# Process Data
print("Processing Data:")

data_size = len(data)

# Start from a clean slate so re-running this cell does not double count
# appearances or append the same matches twice.
match_count.clear()
verified_matches.clear()
available_matches = 0

# First pass: count how many usable matches each player appears in.
for match in data:
    valid_teams = _teams_if_valid(match)
    if valid_teams is None:
        continue

    for player in itertools.chain(*valid_teams):
        match_count[player] = match_count.get(player, 0) + 1

# Second pass: keep only the matches in which every player has played at
# least MINIMUM_MATCHES usable matches.
for match in tqdm(data):
    valid_teams = _teams_if_valid(match)
    if valid_teams is None:
        continue

    if any(
        match_count[player] < MINIMUM_MATCHES
        for player in itertools.chain(*valid_teams)
    ):
        continue

    available_matches += 1
    verified_matches.append(match)

print(f"Parsed {len(verified_matches)} Training Matches")
_ = gc.collect()

Loading Raw Data into Memory
Processing Data:


  0%|          | 0/61867 [00:00<?, ?it/s]

Parsed 5661 Training Matches


Now let's split the data.

In [11]:
# Split Data
# Hold out 33% of the verified matches as the test set; passing SEED as
# `random_state` keeps the split the same from run to run.
train, test = train_test_split(verified_matches, test_size=0.33, random_state=SEED)

## Parse Data for OpenSkill Training Set
Let's now parse the training data for OpenSkill.

In [12]:
print("Loading Raw Data from Training Set into Memory")

# Parse Training Data
print("Parsing Training Data:")

# Number of matches in the training split
train_size = len(train)

# Create a Progress Bar
t = tqdm(total=train_size)

# The model does not change between matches, so look it up once.
m = model
r = m.rating

# Drop ratings left over from a previous run of this cell; otherwise a
# re-run would continue training on top of already trained ratings.
openskill_players.clear()

# perf_counter() is a monotonic, high-resolution clock, which makes it the
# appropriate timer for measuring a duration (time.time() can jump when the
# system clock is adjusted).
os_process_time_start = time.perf_counter()

for match in train:
    won = match.get("result") == "WIN"

    teams: dict = match.get("teams")
    blue_team: dict = teams.get("blue")
    red_team: dict = teams.get("red")

    # Fetch each player's current rating, creating a default one on first sight.
    os_blue_players = {
        player: openskill_players.setdefault(player, r()) for player in blue_team
    }
    os_red_players = {
        player: openskill_players.setdefault(player, r()) for player in red_team
    }

    # The winning team is always listed first and given rank 0.
    if won:
        blue_team_result, red_team_result = m.rate(
            [list(os_blue_players.values()), list(os_red_players.values())],
            ranks=[0, 1],
        )
    else:
        red_team_result, blue_team_result = m.rate(
            [list(os_red_players.values()), list(os_blue_players.values())],
            ranks=[0, 1],
        )

    # Store the updated ratings back under the same player keys.
    openskill_players.update(zip(os_blue_players, blue_team_result))
    openskill_players.update(zip(os_red_players, red_team_result))

    t.update(1)

os_process_time_stop = time.perf_counter()
openskill_time = os_process_time_stop - os_process_time_start

# Close the bar so it is rendered as finished and its resources are released.
t.close()

print("Parsed Training Matches")
_ = gc.collect()

Loading Raw Data from Training Set into Memory
Parsing Training Data:


  0%|          | 0/3792 [00:00<?, ?it/s]

Parsed Training Matches


## Parse Data for TrueSkill Training Set
Let's now parse the training data for TrueSkill.

In [13]:
print("Loading Raw Data from Training Set into Memory")

# Parse Training Data
print("Parsing Training Data:")

# Number of matches in the training split
train_size = len(train)

# Create a Progress Bar
t = tqdm(total=train_size)

# Set backend here to test with scipy.
# The environment gets its own name: rebinding `TrueSkill` to an instance
# would shadow the imported class and make this cell fail with
# "TypeError: 'TrueSkill' object is not callable" when run a second time.
# It is built before the timer starts so that only rating work is measured.
trueskill_env = TrueSkill()

# Drop ratings left over from a previous run of this cell; otherwise a
# re-run would continue training on top of already trained ratings.
trueskill_players.clear()

# perf_counter() is a monotonic, high-resolution clock, which makes it the
# appropriate timer for measuring a duration.
ts_process_time_start = time.perf_counter()

for match in train:
    won = match.get("result") == "WIN"

    teams: dict = match.get("teams")
    blue_team: dict = teams.get("blue")
    red_team: dict = teams.get("red")

    # Fetch each player's current rating, creating a default one on first
    # sight. New ratings come from the same environment that rates them.
    ts_blue_players = {
        player: trueskill_players.setdefault(player, trueskill_env.create_rating())
        for player in blue_team
    }
    ts_red_players = {
        player: trueskill_players.setdefault(player, trueskill_env.create_rating())
        for player in red_team
    }

    # Without explicit ranks, the first rating group is treated as the winner.
    if won:
        blue_team_ratings, red_team_ratings = trueskill_env.rate(
            [list(ts_blue_players.values()), list(ts_red_players.values())],
        )
    else:
        red_team_ratings, blue_team_ratings = trueskill_env.rate(
            [list(ts_red_players.values()), list(ts_blue_players.values())]
        )

    # Store the updated ratings back under the same player keys.
    trueskill_players.update(zip(ts_blue_players, blue_team_ratings))
    trueskill_players.update(zip(ts_red_players, red_team_ratings))

    t.update(1)

ts_process_time_stop = time.perf_counter()
trueskill_time = ts_process_time_stop - ts_process_time_start

# Close the bar so it is rendered as finished and its resources are released.
t.close()

print("Parsed Training Matches")
_ = gc.collect()

Loading Raw Data from Training Set into Memory
Parsing Training Data:


  0%|          | 0/3792 [00:00<?, ?it/s]

Parsed Training Matches


## Process Test Set
We only keep test matches in which every player already has a rating from the training set.

In [14]:
print("Loading Raw Data from Test Set into Memory")

# Parse Test Set
print("Parsing Test Data:")

# Number of held-out matches to screen
test_size = len(test)

# Create a Progress Bar
t = tqdm(total=test_size)

for match in test:
    teams: dict = match.get("teams")
    blue_team: dict = teams.get("blue")
    red_team: dict = teams.get("red")

    # A match is usable only when every participant on both sides
    # has an entry in the trained OpenSkill ratings.
    blue_rated = all(player in openskill_players for player in blue_team)
    red_rated = all(player in openskill_players for player in red_team)

    t.update(1)

    if blue_rated and red_rated:
        verified_test_set.append(match)
        valid_matches += 1

print("Parsed Test Matches")
_ = gc.collect()

Loading Raw Data from Test Set into Memory
Parsing Test Data:


  0%|          | 0/1869 [00:00<?, ?it/s]

Parsed Test Matches


## Predict Matches using OpenSkill
We shall use `predict_win` to try to predict matches in the test set.

In [15]:
print("Loading Raw Data from Test Set into Memory")

# Parse Test Set
print("Predicting Test Data:")

# Number of verified test matches to predict
test_size = len(verified_test_set)

# Create a Progress Bar
t = tqdm(total=test_size)

m = model

for match in verified_test_set:
    # "WIN" means the blue team won the match.
    won = match.get("result") == "WIN"

    teams: dict = match.get("teams")
    blue_team: dict = teams.get("blue")
    red_team: dict = teams.get("red")

    # Look up the trained rating of every participant.
    os_blue_players = {player: openskill_players[player] for player in blue_team}
    os_red_players = {player: openskill_players[player] for player in red_team}

    blue_win_probability, red_win_probability = m.predict_win(
        [list(os_blue_players.values()), list(os_red_players.values())]
    )

    # The prediction is correct when the favoured side matches the outcome.
    blue_favoured = blue_win_probability > red_win_probability
    if blue_favoured == won:
        openskill_correct_predictions += 1
    else:
        openskill_incorrect_predictions += 1

    t.update(1)

print("Predicted Test Matches")
_ = gc.collect()

Loading Raw Data from Test Set into Memory
Predicting Test Data:


  0%|          | 0/635 [00:00<?, ?it/s]

Predicted Test Matches


## Predict Matches using TrueSkill
We shall use the `win_probability` function provided in the package's documentation.

In [16]:
def win_probability(team1, team2):
    """Estimate the probability that ``team1`` beats ``team2``.

    Both arguments are sized collections of ratings exposing ``mu`` and
    ``sigma``. The difference of the summed means is divided by the square
    root of the pooled variance (per-player ``BETA`` squared plus every
    rating's ``sigma`` squared) and passed to the global TrueSkill
    environment's ``cdf``.
    """
    mu_gap = sum(rating.mu for rating in team1) - sum(rating.mu for rating in team2)
    variance_total = sum(
        rating.sigma**2 for rating in itertools.chain(team1, team2)
    )
    player_total = len(team1) + len(team2)
    spread = math.sqrt(
        player_total * (trueskill.BETA * trueskill.BETA) + variance_total
    )
    env = trueskill.global_env()
    return env.cdf(mu_gap / spread)


print("Loading Raw Data from Test Set into Memory")

# Parse Test Set
print("Predicting Test Data:")

# Number of verified test matches to predict
test_size = len(verified_test_set)

# Create a Progress Bar
t = tqdm(total=test_size)

for match in verified_test_set:
    # "WIN" means the blue team won the match.
    won = match.get("result") == "WIN"

    teams: dict = match.get("teams")
    blue_team: dict = teams.get("blue")
    red_team: dict = teams.get("red")

    # Look up the trained rating of every participant.
    ts_blue_players = {player: trueskill_players[player] for player in blue_team}
    ts_red_players = {player: trueskill_players[player] for player in red_team}

    blue_win_probability = win_probability(
        list(ts_blue_players.values()), list(ts_red_players.values())
    )
    # Red's chance is the complement of blue's.
    red_win_probability = abs(1 - blue_win_probability)

    # The prediction is correct when the favoured side matches the outcome.
    blue_favoured = blue_win_probability > red_win_probability
    if blue_favoured == won:
        trueskill_correct_predictions += 1
    else:
        trueskill_incorrect_predictions += 1

    t.update(1)

print("Predicted Test Matches")
_ = gc.collect()

Loading Raw Data from Test Set into Memory
Predicting Test Data:


  0%|          | 0/635 [00:00<?, ?it/s]

Predicted Test Matches


## Results
Let's print out the result and interpret it.

In [17]:
def _accuracy_percent(correct, incorrect):
    """Return ``correct`` as a percentage of all predictions, rounded to 2 places.

    Returns 0.0 when no predictions were made instead of dividing by zero.
    """
    total = correct + incorrect
    if total == 0:
        return 0.0
    return round((correct / total) * 100, 2)


table = Table(title="Benchmark Results")
table.add_column("Information", justify="right", style="cyan", no_wrap=True)
table.add_column("Value", style="magenta")

table.add_row("Available Matches", f"{available_matches}")
table.add_row("Valid Matches", f"{valid_matches}")

openskill_accuracy = _accuracy_percent(
    openskill_correct_predictions, openskill_incorrect_predictions
)
trueskill_accuracy = _accuracy_percent(
    trueskill_correct_predictions, trueskill_incorrect_predictions
)

# Accuracy rows show "correct/incorrect [percentage]".
table.add_row(
    f"{model.__class__.__name__} Accuracy",
    f"{openskill_correct_predictions}/"
    f"{openskill_incorrect_predictions} "
    f"[{openskill_accuracy: .2f}%]",
)
table.add_row("OpenSkill Duration", f"{openskill_time}")

table.add_row(
    "TrueSkill Accuracy",
    f"{trueskill_correct_predictions}/"
    f"{trueskill_incorrect_predictions} "
    f"[{trueskill_accuracy: .2f}%]",
)
table.add_row("TrueSkill Duration", f"{trueskill_time}")

# TrueSkill training time expressed as a percentage of OpenSkill's.
# Reported as "n/a" when either duration is missing or OpenSkill's is zero.
if openskill_time and trueskill_time is not None:
    speedup = (trueskill_time / openskill_time) * 100
    speedup_text = f"{speedup: .2f}"
else:
    speedup = None
    speedup_text = "n/a"
table.add_row("Speedup (%)", speedup_text)

# Relative accuracy difference against TrueSkill; undefined when the
# TrueSkill accuracy is zero.
if trueskill_accuracy:
    accuracy_bump = (
        (openskill_accuracy - trueskill_accuracy) / trueskill_accuracy
    ) * 100
    accuracy_bump_text = f"{accuracy_bump: .2f}"
else:
    accuracy_bump = None
    accuracy_bump_text = "n/a"
table.add_row("Accuracy Bump (%)", accuracy_bump_text)

rich.print(table)