## Draw Accuracy Benchmarks
This is a reproducible benchmark that checks the accuracy of [openskill.py](https://openskill.me/) against a small dataset of chess matches.

Let's start by installing the necessary libraries:

In [1]:
%pip install pandas rbo rich numpy scikit-learn tqdm pooch ipywidgets openskill

Collecting rbo
  Obtaining dependency information for rbo from https://files.pythonhosted.org/packages/f0/b3/aa1923e0ed19ecf190f7e8d9fe939f9020dd601b64e190b1f58b3692be8e/rbo-0.1.3-py3-none-any.whl.metadata
  Downloading rbo-0.1.3-py3-none-any.whl.metadata (4.6 kB)
Collecting openskill
  Obtaining dependency information for openskill from https://files.pythonhosted.org/packages/93/e3/1db14897e93d38dd9e7ec564d0428959a8ff5522e28dc825f6466a4532f8/openskill-6.0.0-py3-none-any.whl.metadata
  Downloading openskill-6.0.0-py3-none-any.whl.metadata (6.7 kB)
Downloading rbo-0.1.3-py3-none-any.whl (7.8 kB)
Downloading openskill-6.0.0-py3-none-any.whl (50 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rbo, openskill
Successfully installed openskill-6.0.0 rbo-0.1.3
Note: you may need to restart the kernel to use updated packages.


## Import Libraries
We are going to import pandas to do the data loading and rich to display things in a user-friendly format.
We are also going to import ipywidgets for native interactivity within the browser.

In [2]:
import gc
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Dict, List

import ipywidgets as widgets
import pandas as pd
import rich
from pooch import DOIDownloader
from rbo import rbo
from rich.table import Table
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import openskill
from openskill.models import (
    BradleyTerryFull,
    BradleyTerryPart,
    PlackettLuce,
    ThurstoneMostellerFull,
    ThurstoneMostellerPart,
)

## Check OpenSkill Version

In [3]:
# Show which OpenSkill release is installed in the running kernel.
print(openskill.__version__)

6.0.0


## Download Data
The data we need is not available locally yet, so let's download it.

In [4]:
# Keep the dataset in a "data" folder next to wherever the notebook is run.
working_directory = Path.cwd()
data_directory = working_directory / "data"
data_directory.mkdir(exist_ok=True)
chess_csv = data_directory / "chess.csv"

downloader = DOIDownloader(progressbar=True)

# Only hit the network when a previous run has not already fetched the file;
# an empty file is treated as a failed download and fetched again.
if not (chess_csv.is_file() and chess_csv.stat().st_size > 0):
    downloader(
        url="doi:10.5281/zenodo.10344773/chess.csv",
        output_file=chess_csv,
        pooch=None,
    )

0.00B [00:00, ?B/s]


In [5]:
class Result(Enum):
    """Outcome of a single chess match as recorded by this benchmark."""
    WHITE_WINS = 1
    BLACK_WINS = 2
    # Assigned by the parsing cells to every match where neither side's result is "win".
    STALEMATE = 3


@dataclass(slots=True)
class Player:
    """A match participant, identified by the username read from the dataset."""
    name: str


@dataclass(slots=True)
class Match:
    """A single game: its outcome together with the two players involved."""
    result: Result
    # Maps username -> Player; the parsing cells insert white first, then black.
    players: Dict[str, Player]

## Select Model
OpenSkill comes with 5 models. Let's pick one for this benchmark.

In [6]:
# Every rating model that can be chosen for this benchmark run.
models = [
    BradleyTerryFull,
    BradleyTerryPart,
    PlackettLuce,
    ThurstoneMostellerFull,
    ThurstoneMostellerPart,
]

# List the models by class name, with Plackett-Luce preselected.
widget = widgets.Select(
    description="Model:",
    options=[model_class.__name__ for model_class in models],
    value=PlackettLuce.__name__,
    disabled=False,
)
display(widget)

Select(description='Model:', index=2, options=('BradleyTerryFull', 'BradleyTerryPart', 'PlackettLuce', 'Thurst…

## Initialize Model
Let's call the constructor on the model.

In [7]:
# The selector is no longer needed once this cell runs.
widget.close()

# Resolve the selected class name through an explicit lookup table rather than
# eval(), so only the classes listed in `models` can ever be instantiated.
model_classes = {model_class.__name__: model_class for model_class in models}
m = model_classes[widget.value]

# Build the model with its default parameters and show them.
model = m()
print(model)

Plackett-Luce Model Parameters: 

mu: 25.0
sigma: 8.333333333333334



## Load Data
We are going to use pandas to load the CSV file.

In [8]:
# Load Data
df = pd.read_csv(data_directory / "chess.csv", index_col=0)

# Split Data
train, test = train_test_split(df, test_size=0.3, random_state=27)

Let's also hold all rating data in a single variable:

In [9]:
# Data Container
# Data Container: maps each player's name to that player's latest rating object
openskill_players = {}

## Parse Data for Training Set
Let's now parse the training data and put it into our dataclasses.

In [10]:
print("Loading Raw Data from Training Set into Memory")

# Parse Training Data
print("Parsing Training Data:")

# Number of rows (one per match) in the training split
train_size = len(train)

# Training Data
train_matches: List[Match] = []

# Walk the four needed columns in lockstep instead of building a Series per row
# with iterrows(); wrapping the iterable lets tqdm finish and close the bar itself.
training_rows = zip(
    train["white_username"],
    train["black_username"],
    train["white_result"],
    train["black_result"],
)

for white_name, black_name, white_result, black_result in tqdm(
    training_rows, total=train_size
):
    # White is inserted first: the rating cell unpacks the keys in this order.
    players = {
        white_name: Player(name=white_name),
        black_name: Player(name=black_name),
    }

    if white_result == "win":
        result = Result.WHITE_WINS
    elif black_result == "win":
        result = Result.BLACK_WINS
    else:
        # Neither side is recorded as the winner, so the game counts as a draw.
        result = Result.STALEMATE

    train_matches.append(Match(result=result, players=players))

print(f"Parsed {len(train_matches)} Training Matches")
_ = gc.collect()

Loading Raw Data from Training Set into Memory
Parsing Training Data:


  0%|          | 0/46815 [00:00<?, ?it/s]

Parsed 46815 Training Matches


## Initialize OpenSkill Ratings for Train Set
Next let's use the ``model.rating`` method to create new rating objects for each player in the training set.

In [11]:
# Initialize OpenSkill Players for Training
print("Initializing Players for Training Set:")

# Create one starting rating per distinct player instead of one per appearance.
# Writing them into openskill_players afterwards still resets every training
# player, so re-running this cell starts from fresh ratings as before.
initial_ratings = {}
for match in tqdm(train_matches, total=train_size):
    for player_name in match.players:
        if player_name not in initial_ratings:
            initial_ratings[player_name] = model.rating(name=player_name)
openskill_players.update(initial_ratings)

# Rate OpenSkill Players for Training
print("Rate Training Matches:")

# Ranks are given in (white, black) order: rank 1 marks the winner and equal
# ranks record a draw.
ranks_by_result = {
    Result.WHITE_WINS: (1, 2),
    Result.BLACK_WINS: (2, 1),
    Result.STALEMATE: (1, 1),
}

for match in tqdm(train_matches):
    # Keys were inserted white-first, so player_1 is white and player_2 is black.
    player_1, player_2 = match.players.keys()
    teams = [[openskill_players[player_1]], [openskill_players[player_2]]]

    # Hand the model a fresh list on each call so the lookup table is never shared.
    rated_teams = model.rate(teams=teams, ranks=list(ranks_by_result[match.result]))

    # Store the updated ratings so that later matches build on them.
    for team in rated_teams:
        for player in team:
            openskill_players[player.name] = player

_ = gc.collect()

Initializing Players for Training Set:


  0%|          | 0/46815 [00:00<?, ?it/s]

Rate Training Matches:


  0%|          | 0/46815 [00:00<?, ?it/s]

## Clear Memory
Let's delete some variables so we can save some memory.

In [12]:
# Drop the training frame and the parsed training matches in one statement,
# then ask the garbage collector to reclaim them.
del train, train_matches
_ = gc.collect()

## Parsing Test Set
Now let's do the same thing for the test set and grab the matches.

In [13]:
# Get Unique Matches for Testing
print("Loading Raw Data from Test Set into Memory")

test_size = len(test)

# Parse Data
print("Parsing Test Data:")

t = tqdm(total=test_size)

# Test Data
test_matches: List[Match] = []

for match_index, row in test.iterrows():
    white_player = Player(name=row["white_username"])
    black_player = Player(name=row["black_username"])
    players = {row["white_username"]: white_player, row["black_username"]: black_player}

    white_result = row["white_result"]
    black_result = row["black_result"]

    if white_result == "win":
        match = Match(result=Result.WHITE_WINS, players=players)
    elif black_result == "win":
        match = Match(result=Result.BLACK_WINS, players=players)
    else:
        match = Match(result=Result.STALEMATE, players=players)

    test_matches.append(match)
    t.update(1)

Loading Raw Data from Test Set into Memory
Parsing Test Data:


  0%|          | 0/20064 [00:00<?, ?it/s]

## Predict Test Set
Let's use the ratings learned from the training set to try and predict whether each match in the test set ends in a draw.

In [14]:
# Predict OpenSkill Matches
print("Predict Matches in Test Set using OpenSkill:")

# Accuracy Trackers
openskill_correct_predictions = 0
openskill_incorrect_predictions = 0

for match in tqdm(test_matches):
    # The benchmark scores a binary question: did the match end in a draw?
    draw = match.result == Result.STALEMATE

    # Keys were inserted white-first, so player_1 is white and player_2 is black.
    player_1, player_2 = match.players.keys()

    # Players that never appeared in training get a fresh default rating.
    teams = [
        [
            openskill_players[player_name]
            if player_name in openskill_players
            else model.rating(name=player_name)
        ]
        for player_name in (player_1, player_2)
    ]

    white_win_probability, black_win_probability = model.predict_win(teams)
    draw_probability = model.predict_draw(teams)

    # The two win probabilities sum to 1, so comparing the draw probability with
    # their sum could never select a draw. Compare the three outcomes on one
    # scale instead.
    # NOTE(review): this treats predict_win as conditional on a decisive game,
    # which is a modelling assumption - confirm it matches the intended benchmark.
    decisive_probability = 1 - draw_probability
    likeliest_win_probability = decisive_probability * max(
        white_win_probability, black_win_probability
    )
    predicted_draw = draw_probability > likeliest_win_probability

    if predicted_draw == draw:
        openskill_correct_predictions += 1
    else:
        openskill_incorrect_predictions += 1

Predict Matches in Test Set using OpenSkill:


  0%|          | 0/20064 [00:00<?, ?it/s]

## Results
Let's print out the result and interpret it.

In [15]:
table = Table(title="Benchmark Results")
table.add_column("Information", justify="right", style="cyan", no_wrap=True)
table.add_column("Value", style="magenta")

table.add_row("Available Matches", f"{len(test_matches)}")

# Accuracy as a percentage of all scored predictions; report 0.0 rather than
# raising ZeroDivisionError when no predictions were scored.
total_predictions = openskill_correct_predictions + openskill_incorrect_predictions
if total_predictions:
    openskill_accuracy = round(
        (openskill_correct_predictions / total_predictions) * 100,
        2,
    )
else:
    openskill_accuracy = 0.0

# The value cell reads "<correct>/<incorrect> [<accuracy>%]".
table.add_row(
    f"{model.__class__.__name__} Accuracy",
    f"{openskill_correct_predictions}/"
    f"{openskill_incorrect_predictions} "
    f"[{openskill_accuracy: .2f}%]",
)
rich.print(table)