## Rank Accuracy Benchmarks
This is a reproducible benchmark that checks the accuracy of [openskill.py](https://openskill.me/) against a large dataset of PUBG matches.

Let's start by installing the necessary libraries:

In [1]:
%pip install -U polars rbo rich numpy tqdm pooch ipywidgets openskill

Collecting rbo
  Downloading rbo-0.1.3-py3-none-any.whl.metadata (4.6 kB)
Collecting rich
  Downloading rich-13.7.1-py3-none-any.whl.metadata (18 kB)
Collecting numpy
  Downloading numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting ipywidgets
  Downloading ipywidgets-8.1.3-py3-none-any.whl.metadata (2.4 kB)
Collecting openskill
  Downloading openskill-6.0.0-py3-none-any.whl.metadata (6.7 kB)
Collecting widgetsnbextension~=4.0.11 (from ipywidgets)
  Downloading widgetsnbextension-4.0.11-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.11 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.11-py3-none-any.whl.metadata (4.1 kB)
Downloading rbo-0.1.3-py3-none-any.whl (7.8 kB)
Downloading rich-13.7.1-py3-none-any.whl (240 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

## Import Libraries
We are going to import polars to load the data and rich to display the results in a user-friendly format.
We are also going to import ipywidgets for native interactivity within the browser.

In [2]:
import gc
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict

import ipywidgets as widgets
import numpy as np
import polars as pl
import rich
from pooch import DOIDownloader
from rbo import rbo
from rich.table import Table
from tqdm.notebook import tqdm

import openskill
from openskill.models import (
    BradleyTerryFull,
    BradleyTerryPart,
    PlackettLuce,
    ThurstoneMostellerFull,
    ThurstoneMostellerPart,
)

## Check OpenSkill Version

In [3]:
# Show the installed OpenSkill release so the results below can be tied to a specific version.
print(openskill.__version__)

6.0.0


## Download Data
The data we need is not available locally yet, so let's download it.
It's available [here](https://zenodo.org/records/10342317).


In [4]:
# Fetch the benchmark dataset (train and test splits) into ./data.
downloader = DOIDownloader(progressbar=True)

working_directory = Path.cwd()
data_directory = working_directory / "data"
data_directory.mkdir(exist_ok=True)

downloaded_any = False
for file_name in ("train.parquet", "test.parquet"):
    target_file = data_directory / file_name

    # The files are large; skip any that a previous run already fetched.
    if target_file.exists():
        continue

    # Keep the short pause between two consecutive download requests.
    if downloaded_any:
        time.sleep(3)

    # Download under a temporary name and rename only on success, so an
    # interrupted transfer is never mistaken for a complete file later on.
    partial_file = target_file.with_name(f"{file_name}.part")
    downloader(
        url=f"doi:10.5281/zenodo.10342317/{file_name}",
        output_file=partial_file,
        pooch=None,
    )
    partial_file.replace(target_file)
    downloaded_any = True

100%|█████████████████████████████████████| 1.64G/1.64G [00:00<00:00, 1.34TB/s]
100%|████████████████████████████████████████| 369M/369M [00:00<00:00, 305GB/s]


## Define Data Containers
We need to define some data container classes. We shall use dataclasses for this purpose with slots enabled.

In [5]:
@dataclass(slots=True)
class Player:
    """One player's entry in a single match, built from one row of the dataset."""

    # Value of the row's "player_name" column; also used as the key in Team.players.
    name: str
    # Value of the row's "kill_ratio" column.
    kill_ratio: float
    # Value of the row's "assist_ratio" column.
    assist_ratio: float


@dataclass(slots=True)
class Team:
    """A team taking part in one match, together with its final placement."""

    # Value of the row's "team_id" column.
    id: int
    # Identifier of the match this team played in.
    match_id: str
    # Value of the row's "team_placement" column.
    rank: int
    # Team members keyed by player name.
    players: Dict[str, Player]


@dataclass(slots=True)
class Match:
    """A single match and the teams that took part in it."""

    # Value of the row's "match_id" column.
    id: str
    # Participating teams keyed by team id.
    teams: Dict[int, Team]

## Select Model
OpenSkill comes with 5 models. Let's pick one for this benchmark.

In [6]:
# Every rating model offered for this benchmark, in the order they are listed to the user.
models = [
    BradleyTerryFull,
    BradleyTerryPart,
    PlackettLuce,
    ThurstoneMostellerFull,
    ThurstoneMostellerPart,
]

# Selection box listing the models by class name; PlackettLuce is preselected.
widget = widgets.Select(
    description="Model:",
    options=[model_class.__name__ for model_class in models],
    value=PlackettLuce.__name__,
    disabled=False,
)
display(widget)

Select(description='Model:', index=2, options=('BradleyTerryFull', 'BradleyTerryPart', 'PlackettLuce', 'Thurst…

## Initialize Model
Let's call the constructor on the model.

In [7]:
# Dismiss the selector; the chosen value is read from it just below.
widget.close()

# Resolve the selected name through an explicit lookup table instead of
# eval(), so only the known model classes can ever be instantiated.
model_classes = {model_class.__name__: model_class for model_class in models}
selected_model_class = model_classes[widget.value]

# The model instance used for all rating and prediction in this notebook.
model = selected_model_class(balance=True)
print(model)

Plackett-Luce Model Parameters: 

mu: 25.0
sigma: 8.333333333333334



## Load Data
We are going to use the Polars library to lazily scan the parquet files.

In [8]:
# The test split is only scanned lazily here; it is collected in a later cell.
test = pl.scan_parquet(data_directory / "test.parquet")

# Scan the training split and collect it into memory right away.
train = pl.scan_parquet(data_directory / "train.parquet").collect(streaming=True)

## Optimize Dataframe
Let's change the data types to reduce memory usage.

In [9]:
def reduce_memory_usage_pl(df, name):
    """
    Shrink a polars DataFrame by casting its columns to narrower data types.

    Signed integer columns are cast to the smallest of Int8/Int16/Int32 whose
    range contains every value. Float columns are cast to Float32 when their
    values fit inside the Float32 range (this trades precision for memory).
    String columns are cast to Categorical. Every other column is left as is.
    The estimated size before and after is printed.

    Original pandas version of this function:
    https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65

    :param df: The polars DataFrame to shrink.
    :param name: Label used for the dataframe in the printed messages.
    :return: A DataFrame with the same columns and, where possible, narrower
             data types.
    """
    print(f"Memory usage of dataframe {name} is {round(df.estimated_size('mb'), 2)} MB")

    integer_types = [pl.Int8, pl.Int16, pl.Int32, pl.Int64]
    float_types = [pl.Float32, pl.Float64]

    # Candidate integer types, narrowest first, paired with their value limits.
    integer_candidates = [
        (pl.Int8, np.iinfo(np.int8)),
        (pl.Int16, np.iinfo(np.int16)),
        (pl.Int32, np.iinfo(np.int32)),
    ]
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        target_type = None

        if col_type == pl.Utf8:
            target_type = pl.Categorical
        elif col_type in integer_types or col_type in float_types:
            # Only numeric columns need their value range, so string columns
            # are never scanned for a minimum/maximum.
            c_min = df[col].min()
            c_max = df[col].max()

            # An empty or all-null column has no range to test; keep its dtype.
            if c_min is None or c_max is None:
                continue

            if col_type in integer_types:
                # Bounds are inclusive: a value equal to a type's limit still fits.
                for candidate, limits in integer_candidates:
                    if limits.min <= c_min and c_max <= limits.max:
                        target_type = candidate
                        break
            elif float32_limits.min < c_min and c_max < float32_limits.max:
                target_type = pl.Float32

        if target_type is not None:
            df = df.with_columns(df[col].cast(target_type))

    print(
        f"Memory usage of dataframe {name} became {round(df.estimated_size('mb'), 2)} MB"
    )
    return df


# Reduce Memory Usage
# `train` is rebound to the compacted frame, dropping this cell's reference to the wider original.
train = reduce_memory_usage_pl(train, "train")
# Assigning the result to `_` keeps the collector's return value out of the cell output.
_ = gc.collect()

Memory usage of dataframe train is 11461.2 MB
Memory usage of dataframe train became 3008.43 MB


Let's also hold all rating data in a single variable:

In [10]:
# Data Container
# Maps a player's name to that player's current rating object; filled during
# training and read again when predicting the test set.
openskill_players = {}

## Parse Data for Training Set
Let's now parse the training data and put it into our dataclasses.

In [11]:
print("Loading Raw Data from Training Set into Memory")

# Parse Training Data
print("Parsing Training Data:")

# Number of player rows in the training set; used as the progress-bar total.
train_size = len(train)

# Training matches keyed by match id.
train_matches: Dict[str, Match] = {}

# Wrapping the row iterator in tqdm advances the bar once per row and closes
# it automatically when the last row has been consumed.
for raw_player in tqdm(train.iter_rows(named=True), total=train_size):
    match_id = raw_player["match_id"]
    team_id = raw_player["team_id"]
    player = Player(
        name=raw_player["player_name"],
        kill_ratio=raw_player["kill_ratio"],
        assist_ratio=raw_player["assist_ratio"],
    )

    # Create the match the first time its id is seen.
    match = train_matches.get(match_id)
    if match is None:
        match = Match(id=match_id, teams={})
        train_matches[match_id] = match

    # Create the team the first time it appears within this match; its rank
    # is taken from that first row.
    team = match.teams.get(team_id)
    if team is None:
        team = Team(
            id=team_id,
            match_id=match_id,
            rank=raw_player["team_placement"],
            players={},
        )
        match.teams[team_id] = team

    team.players[player.name] = player

print(f"Parsed {len(train_matches)} Training Matches")
# Assigning to `_` keeps the collector's return value out of the cell output.
_ = gc.collect()

Loading Raw Data from Training Set into Memory
Parsing Training Data:


  0%|          | 0/51704546 [00:00<?, ?it/s]

Parsed 583975 Training Matches


0

## Initialize OpenSkill Ratings for Train Set
Next let's use the ``model.rating`` method to create new rating objects for each player in the training set, and then update those ratings by rating every training match with ``model.rate``.

In [12]:
# Initialize OpenSkill Players for Training
print("Initializing Players for Training Set:")

# Create a Progress Bar
t = tqdm(total=train_size)

# Give every player that appears in the training matches a fresh rating.
for match in train_matches.values():
    for team in match.teams.values():
        for player_name in team.players:
            openskill_players[player_name] = model.rating(name=player_name)
            t.update(1)

# Rate OpenSkill Players for Training
print("Rate Training Matches:")
t = tqdm(total=len(train_matches))

for match in train_matches.values():
    match_teams = list(match.teams.values())

    # Placement of each team and the current ratings of its players, in the same order.
    ranks = [team.rank for team in match_teams]
    teams_to_rate = [
        [openskill_players[player_name] for player_name in team.players]
        for team in match_teams
    ]

    # Only matches with at least two teams are rated.
    if len(teams_to_rate) > 1:
        # Store the ratings returned by the model back under each player's name.
        for rated_team in model.rate(teams=teams_to_rate, ranks=ranks):
            for rated_player in rated_team:
                openskill_players[rated_player.name] = rated_player
    t.update(1)
gc.collect()

Initializing Players for Training Set:


  0%|          | 0/51704546 [00:00<?, ?it/s]

Rate Training Matches:


  0%|          | 0/583975 [00:00<?, ?it/s]

19

## Clear Memory
Let's delete some variables so we can save some memory and load the test set.

In [13]:
# Drop the training frame and the parsed training matches before loading the test split.
del train, train_matches
gc.collect()

# Collect the lazily scanned test split, then shrink its column types.
test = reduce_memory_usage_pl(test.collect(streaming=True), "test")
gc.collect()

Memory usage of dataframe test is 2546.47 MB
Memory usage of dataframe test became 668.96 MB


0

## Parsing Test Set
Now let's do the same thing for the test set and grab the matches.

In [14]:
# Get Unique Matches for Testing
print("Loading Raw Data from Test Set into Memory")

# Number of player rows in the test set; used as the progress-bar total.
test_size = len(test)

# Parse Data
print("Parsing Test Data:")

# Test matches keyed by match id.
test_matches: Dict[str, Match] = {}

# Wrapping the row iterator in tqdm advances the bar once per row and closes
# it automatically when the last row has been consumed.
for raw_player in tqdm(test.iter_rows(named=True), total=test_size):
    match_id = raw_player["match_id"]
    team_id = raw_player["team_id"]
    player = Player(
        name=raw_player["player_name"],
        kill_ratio=raw_player["kill_ratio"],
        assist_ratio=raw_player["assist_ratio"],
    )

    # Create the match the first time its id is seen.
    match = test_matches.get(match_id)
    if match is None:
        match = Match(id=match_id, teams={})
        test_matches[match_id] = match

    # Create the team the first time it appears within this match; its rank
    # is taken from that first row.
    team = match.teams.get(team_id)
    if team is None:
        team = Team(
            id=team_id,
            match_id=match_id,
            rank=raw_player["team_placement"],
            players={},
        )
        match.teams[team_id] = team

    team.players[player.name] = player

Loading Raw Data from Test Set into Memory
Parsing Test Data:


  0%|          | 0/11487236 [00:00<?, ?it/s]

## Predict Test Set
Let's use the data we trained on to try and predict the test set.

In [15]:
# Predict OpenSkill Matches
print("Predict Matches in Test Set using OpenSkill:")

# Accuracy Trackers
rbo_scores = []
openskill_correct_predictions = 0
openskill_incorrect_predictions = 0
# Matches that could not be scored; previously these were dropped without a trace.
skipped_matches = 0

# Iterating through tqdm closes the progress bar after the last match.
for match_id, match in tqdm(test_matches.items(), total=len(test_matches)):
    teams_to_predict = []
    team_ranks = []
    for team in match.teams.values():
        team_ranks.append(team.rank)
        # Players never seen during training get a fresh rating for this prediction only.
        teams_to_predict.append(
            [
                openskill_players[player_name]
                if player_name in openskill_players
                else model.rating(name=player_name)
                for player_name in team.players
            ]
        )

    if len(teams_to_predict) > 1:
        # rank -> team. Teams sharing the same rank collapse into a single
        # entry (the last one wins), in both mappings.
        actual_ranks = dict(zip(team_ranks, teams_to_predict))
        # First element of each predict_rank entry — presumably the predicted
        # rank of the team at the same position; confirm against the OpenSkill docs.
        predictions = [
            prediction[0] for prediction in model.predict_rank(teams_to_predict)
        ]
        expected_ranks = dict(zip(predictions, teams_to_predict))

        # Built once per match instead of once per sort-key evaluation.
        expected_order = list(expected_ranks.values())

        try:
            # Re-order the actual mapping to follow the order of the expected
            # mapping. list.index raises ValueError when a team is missing
            # from expected_order (e.g. after a rank collision above).
            actual_ranks = dict(
                sorted(
                    actual_ranks.items(),
                    key=lambda item: expected_order.index(item[1]),
                )
            )

            ar_index = next(iter(actual_ranks))
            er_index = next(iter(expected_ranks))

            similarity = rbo.RankingSimilarity(
                list(actual_ranks.keys()), list(expected_ranks.keys())
            ).rbo_ext()
            rbo_scores.append(similarity)

            # NOTE(review): actual_ranks was just re-ordered to follow
            # expected_ranks, so when no ranks collide both lookups below
            # appear to resolve to the same team and the check always passes.
            # Confirm this is the intended definition of a correct prediction
            # before relying on the accuracy figure.
            if actual_ranks[ar_index] == expected_ranks[er_index]:
                openskill_correct_predictions += 1
            else:
                openskill_incorrect_predictions += 1
        except ValueError:
            # Keep the best-effort behavior (the match is left out of the
            # metrics) but account for it instead of ignoring it silently.
            skipped_matches += 1

if skipped_matches:
    print(f"Skipped {skipped_matches} matches that could not be scored")

Predict Matches in Test Set using OpenSkill:


  0%|          | 0/145994 [00:00<?, ?it/s]

## Results
Let's print out the result and interpret it.

In [16]:
# Summarize the benchmark in a two-column table.
table = Table(title="Benchmark Results")
table.add_column("Information", justify="right", style="cyan", no_wrap=True)
table.add_column("Value", style="magenta")

table.add_row("Available Matches", f"{len(test_matches)}")

# Share of scored matches counted as correct, in percent. Guarded so that a
# run in which no match could be scored reports "n/a" instead of raising
# ZeroDivisionError.
total_predictions = openskill_correct_predictions + openskill_incorrect_predictions
if total_predictions:
    openskill_accuracy = round(
        (openskill_correct_predictions / total_predictions) * 100,
        2,
    )
    accuracy_text = f"[{openskill_accuracy: .2f}%]"
else:
    openskill_accuracy = None
    # The leading space mirrors the numeric format and keeps rich from
    # reading the brackets as a markup tag.
    accuracy_text = "[ n/a]"

# The value reads "<correct>/<incorrect> [<accuracy>%]".
table.add_row(
    f"{model.__class__.__name__} Accuracy",
    f"{openskill_correct_predictions}/"
    f"{openskill_incorrect_predictions} "
    f"{accuracy_text}",
)

# Mean rank-biased overlap across all scored matches, scaled to 0-100.
if rbo_scores:
    rbo_score = (sum(rbo_scores) / len(rbo_scores)) * 100
    rbo_text = f"{rbo_score: .2f}"
else:
    rbo_score = None
    rbo_text = " n/a"
table.add_row("Rank-Biased Overlap Score: ", rbo_text)
rich.print(table)