In [None]:
import functools
import pickle
import collections
import tqdm
import tinydb
import subprocess
import numpy as np
import os

from slippi_ai import nametags
from slippi_db.scripts.make_local_dataset import check_replay
from slippi_db.utils import delete_from_zip

from slippi import slippi_api as slippi_api_lib
import ratelimiter
from tinydb import TinyDB, Query
import tqdm.notebook
import datetime
import logging
from slippi_ai import nametags

import pandas as pd


In [None]:
# Render every displayed DataFrame as an interactive itables table.
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

In [None]:
%%time
# Location of the pickle holding the parsed replay rows.
parsed_pkl_path = '/linusr/vlad/SSBM/Replays/parsed.pkl'

# NOTE: unpickling can execute arbitrary code; only load a trusted file here.
with open(parsed_pkl_path, mode='rb') as pkl_file:
    parsed_rows = pickle.load(pkl_file)

In [None]:
# Restrict to rows whose raw archive path lives under the "Phillip/" prefix.
phillip_rows = list(filter(
    lambda parsed: parsed['raw'].startswith('Phillip/'),
    parsed_rows))
len(phillip_rows)

In [None]:
reasons = [check_replay(row, winner_only=False) for row in phillip_rows]

In [None]:
by_reason = collections.defaultdict(list)
for row, reason in zip(phillip_rows, reasons):
    by_reason[reason].append(row)

In [None]:
for reason, rows in by_reason.items():
    print(reason, len(rows))

In [None]:
ok_reasons = [None, 'unknown player vs phillip']
for reason in ok_reasons:
    assert reason in by_reason

In [None]:
ok_rows = []
for reason in ok_reasons:
    ok_rows.extend(by_reason[reason])

In [None]:
PHILLIP_CODE = 'PHAI#591'

def get_name(row):
    """Return the name of the first player in `row` that is not Phillip.

    Names come from `nametags.name_from_metadata`; the first one that differs
    from `PHILLIP_CODE` is returned.

    Raises:
        AssertionError: if no player in the row has a name other than
            `PHILLIP_CODE` (including when the row has no players).
    """
    for player in row['players']:
        name = nametags.name_from_metadata(player)
        if name != PHILLIP_CODE:
            return name
    # Raise explicitly instead of `assert False`: asserts are stripped under
    # `python -O`, which would make this function silently return None.
    raise AssertionError('No non-Phillip player found in replay row')

In [None]:
# Count how many accepted replays each opponent name appears in.
names = collections.Counter(get_name(row) for row in ok_rows)
len(names)

In [None]:
# Only let warnings and above through the Slippi API client's logger.
slippi_api_lib.logger.setLevel(logging.WARN)

In [None]:
# Client used for all ranked-profile lookups below.
slippi_api = slippi_api_lib.SlippiRankedAPI()

In [None]:
# Swap in a new rate limiter by overwriting the client's private `_limiter`.
# NOTE(review): presumably this is what `get_player_data_throttled` uses — confirm.
slippi_api._limiter = ratelimiter.RateLimiter(1)

In [None]:
def get_seasons(code):
    """Fetch all ranked season profiles for a connect code.

    Returns the historical profiles followed by the current one, or None
    when the API yields no data or no user for `code`.
    """
    response = slippi_api.get_player_data_throttled(code)
    user = None if response is None else response['data']['getUser']
    if user is None:
        return None

    # History first, current season last.
    return user['rankedNetplayProfileHistory'] + [user['rankedNetplayProfile']]

def get_max_rating(code) -> float | None:
    """Return the highest `ratingOrdinal` over all of a player's seasons.

    Returns None when the season lookup fails or yields no seasons.
    """
    seasons = get_seasons(code)
    # Covers both a failed lookup (None) and an empty season list.
    if not seasons:
        return None
    return max([season['ratingOrdinal'] for season in seasons])

def get_data(code) -> dict:
    """Build the record stored for a connect code.

    The record holds the code, its maximum rating (possibly None) and the
    current date formatted as YYYY-MM-DD.
    """
    record = {'code': code, 'max_rating': get_max_rating(code)}
    # Stamp the date after the lookup has finished.
    record['date'] = datetime.datetime.now().strftime("%Y-%m-%d")
    # TODO: also record the number of games played and the display name.
    return record

In [None]:
DB_PATH = '/linusr/vlad/SSBM/ranks.json'

def update_codes(codes: list[str], db_path=DB_PATH):
    """Look up and cache rating data for any codes not yet in the TinyDB file.

    Args:
        codes: connect codes that should be present in the database.
        db_path: path of the TinyDB JSON file used as the cache.

    Returns:
        Dict mapping code -> stored record for every record in the database,
        including the records inserted by this call.
    """
    # The context manager closes the database even if a lookup raises.
    with TinyDB(db_path) as db:
        code_to_row = {row['code']: row for row in db.all()}

        for code in tqdm.notebook.tqdm(codes):
            if code in code_to_row:
                continue

            data = get_data(code)
            db.insert(data)
            # Track the new record as well: a snapshot taken before the loop
            # would omit every code fetched during this call (and would be
            # empty on the very first run).
            code_to_row[code] = data

    return code_to_row

In [None]:
code_to_data = update_codes(names)

In [None]:
codes_and_elos = [
    (code, data['max_rating'])
    for code, data in code_to_data.items()
    if data['max_rating'] is not None
]

In [None]:
codes_and_elos.sort(key=lambda x: x[1], reverse=True)

In [None]:
len(codes_and_elos)

In [None]:
def code_from_player(player: dict) -> str:
    """Return the player's connect code with a full-width '＃' replaced by '#'.

    Returns None when the player has no 'netplay' entry (key missing or None).
    """
    netplay = player.get('netplay')
    return None if netplay is None else netplay['code'].replace('＃', '#')

all_names = collections.Counter()

for row in parsed_rows:
    if not row['valid'] or not row['is_training']:
        continue

    for player in row['players']:
        code = code_from_player(player)
        name = nametags.normalize_name(code)
        if code:
            all_names[name] += 1

len(all_names)

In [None]:
all_names.most_common(10)

In [None]:
# Take into account known multiple accounts

# Several codes may normalize to the same name; keep the best rating per name.
normalized_to_elo = {}
for code, elo in codes_and_elos:
    name = nametags.normalize_name(code)
    if normalized_to_elo.get(name) is None:
        normalized_to_elo[name] = elo
        continue
    normalized_to_elo[name] = max(elo, normalized_to_elo[name])
    print(f'Found duplicate code {code}')

# Every code inherits the rating stored for its normalized name.
code_to_elo = {
    code: normalized_to_elo[nametags.normalize_name(code)]
    for code, _ in codes_and_elos
}

In [None]:
_rows = []

for name, count in all_names.items():
    _rows.append(dict(
        name=name,
        elo=normalized_to_elo.get(name),
        count=count,
    ))

elo_df = pd.DataFrame(_rows)

In [None]:
df = elo_df[~elo_df['elo'].isna()].sort_values('count', ascending=False)
df = df[df['elo'] > 2000].sort_values('count', ascending=False)
df

In [None]:
def code_from_player(player: dict) -> str:
    if 'netplay' not in player:
        return None
    return player['netplay']['code'].replace('＃', '#')

def get_phil_and_opponent(players):
    """Split a replay's players into (phillip, opponent).

    Args:
        players: iterable of player dicts (a row's 'players' entry).

    Returns:
        Tuple `(phil, opponent)`. `phil` is the player whose code equals
        `PHILLIP_CODE`; `opponent` is the last player whose code does not.
        Either element is None if no such player was found.
    """
    phil = None
    opponent = None

    # Iterate the argument rather than the notebook-global `row`: the old
    # version only worked because callers happened to loop with a variable
    # named `row`.
    for player in players:
        if code_from_player(player) == PHILLIP_CODE:
            phil = player
        else:
            opponent = player

    return phil, opponent

def get_model(filepath: str) -> str:
    """Extract the model name from a replay path.

    The model is the first '/'-separated component, unless that component is
    'Twitchbot', in which case the second component is used.
    """
    parts = filepath.split('/')
    model_index = 1 if parts[0] == 'Twitchbot' else 0
    return parts[model_index]

In [None]:
# Build one record per accepted replay describing Phillip's result.
match_results = []

for row in ok_rows:
    replay_name = row['name']
    model_name = get_model(replay_name)

    phil, opponent = get_phil_and_opponent(row['players'])
    if phil is None or opponent is None:
        # Both sides must be identified to attribute the game.
        continue

    winner_port = row['winner']
    # Assume opponent quit because they were losing when no winner is recorded.
    phillip_won = winner_port == phil['port'] if winner_port is not None else True

    opponent_code = code_from_player(opponent)

    match_results.append({
        'file': replay_name,
        'model': model_name,
        'phillip_won': phillip_won,
        'opponent_name': nametags.normalize_name(opponent_code),
        'opponent_rating': code_to_elo.get(opponent_code),
        'phillip_char': phil['character'],
        'opponent_char': opponent['character'],
        'startAt': row['startAt'],
    })

num_dropped = len(ok_rows) - len(match_results)
print(f'Processed {len(match_results)} matches, dropped {num_dropped}.')

In [None]:
# Each entry is (reason reported for dropped games, predicate a game must
# satisfy to be kept).
filters = [
    ('no opponent rating', lambda row: row['opponent_rating'] is not None),
    # ('no game winner', lambda row: 'phillip_won' in row),
]

ok_results = match_results

for reason, filt in filters:
    prev_num = len(ok_results)
    ok_results = list(filter(filt, ok_results))
    num_dropped = prev_num - len(ok_results)
    # Guard the percentage: with no games left to filter, the division would
    # raise ZeroDivisionError.
    percent = num_dropped / prev_num * 100 if prev_num else 0.0
    print(f'Dropped {num_dropped} ({percent:.1f}%) due to {reason}')

print(f'{len(ok_results)} games remain')

In [None]:
from openskill import models

In [None]:
# Constants of the linear map between a (mu, sigma) rating and a Slippi ordinal.
ORDINAL_SCALING = 25.0
ORDINAL_OFFSET = 1100.0
# Default sigma assumed when turning a Slippi ordinal back into a rating.
SLIPPI_SIGMA = 3

def rating_to_slippi(rating):
    """Convert a rating object exposing `.mu` and `.sigma` to a Slippi ordinal."""
    conservative_skill = rating.mu - 3*rating.sigma
    return ORDINAL_SCALING * conservative_skill + ORDINAL_OFFSET

def rating_from_slippi(ordinal: float, sigma=SLIPPI_SIGMA):
    """Invert `rating_to_slippi`: return `[mu, sigma]` for an ordinal and a sigma."""
    scaled = (ordinal - ORDINAL_OFFSET) / ORDINAL_SCALING
    return [scaled + 3 * sigma, sigma]

In [None]:
# Rating model used below to rate each agent from its match results.
model = models.BradleyTerryFull()

In [None]:
# Peek at one record to check its structure.
match_results[0]

In [None]:
# Ratings of human opponents (seeded from Slippi, never updated) and of
# Phillip agents (updated after every game).
opponents = {}
agents = {}

for result in tqdm.notebook.tqdm(ok_results):
    name = result['opponent_name']
    agent = result['model']

    # Seed the opponent from their Slippi rating the first time they appear.
    if name not in opponents:
        opponents[name] = model.create_rating(
            rating_from_slippi(result['opponent_rating']), name=name)

    # Agents start from the model's default rating.
    if agent not in agents:
        agents[agent] = model.rating(name=agent)

    # The agent is the first team; rank 0 goes to whoever won.
    ranks = [0, 1] if result['phillip_won'] else [1, 0]
    rated_teams = model.rate([[agents[agent]], [opponents[name]]], ranks=ranks)

    # Only the agent's new rating is kept; the opponent's update is discarded.
    [agents[agent]], _ = rated_teams

In [None]:
# Express each agent's final rating on the Slippi ordinal scale.
agent_ordinals = {
    agent: rating_to_slippi(agent_rating)
    for agent, agent_rating in agents.items()
}

In [None]:
# Table of agents with their Slippi-scale rating and number of rated games.
agent_df = pd.DataFrame(
    {'ordinal': list(agent_ordinals.values())},
    index=list(agent_ordinals.keys()))
# Count games per agent straight from `ok_results`: `match_df` is only created
# in a later cell, so referencing it here raises NameError on a fresh
# Restart & Run All.
games_per_agent = collections.Counter(result['model'] for result in ok_results)
agent_df['num_games'] = pd.Series(games_per_agent, dtype='int64')
agent_df = agent_df.sort_values('ordinal', ascending=False)

In [None]:
# Show only the agents whose name starts with 'auto'.
# (The original expression was missing its closing bracket and did not parse.)
agent_df[agent_df.index.str.startswith('auto')]

In [None]:
match_df = pd.DataFrame(ok_results)

In [None]:
win_rates = match_df.groupby('opponent_name')['phillip_won'].mean()
win_rates = win_rates.reset_index(name='phillip winrate').set_index('opponent_name')
win_rates['count'] = match_df['opponent_name'].value_counts()
win_rates['opponent_elo'] = win_rates.index.map(normalized_to_elo.get)
win_rates.sort_values('opponent_elo', ascending=False, inplace=True)
win_rates

In [None]:
# NOTE: assigns the same 'count' column that the cell building `win_rates`
# already assigned; kept for interactive re-display.
win_rates['count'] = match_df['opponent_name'].value_counts()
win_rates

In [None]:
# Number of games per opponent.
match_df['opponent_name'].value_counts()

In [None]:
win_rates

In [None]:
def get_win_rates(model: str | None) -> pd.DataFrame:
    """Per-opponent Phillip win rate, optionally restricted to one model.

    Args:
        model: model name to filter `match_df` on; a falsy value uses all games.

    Returns:
        DataFrame with columns 'opponent_name', 'phillip winrate' and
        'opponent_elo', sorted by 'opponent_elo' in descending order.
    """
    games = match_df[match_df['model'] == model] if model else match_df
    rates = (
        games.groupby('opponent_name')['phillip_won']
        .mean()
        .reset_index(name='phillip winrate')
    )
    rates['opponent_elo'] = rates['opponent_name'].map(normalized_to_elo.get)
    return rates.sort_values('opponent_elo', ascending=False)

In [None]:
# Win rates of the 'auto-fox' model only.
get_win_rates('auto-fox')

In [None]:
# Number of games against the opponent named 'Trif'.
(match_df['opponent_name'] == 'Trif').sum()

In [None]:
# Which models played against 'Cody', and how often.
cody_df = match_df[match_df['opponent_name'] == 'Cody']
cody_df['model'].value_counts()

In [None]:
# Games per model overall.
match_df['model'].value_counts()

In [None]:
# Figure out which replays to toss and which to keep

In [None]:
unknown_rows = by_reason['unknown player vs phillip']
len(unknown_rows)

In [None]:
unknown_rows[0]['name']

In [None]:
for row in unknown_rows:
    row['opponent_code'] = get_name(row)

In [None]:
unknown_df = pd.DataFrame(unknown_rows)
unknown_df['rating'] = unknown_df['opponent_code'].map(code_to_elo.get)
unknown_df['model'] = unknown_df['name'].map(get_model)

In [None]:
with_rating = unknown_df[~unknown_df['rating'].isnull()]

In [None]:
ratings = with_rating['rating']
ratings.mean()

In [None]:
np.percentile(ratings.to_numpy(), [50, 75, 90, 95, 99])

In [None]:
@functools.cache
def is_weak_model(model: str):
    """Return True for models considered weak.

    A model is weak when its name starts with 'basic' or contains 'imitation'.
    Results are memoized per model name.
    """
    if model.startswith('basic'):
        return True
    return 'imitation' in model

unknown_df['is_weak_model'] = unknown_df['model'].map(is_weak_model)

In [None]:
unknown_df['is_weak_model'].mean()

In [None]:
phillip_df = pd.DataFrame(phillip_rows)
phillip_df['model'] = phillip_df['name'].map(get_model)
phillip_df['is_weak_model'] = phillip_df['model'].map(is_weak_model)

In [None]:
phillip_df['is_weak_model'].mean()

In [None]:
assert (~phillip_df['is_weak_model'] & (phillip_df['reason'] == 'vs weak phillip')).sum() == 0

In [None]:
to_remove = collections.defaultdict(set)

In [None]:
weak = phillip_df[phillip_df['is_weak_model']].groupby('raw')

for raw in weak.groups:
    to_remove[raw].update(weak['name'].get_group(raw))

In [None]:
MIN_RATING = 2000  # roughly diamond-2

weak_opponent = with_rating[with_rating['rating'] < MIN_RATING]
weak_opponent = weak_opponent.groupby('raw')
for raw in weak_opponent.groups:
    to_remove[raw].update(weak_opponent['name'].get_group(raw))

In [None]:
for raw, names in to_remove.items():
    print(raw, len(names))

In [None]:
raw_root = '/linusr/vlad/SSBM/Replays/Raw/'

In [None]:
%%time

# Resolve and validate every archive path up front, so a missing archive
# aborts the run before anything has been deleted. Checking inside the loop
# would leave earlier archives already modified when a later one is missing.
raw_paths = {raw: os.path.join(raw_root, raw) for raw in to_remove}
missing_paths = [path for path in raw_paths.values() if not os.path.exists(path)]
if missing_paths:
    raise AssertionError(f'Missing raw archives: {missing_paths}')

# DESTRUCTIVE: removes the selected replays from the raw archives.
# `replay_names` (not `names`) avoids clobbering the notebook-level `names` Counter.
for raw, replay_names in to_remove.items():
    delete_from_zip(raw_paths[raw], replay_names)