In [None]:
import collections
import pickle
import pandas as pd
import os
import peppi_py
import tree
import json
import io
import math
import tqdm.notebook
import functools
import zipfile

from slippi_db import utils, preprocessing, parse_peppi

In [None]:
# Turn on itables' interactive rendering for DataFrames displayed in this notebook.
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

In [None]:
# Base directory used throughout the notebook: parsed.pkl is read from it,
# source archives are looked up under its Raw/ subdirectory, and the
# generated TestDataset-*.zip is written into it.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
root = '/linusr/vlad/SSBM/Replays'

In [None]:
# Location of the pickled metadata file under `root`.
parsed_path = os.path.join(root, 'parsed.pkl')

# Read the whole file into memory first; decoding happens in the next cell.
with open(parsed_path, 'rb') as f:
    data_bytes = f.read()
# Display the size of the pickle in bytes.
len(data_bytes)

In [None]:
%%time
# NOTE(review): pickle.loads can execute arbitrary code — only use with a trusted parsed.pkl.
data = pickle.loads(data_bytes)
# Release the raw bytes now that they are decoded. Re-running this cell
# requires re-running the cell that reads `data_bytes` first.
del data_bytes
len(data)

In [None]:
# Tabular view of `data`; later cells read its 'raw', 'name', 'valid' and
# 'is_training' columns.
df = pd.DataFrame(data)

In [None]:
@functools.cache
def compact_raw(raw: str) -> str:
    """Shorten a `raw` path to a compact label.

    Args:
        raw: The `raw` value of a row, a '/'-separated path string.

    Returns:
        For 'Players/<x>/...' the second path component `<x>`; for anything
        under 'Phillip/' the literal 'Phillip'; otherwise `raw` unchanged.
    """
    head, sep, tail = raw.partition('/')
    if not sep:
        # No '/' at all, so neither known prefix can match.
        return raw
    if head == 'Players':
        # Keep only the component that follows 'Players/'.
        return tail.partition('/')[0]
    return 'Phillip' if head == 'Phillip' else raw

# Derive the 'compact_raw' column by applying compact_raw to each row's 'raw' value.
df['compact_raw'] = df['raw'].map(compact_raw)

In [None]:
@functools.cache
def raw_exists(raw: str) -> bool:
    """Tell whether `raw` names a '.zip' that is present under `<root>/Raw`.

    Args:
        raw: The `raw` value of a row, interpreted relative to `<root>/Raw`.

    Returns:
        False for any name that does not end in '.zip'; otherwise whether the
        path exists on disk. Results are memoized per `raw`, so changes on
        disk after the first call are not observed.
    """
    # Short-circuit: the filesystem is only consulted for '.zip' names.
    return raw.endswith('.zip') and os.path.exists(os.path.join(root, 'Raw', raw))

# Flag each row with whether its 'raw' archive is available on disk (see raw_exists).
df['raw_exists'] = df['raw'].map(raw_exists)

In [None]:
def get_files_in_zip(path: str) -> set[str]:
    """Return the names of all non-directory members of a zip archive.

    Args:
        path: Filesystem path of the zip archive to inspect.

    Returns:
        A set of member names exactly as stored in the archive, with
        directory entries excluded. A set (not a list) is returned so that
        membership checks against it are cheap.

    Raises:
        zipfile.BadZipFile: If `path` is not a valid zip archive.
        OSError: If `path` cannot be opened.
    """
    with zipfile.ZipFile(path, mode='r') as zf:
        return {info.filename for info in zf.infolist() if not info.is_dir()}

In [None]:
# Index the member names of every source archive that is actually on disk.
raw_to_files = {}
for raw in tqdm.notebook.tqdm(set(df['raw'])):
    # Only '.zip' raws that exist under Raw/ can be opened and listed.
    if raw.endswith('.zip') and raw_exists(raw):
        raw_to_files[raw] = get_files_in_zip(os.path.join(root, 'Raw', raw))

In [None]:
# Collect, per archive, the row names that are not members of that archive.
missing_files = {}

for raw, name in zip(df['raw'], df['name']):
    members = raw_to_files.get(raw)
    # Rows whose archive was not indexed cannot be checked and are skipped.
    if members is not None and name not in members:
        missing_files.setdefault(raw, []).append(name)

# Every row with an indexed archive must be found inside that archive.
assert not missing_files

In [None]:
# Reverse index: member name -> an archive containing it.
# The same file can appear in two different dumps (one per player); the
# archive visited last in raw_to_files wins.
file_to_raw = {
    f: raw
    for raw, files in raw_to_files.items()
    for f in files
}

In [None]:
# Repair rows whose 'raw' points at an archive that does not contain the file.
# The replacement may be slightly off for files present in several raws,
# since those copies can differ (different rollback frames).
# NOTE: this mutates `data` only; `df` was built earlier and is not rebuilt here.

for row in data:
    members = raw_to_files.get(row['raw'])
    # Skip rows whose archive was never indexed, or that are already consistent.
    if members is None or row['name'] in members:
        continue
    # Re-point the row at an archive known to contain the file.
    row['raw'] = file_to_raw[row['name']]

In [None]:
# %%time

# with open(os.path.join(root, 'parsed.pkl'), 'wb') as f:
#     pickle.dump(data, f)

In [None]:
# Narrow to rows flagged valid, then to those also flagged for training.
valid = df.loc[df['valid']]
training = valid.loc[valid['is_training']]
len(training)

In [None]:
# Keep only training rows whose source archive is present on disk.
has_raw = training['raw_exists']
training_with_raw = training.loc[has_raw]
len(training_with_raw)

In [None]:
# Take at most the last 50 rows from each source archive.
rows_by_raw = training_with_raw.groupby('raw')
for_test_dataset = rows_by_raw.tail(50).reset_index()

# The row count is reused later in the output archive's filename.
size = len(for_test_dataset)
size

In [None]:
# Map each source archive to the list of member names selected from it.
files_to_copy = {}
for raw, filename in zip(for_test_dataset['raw'], for_test_dataset['name']):
    if raw not in files_to_copy:
        files_to_copy[raw] = []
    files_to_copy[raw].append(filename)

In [None]:
# The output archive's name records how many rows were selected.
dest_zip = os.path.join(root, f'TestDataset-{size}.zip')

# Delete an existing file at that path first.
# NOTE(review): presumably so copy_multi_zip_files starts from a fresh file — confirm.
if os.path.exists(dest_zip):
    os.remove(dest_zip)

# Pair each source archive's full path with the member names to take from it.
sources_and_files = []
for raw, filenames in files_to_copy.items():
    sources_and_files.append((os.path.join(root, 'Raw', raw), filenames))

utils.copy_multi_zip_files(sources_and_files, dest_zip)

In [None]:
# Distinct values of the 'raw' column.
all_raw = set(df['raw'])

In [None]:
# Group every 'Players/<player>/...' raw under its <player> path component.
player_raws = {}
for raw in all_raw:
    if raw.startswith('Players/'):
        player_raws.setdefault(raw.split('/')[1], []).append(raw)


# Print the player names as one sorted, comma-separated line.
print(','.join(sorted(player_raws)))