In [8]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset  # huggingface datasets
import pickle
from collections import defaultdict
# number of workers in .map() call
# good number to use is ~order number of cpu cores // 2
num_proc = 14
dtype = np.uint8  # Currently there are only 32 tokens in the chess LLMs vocab

In [35]:
dataset_path = "adamkarvonen/chess_games"
file_path = "lichess_200k_elo_bins.zip"
# file_path = "smaller_pgn_file_blocks.zip"

# Load the dataset
dataset = load_dataset(dataset_path, data_files=file_path)

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
dataset['train'][-1]

{'Unnamed: 0.1': None,
 'Unnamed: 0': 163357,
 'WhiteElo': 694,
 'BlackElo': 830,
 'Result': '1-0',
 'transcript': '1.d4 Nc6 2.c3 Nf6 3.f3 b6 4.e4 Ba6 5.Bxa6 e6 6.Bb5 Nxd4 7.cxd4 Bb4+ 8.Bd2 a5 9.Bxb4 axb4 10.Ne2 Qe7 11.e5 Nd5 12.a3 c6 13.Bc4 b3 14.Bxb3 Ne3 15.Qd3 Nxg2+ 16.Kf2 Qg5 17.Rg1 Ne3 18.Rxg5 b5 19.Kxe3 f6 20.a4 fxg5 21.axb5 c5 22.dxc5 O-O 23.Qe4 Rfc8 24.Rxa8 Rxa8 25.Qxa8+ Kf7 26.b6 Kg6 27.b7 Kh5 28.b8=Q Kh4 29.Qd6 Kh3 30.Qh8 Kxh2 31.Qxh7+ Kg2 32.Qxe6 Kf1 33.Qeh3+ Ke1 34.Qc2 g4 35.Qxg4 g6 36.Qg3+ Kf1 37.Qf2#',
 'elo_bin': '[600, 700)'}

In [6]:
elo_bins = dataset['train']['elo_bin']
bins = set(elo_bins)
bins

{'[1000, 1100)',
 '[1100, 1200)',
 '[1200, 1300)',
 '[1300, 1400)',
 '[1400, 1500)',
 '[1500, 1600)',
 '[1600, 1700)',
 '[1700, 1800)',
 '[1800, 1900)',
 '[1900, 2000)',
 '[2000, 2100)',
 '[2100, 2200)',
 '[2200, 2300)',
 '[2300, 2400)',
 '[2400, 2500)',
 '[2500, 2600)',
 '[2600, 2700)',
 '[2700, 2800)',
 '[2800, 2900)',
 '[2900, 3000)',
 '[3000, 3100)',
 '[3100, 3200)',
 '[600, 700)',
 '[700, 800)',
 '[800, 900)',
 '[900, 1000)'}

In [12]:

elo_bin_idx = defaultdict(list)
for idx, elo in enumerate(elo_bins):
    elo_bin_idx[elo].append(idx)

In [10]:
training_ranges = [[600, 1100], [1100, 1500], [1500, 1900]]

In [13]:
training_range_idx = defaultdict(list)
for tr in training_ranges:
    for b in bins:
        bin_range = eval(b.replace('[', '('))
        if bin_range[0] >= tr[0] and bin_range[0] < tr[1]:
            training_range_idx[str(tr)] += (elo_bin_idx[b])

for t in training_range_idx:
    print(t, 'has number of games: ', len(training_range_idx[t]))

[600, 1100] has number of games:  632275
[1100, 1500] has number of games:  1600000
[1500, 1900] has number of games:  1600000


In [37]:
datasets = [dataset['train'].select(training_range_idx[t]) for t in training_range_idx]

In [42]:
for idx, t in enumerate(training_range_idx):
    dataset = datasets[idx]
    # by default only contains the 'train' split, so create a test split
    split_dataset = dataset.train_test_split(
        test_size=0.01, seed=2357, shuffle=True
    )
    split_dataset["val"] = split_dataset.pop("test")  # rename the test split to val

    # this results in:
    # >>> split_dataset
    # DatasetDict({
    #     train: Dataset({
    #         features: ['text'],
    #         num_rows: 8009762
    #     })
    #     val: Dataset({
    #         features: ['text'],
    #         num_rows: 4007
    #     })
    # })

    # we now want to tokenize the dataset. Using meta.pkl in the same directory as this file
    meta_path = os.path.join(os.path.dirname(__file__), "meta.pkl")
    # meta_path = '/home/ezipe/git/chess_transformer_mothership/chess-nanoGPT/data/lichess_hf_dataset/meta.pkl'
    with open(meta_path, "rb") as f:
        meta = pickle.load(f)

    stoi = meta["stoi"]
    itos = meta["itos"]

    # to read the bin files later, e.g. with numpy:
    # m = np.memmap('train.bin', dtype=np.uint8, mode='r')
    # print(split_dataset["val"][0])
    # print(len(split_dataset["val"]["transcript"][0]))

    # For verifying that all games are 1024 tokens long
    # for game in split_dataset["train"]["transcript"]:
    #     if len(game) != 1024:
    #         print(len(game))
    #         print(game)
    #         break
    # print(stoi)

    column_name = "transcript"

    def process(example):
        ids = np.array([stoi[c] for c in example[column_name]], dtype=dtype)
        out = {"ids": ids, "len": len(ids)}
        return out

    # tokenize the dataset
    tokenized = split_dataset.map(
        process,
        remove_columns=[column_name],
        desc="tokenizing the splits",
        num_proc=num_proc,
    )

    # print(tokenized["val"]["ids"])

    # concatenate all the ids in each dataset into one large file we can use for training
    for split, dset in tokenized.items():
        arr_len = np.sum(dset["len"], dtype=np.uint64)
        print(f"{split} has {arr_len} tokens")
        # dname = '/home/ezipe/git/chess_transformer_mothership/chess-nanoGPT/data/lichess_hf_dataset/'
        dname = os.path.dirname(__file__)
        filename = os.path.join(dname, f"{split}_{t[0]}_{t[1]}.bin")
        
        arr = np.memmap(filename, dtype=dtype, mode="w+", shape=(arr_len,))
        print(arr.shape)
        total_batches = 1024

        idx = 0
        for batch_idx in tqdm(range(total_batches), desc=f"writing {filename}"):
            # Batch together samples for faster write
            batch = dset.shard(
                num_shards=total_batches, index=batch_idx, contiguous=True
            ).with_format("numpy")
            # print(batch[0])
            arr_batch = np.concatenate(batch["ids"])
            # print(arr_batch)
            # print(arr_batch.shape)
            # Write into mmap
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        arr.flush()


train has 210176343 tokens
(210176343,)


writing /home/ezipe/git/chess_transformer_mothership/chess-nanoGPT/data/lichess_hf_dataset/train_[600, 1100].bin: 100%|██████████| 1024/1024 [00:04<00:00, 210.07it/s]


val has 2124032 tokens
(2124032,)


writing /home/ezipe/git/chess_transformer_mothership/chess-nanoGPT/data/lichess_hf_dataset/val_[600, 1100].bin: 100%|██████████| 1024/1024 [00:01<00:00, 660.63it/s]
tokenizing the splits (num_proc=14): 100%|██████████| 1584000/1584000 [00:20<00:00, 77333.35 examples/s] 
tokenizing the splits (num_proc=14): 100%|██████████| 16000/16000 [00:00<00:00, 21466.72 examples/s]


train has 578241897 tokens
(578241897,)


writing /home/ezipe/git/chess_transformer_mothership/chess-nanoGPT/data/lichess_hf_dataset/train_[1100, 1500].bin: 100%|██████████| 1024/1024 [00:10<00:00, 100.35it/s]


val has 5843321 tokens
(5843321,)


writing /home/ezipe/git/chess_transformer_mothership/chess-nanoGPT/data/lichess_hf_dataset/val_[1100, 1500].bin: 100%|██████████| 1024/1024 [00:01<00:00, 635.63it/s]
tokenizing the splits (num_proc=14): 100%|██████████| 1584000/1584000 [00:20<00:00, 78212.63 examples/s] 
tokenizing the splits (num_proc=14): 100%|██████████| 16000/16000 [00:00<00:00, 21273.30 examples/s]


train has 628669971 tokens
(628669971,)


writing /home/ezipe/git/chess_transformer_mothership/chess-nanoGPT/data/lichess_hf_dataset/train_[1500, 1900].bin: 100%|██████████| 1024/1024 [00:10<00:00, 98.24it/s]


val has 6342299 tokens
(6342299,)


writing /home/ezipe/git/chess_transformer_mothership/chess-nanoGPT/data/lichess_hf_dataset/val_[1500, 1900].bin: 100%|██████████| 1024/1024 [00:01<00:00, 621.45it/s]
