In [1]:
!pip install chess

Defaulting to user installation because normal site-packages is not writeable
Collecting chess
  Downloading chess-1.10.0-py3-none-any.whl.metadata (19 kB)
Downloading chess-1.10.0-py3-none-any.whl (154 kB)
   ---------------------------------------- 0.0/154.4 kB ? eta -:--:--
   ----------------------- ---------------- 92.2/154.4 kB 5.5 MB/s eta 0:00:01
   ---------------------------------------- 154.4/154.4 kB 3.1 MB/s eta 0:00:00
Installing collected packages: chess
Successfully installed chess-1.10.0


In [6]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.8 kB ? eta -:--:--
     ----------------------------------- -- 41.0/43.8 kB 991.0 kB/s eta 0:00:01
     -------------------------------------- 43.8/43.8 kB 714.4 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp310-none-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
   ---------------------------------------- 0.0/9.1 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.1 MB 9.9 MB/s eta 0:00:01
   --- ---



In [17]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-win_amd64.whl.metadata (3.1 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
   ---------------------------------------- 0.0/547.8 kB ? eta -:--:--
   ----------------------------- ---------- 409.6/547.8 kB 8.5 MB/s eta 0:00:01
   ---------------------------------------- 547.8/547.8 kB 8.5 MB/s eta 0:00:00
Downloading fsspec-2024.5.0-py3-none-any.whl (316 kB)
   ----------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.1 requires mkl<=2021.4.0,>=2021.1.1, which is not installed.
s3fs 2024.6.0 requires fsspec==2024.6.0.*, but you have fsspec 2024.5.0 which is incompatible.


In [29]:
!pip install transformers[torch]

Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch->transformers[torch])
  Downloading mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch->transformers[torch])
  Downloading intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch->transformers[torch])
  Downloading tbb-2021.13.0-py3-none-win_amd64.whl.metadata (1.1 kB)
Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
   ---------------------------------------- 0.0/309.4 kB ? eta -:--:--
   ---------------------- ----------------- 174.1/309.4 kB 5.3 MB/s eta 0:00:01
   ---------------------------------------- 309.4/309.4 kB 4.8 MB/s eta 0:00:00
Downloading mkl-2021.4.0-py2.py3-none-win_amd



In [1]:
## is CUDA and GPU available?
import torch

print("CUDA available: ", torch.cuda.is_available())
print("cuDNN version: ", torch.backends.cudnn.version())
print("Device name: ", torch.cuda.get_device_name(0))

device = torch.device("cuda")
print(device.type)

CUDA available:  True
cuDNN version:  8907
Device name:  NVIDIA GeForce RTX 4060 Ti
cuda


## Data Cleaning + Tokenization

In [2]:
import chess.pgn
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
import os
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
# Set CUDA_LAUNCH_BLOCKING to 1 for debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x1d6f3739c60>

In [4]:
# Set random seeds for reproducibility
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [9]:
def pgn_to_text(pgn_file_path, output_file_path):
    with open(pgn_file_path, 'r') as pgn_file:
        pgn_data = pgn_file.read()

    games = pgn_data.split('\n\n\n')

    with open(output_file_path, 'w') as output_file:
        for game in games:
            game_text = game.replace('\n', ' ')
            output_file.write(game_text + '\n')

In [10]:
pgn_to_text('./games.pgn', './games.txt')

In [5]:
#PreProcess dataset
def preprocess_chess_data(file_path):
    with open(file_path, 'r') as f:
        data = f.read()
    
    # Split games
    games = data.split('\n')
    
    # Process each game
    processed_games = []
    for game in games:
        game = game.replace('\n', ' ').strip()  # Remove extra spaces and newlines
        processed_games.append(game)
    
    return processed_games

In [7]:
processed_games = preprocess_chess_data("./games.txt")
dataset = Dataset.from_dict({"text": processed_games}) #TODO: fix this to all data

In [7]:
model_name = 'EleutherAI/gpt-neo-1.3B'  # or 'EleutherAI/gpt-neo-2.7B'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Add a pad token to the tokenizer
print(tokenizer.add_special_tokens({'pad_token': '[PAD]'}))
model = GPTNeoForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

1


Embedding(50258, 2048)

In [8]:
# Print tokenizer vocabulary size
print(f"Tokenizer vocabulary size: {len(tokenizer)}")
print(f"Model embedding size: {model.get_input_embeddings().weight.size(0)}")


Tokenizer vocabulary size: 50258
Model embedding size: 50258


In [9]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True) #examples["text"]

In [10]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [11]:
# Print a sample of the tokenized data for debugging
print(tokenized_datasets[0])

{'input_ids': [16, 13, 288, 19, 308, 21, 362, 13, 399, 69, 18, 347, 70, 22, 513, 13, 289, 18, 288, 20, 604, 13, 399, 66, 18, 269, 21, 642, 13, 347, 69, 19, 399, 69, 21, 718, 13, 304, 18, 347, 69, 20, 767, 13, 1195, 67, 17, 399, 17457, 22, 807, 13, 347, 67, 18, 347, 24954, 18, 860, 13, 1195, 24954, 18, 399, 65, 21, 838, 13, 275, 18, 440, 12, 46, 1367, 13, 308, 19, 304, 21, 1105, 13, 1355, 20, 399, 17457, 22, 1511, 13, 308, 20, 399, 27705, 20, 1478, 13, 288, 27705, 20, 399, 71, 20, 1315, 13, 289, 19, 1195, 64, 20, 1467, 13, 3873, 17, 275, 20, 1596, 13, 257, 18, 275, 19, 1248, 13, 7877, 65, 19, 1195, 30894, 19, 678, 13, 7567, 19, 1195, 66, 20, 1160, 13, 275, 19, 1195, 65, 21, 2310, 13, 7567, 21, 1195, 30894, 19, 2534, 13, 371, 65, 16, 399, 69, 19, 10, 2242, 13, 409, 69, 19, 1195, 26152, 19, 1987, 13, 7567, 19, 1195, 69, 20, 1679, 13, 1195, 26152, 20, 409, 69, 20, 2608, 13, 371, 65, 22, 257, 20, 2681, 13, 371, 65, 21, 269, 20, 2579, 13, 371, 65, 20, 288, 19, 2808, 13, 399, 67, 20, 371, 503

In [12]:
# Ensure all token lengths are consistent
for example in tokenized_datasets:
    input_ids_length = len(example['input_ids'])
    if input_ids_length != tokenizer.model_max_length:
        print(f"Length mismatch: {input_ids_length} != {tokenizer.model_max_length}")

In [13]:
# Check token types and inspect training data
for i, example in enumerate(tokenized_datasets):
    if i < 5:  # Print the first 5 samples for inspection
        print(f"Example {i}: {example}")
        print(f"Length of Example {i}: {len(example['input_ids'])}")

Example 0: {'input_ids': [16, 13, 288, 19, 308, 21, 362, 13, 399, 69, 18, 347, 70, 22, 513, 13, 289, 18, 288, 20, 604, 13, 399, 66, 18, 269, 21, 642, 13, 347, 69, 19, 399, 69, 21, 718, 13, 304, 18, 347, 69, 20, 767, 13, 1195, 67, 17, 399, 17457, 22, 807, 13, 347, 67, 18, 347, 24954, 18, 860, 13, 1195, 24954, 18, 399, 65, 21, 838, 13, 275, 18, 440, 12, 46, 1367, 13, 308, 19, 304, 21, 1105, 13, 1355, 20, 399, 17457, 22, 1511, 13, 308, 20, 399, 27705, 20, 1478, 13, 288, 27705, 20, 399, 71, 20, 1315, 13, 289, 19, 1195, 64, 20, 1467, 13, 3873, 17, 275, 20, 1596, 13, 257, 18, 275, 19, 1248, 13, 7877, 65, 19, 1195, 30894, 19, 678, 13, 7567, 19, 1195, 66, 20, 1160, 13, 275, 19, 1195, 65, 21, 2310, 13, 7567, 21, 1195, 30894, 19, 2534, 13, 371, 65, 16, 399, 69, 19, 10, 2242, 13, 409, 69, 19, 1195, 26152, 19, 1987, 13, 7567, 19, 1195, 69, 20, 1679, 13, 1195, 26152, 20, 409, 69, 20, 2608, 13, 371, 65, 22, 257, 20, 2681, 13, 371, 65, 21, 269, 20, 2579, 13, 371, 65, 20, 288, 19, 2808, 13, 399, 67, 2

In [14]:
# Inspect input IDs and attention masks
for i, example in enumerate(tokenized_datasets):
    if i < 5:  # Print the first 5 samples for inspection
        print(f"Input IDs {i}: {example['input_ids']}")
        print(f"Attention Masks {i}: {example['attention_mask']}")

Input IDs 0: [16, 13, 288, 19, 308, 21, 362, 13, 399, 69, 18, 347, 70, 22, 513, 13, 289, 18, 288, 20, 604, 13, 399, 66, 18, 269, 21, 642, 13, 347, 69, 19, 399, 69, 21, 718, 13, 304, 18, 347, 69, 20, 767, 13, 1195, 67, 17, 399, 17457, 22, 807, 13, 347, 67, 18, 347, 24954, 18, 860, 13, 1195, 24954, 18, 399, 65, 21, 838, 13, 275, 18, 440, 12, 46, 1367, 13, 308, 19, 304, 21, 1105, 13, 1355, 20, 399, 17457, 22, 1511, 13, 308, 20, 399, 27705, 20, 1478, 13, 288, 27705, 20, 399, 71, 20, 1315, 13, 289, 19, 1195, 64, 20, 1467, 13, 3873, 17, 275, 20, 1596, 13, 257, 18, 275, 19, 1248, 13, 7877, 65, 19, 1195, 30894, 19, 678, 13, 7567, 19, 1195, 66, 20, 1160, 13, 275, 19, 1195, 65, 21, 2310, 13, 7567, 21, 1195, 30894, 19, 2534, 13, 371, 65, 16, 399, 69, 19, 10, 2242, 13, 409, 69, 19, 1195, 26152, 19, 1987, 13, 7567, 19, 1195, 69, 20, 1679, 13, 1195, 26152, 20, 409, 69, 20, 2608, 13, 371, 65, 22, 257, 20, 2681, 13, 371, 65, 21, 269, 20, 2579, 13, 371, 65, 20, 288, 19, 2808, 13, 399, 67, 20, 371, 5036

## Time to Fine-Tune!

In [15]:
# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [16]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Reduce batch size
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=5e-5,
    eval_strategy="steps",
    logging_dir='./logs',
    logging_steps=500,
    fp16=False,  # Disable fp16 training
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

# Train the model
trainer.train()
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

  0%|          | 0/10 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 