In [1]:
import pandas as pd 
import torch 

import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, Subset
import torch.optim as optim
from torch.optim.swa_utils import AveragedModel
import numpy as np
import torch.nn as nn
import importlib
import pickle
import utils
import models
importlib.reload(utils)
from utils import *
importlib.reload(models)
from models import *

# Select the compute device: CUDA when torch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:
# Read the per-game CSV (one row per game) into a DataFrame and print it
# for a quick visual check of the columns and row count.
grouped_df = pd.read_csv("../data/haha_longer-jan.csv")
print(grouped_df)

         game_id                                              moves  \
0       00VRM44j  e2e4 g8f6 e4e5 f6g8 f1b5 f7f6 e5e6 c7c6 e6d7 c...   
1       0MG5pz3L  c2c4 g7g6 e2e4 f8g7 d2d3 d7d6 f1e2 g8f6 g1f3 e...   
2       0MV7dmua  e2e4 e7e5 f2f4 e5f4 g1f3 d7d5 e4d5 d8d5 b1c3 d...   
3       0UNsyqEb  d2d4 d7d5 c2c4 g8f6 b1c3 c8f5 c1g5 e7e6 e2e3 h...   
4       10mypbCS  d2d4 g7g6 c1h6 f8h6 g1f3 g8f6 e2e3 b8c6 f1b5 d...   
...          ...                                                ...   
164316  yckDW8pr  d2d4 d7d5 c1f4 c8f5 e2e3 e7e6 f1d3 f5g6 g1f3 g...   
164317  yjFqLLIM  d2d4 g8f6 c2c4 e7e6 g1f3 d7d5 e2e3 c7c6 b2b3 f...   
164318  yzgxpBPa  d2d4 d7d5 c2c4 e7e6 b1c3 g8f6 c1g5 f8b4 d1c2 b...   
164319  zHpwiwWK  e2e4 c7c5 g1f3 b8c6 f1b5 d7d6 e1g1 e7e5 c2c3 g...   
164320  zI8GQITc  e2e4 e7e5 g1f3 b8c6 f1b5 a7a6 b5a4 b7b5 a4b3 c...   

        white_elo  black_elo  white_active  \
0            1523       1500          True   
1            1538       1580          True   
2        

In [27]:
# Load the pickled object stored in the jan-march data folder into `vocab_old`
# (presumably the vocabulary written by an earlier run — confirm).
# NOTE(review): pickle.load can execute arbitrary code; only open pickle files you created yourself.
with open('./../data/jan-march/vocab.pkl', 'rb') as inp:
    vocab_old = pickle.load(inp)

In [9]:
# Build the FEN-only padded dataset from grouped_df. The helper is not defined in this
# notebook (presumably it comes from the star-imported utils module — confirm).
# NOTE(review): this rebinds trainX / trainY / vocab, which other cells assign too;
# run only the data pipeline you actually need.
trainX, trainY, seq_lengths, vocab = df_to_data_fen_only_padded(grouped_df, fixed_window=True, sampling_rate=0.25)

In [11]:
# Convert the labels and the sequence lengths to NumPy arrays so they expose
# .dtype / .shape (needed when they are written out as memmaps).
trainY = np.asarray(trainY)
seq_lengths = np.asarray(seq_lengths)

In [3]:
# Build the "black" dataset from grouped_df: move sequences, FEN strings, board inputs,
# labels and the vocabulary. The helper is not defined in this notebook
# (presumably from the star-imported utils module — confirm).
trainX_sequences, fens, trainX, trainY, vocab = df_to_data_black(grouped_df, fixed_window=True, sampling_rate=0.25)
# Pad the sequences; the call returns the padded sequences together with their lengths.
trainX_sequences, trainX_seqlengths  = pad_sequences(trainX_sequences)

In [4]:
# Convert every piece of the dataset to a NumPy array, one statement per array,
# so each has a .dtype / .shape that the memmap writer can use.
trainX = np.asarray(trainX)
trainX_sequences = np.asarray(trainX_sequences)
fens = np.asarray(fens)
trainX_seqlengths = np.asarray(trainX_seqlengths)
trainY = np.asarray(trainY)

This is the last step: saving our data to disk so that it's ready to load when we train.

In [5]:
def save_as_memmap(array, filename):
    """Write ``array`` to ``filename`` as a raw NumPy memmap and return the path.

    The file contains only the raw element bytes: neither the dtype nor the
    shape is stored, so the caller must remember both in order to reopen the
    file later with ``np.memmap``.

    Args:
        array: An ndarray or any array-like (e.g. a nested list). It is
            converted with ``np.asarray`` so that ``dtype`` and ``shape`` are
            always available.
        filename: Destination path. The file is created, or overwritten if it
            already exists.

    Returns:
        ``filename`` unchanged, for confirmation.
    """
    # Accept plain lists/tuples as well as ndarrays; for an ndarray this is a no-copy view.
    data = np.asarray(array)

    # mode='w+' creates (or overwrites) the file and allocates the disk space.
    memmap_array = np.memmap(filename, dtype=data.dtype, mode='w+', shape=data.shape)

    # Ellipsis assignment copies the whole array regardless of its number of dimensions.
    memmap_array[...] = data

    # Flush memory changes to disk to ensure all data is written.
    memmap_array.flush()

    return filename


In [6]:
# for black multi-modal
# Write the four arrays as raw memmaps (in the order listed), then the FEN
# strings as a CSV, then the pickled vocabulary.
filenames = [
    save_as_memmap(array, path)
    for array, path in (
        (trainX_sequences, './../data/black-data/jan/trainX_sequences.memmap'),
        (trainX, './../data/black-data/jan/trainX.memmap'),
        (trainY, './../data/black-data/jan/trainY.memmap'),
        (trainX_seqlengths, './../data/black-data/jan/trainX_seqlengths.memmap'),
    )
]

# FENs go to a CSV with a single 'fens' column instead of a memmap.
csv_filename = './../data/black-data/jan/fens.csv'
df = pd.DataFrame(fens, columns=['fens'])
df.to_csv(csv_filename, index=False)

# The vocabulary is pickled next to the arrays.
with open('./../data/black-data/jan/vocab.pkl', 'wb') as outp:
    pickle.dump(vocab, outp, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# for transformer data
# Write the labels and the per-sample sequence lengths as raw memmaps, then pickle
# the vocabulary next to them. trainX is not written here (its line is commented out).
filenames = [
    # save_as_memmap(trainX, './../data/transformer/trainX.memmap'),
    save_as_memmap(trainY, './../data/transformer/trainY.memmap'),
    # seq_lengths.memmap must hold seq_lengths, not a second copy of trainY.
    save_as_memmap(seq_lengths, './../data/transformer/seq_lengths.memmap')
]
with open('./../data/transformer/vocab.pkl', 'wb') as outp:
    pickle.dump(vocab, outp, pickle.HIGHEST_PROTOCOL)

Check the array shapes so that we can load the memmap files correctly later (the files do not store their shape).

In [7]:
# The memmap file does not store its shape: note this value down, it is the
# `shape` argument needed to reopen trainX.memmap later.
print(trainX.shape)

(1977299, 12, 8, 8)


In [8]:

# Same check for the labels; the first dimension should match trainX's.
print(trainY.shape)

(1977299,)


Now that we've finished processing the data, let's load it back in (assuming we're starting from a fresh kernel).

In [None]:
# If you forget the shape
def find_working_shape(filename, dtype, max_first_dim, other_dims):
    """Open ``filename`` as a read-only memmap with the largest first dimension that fits.

    A raw memmap file stores no shape information. Given the element ``dtype``
    and the trailing dimensions, the number of complete "rows" in the file is
    simply ``file_size // bytes_per_row``, so the first dimension is computed
    directly from the file size instead of being searched for.

    Args:
        filename: Path of the raw memmap file.
        dtype: Element dtype the file was written with.
        max_first_dim: Upper bound for the first dimension. If the file holds
            more rows than this, the result is capped at ``max_first_dim``.
        other_dims: Trailing dimensions of the array (assumed to be correct).

    Returns:
        A read-only ``np.memmap`` of shape ``(first_dim,) + other_dims``.

    Raises:
        ValueError: If not even one complete row fits in the file or within
            ``max_first_dim``.
    """
    import os  # local import: the notebook's import cell does not import os

    other_dims = tuple(other_dims)
    row_bytes = np.dtype(dtype).itemsize * int(np.prod(other_dims, dtype=np.int64))
    if row_bytes <= 0:
        raise ValueError("other_dims must describe at least one element per row.")

    # Largest number of complete rows stored in the file, capped by the caller's bound.
    rows_in_file = os.path.getsize(filename) // row_bytes
    first_dim = min(int(max_first_dim), rows_in_file)
    if first_dim < 1:
        raise ValueError("Could not find a working shape within the given bounds.")

    shape = (first_dim,) + other_dims
    memmap_array = np.memmap(filename, dtype=dtype, mode='r', shape=shape)
    print(f"Successful shape: {shape}")
    return memmap_array

# Recover the first dimension of the apr-jun trainX file by probing it on disk.
trainX_filename = './../data/apr-jun/trainX.memmap'
dtype_trainX = np.int64
other_dims = (12, 8, 8)  # trailing dimensions of the array, assumed to be correct
max_first_dim = 3038976  # starting point: upper bound for the first dimension

# Try to find a working memmap shape
dummy = find_working_shape(
    filename=trainX_filename,
    dtype=dtype_trainX,
    max_first_dim=max_first_dim,
    other_dims=other_dims,
)

for loading

In [86]:
# Function to load a memmap file
def load_memmap(filename, dtype, shape):
    """Open an existing raw binary file as a read-only ``np.memmap``.

    ``dtype`` and ``shape`` are passed straight to ``np.memmap``; the file
    itself stores neither, so they must match what was used when writing.
    """
    read_only_view = np.memmap(filename, mode='r', shape=shape, dtype=dtype)
    return read_only_view


for transformer

In [87]:
# Transformer inputs: reopen trainX.memmap read-only.
shape_trainX = (130265, 750)  # replace with the correct shape
dtype_trainX = np.int32  # or the correct dtype for your data
trainX = load_memmap(
    filename='./../data/transformer/trainX.memmap',
    dtype=dtype_trainX,
    shape=shape_trainX,
)

# Transformer labels: reopen trainY.memmap read-only.
shape_trainY = (130265, 7)  # replace with the correct shape
dtype_trainY = np.int32  # or the correct dtype for your data
trainY = load_memmap(
    filename='./../data/transformer/trainY.memmap',
    dtype=dtype_trainY,
    shape=shape_trainY,
)

for jan (black)

In [9]:
# Function to load a memmap file
def load_memmap(filename, dtype, shape):
    """Open an existing raw binary file as a read-only ``np.memmap``.

    ``dtype`` and ``shape`` are passed straight to ``np.memmap``; the file
    itself stores neither, so they must match what was used when writing.
    """
    read_only_view = np.memmap(filename, mode='r', shape=shape, dtype=dtype)
    return read_only_view

# January "black" dataset (files under ./../data/black-data/jan).
# Each array needs the dtype and shape it was written with.

# Move sequences
dtype_trainX_sequences = np.int64  # or the correct dtype for your data
shape_trainX_sequences = (1977299, 16)  # replace with the correct shape
trainX_sequences = load_memmap(
    filename='./../data/black-data/jan/trainX_sequences.memmap',
    dtype=dtype_trainX_sequences,
    shape=shape_trainX_sequences,
)

# Board inputs
dtype_trainX = np.int64  # or the correct dtype for your data
shape_trainX = (1977299, 12, 8, 8)  # replace with the correct shape
trainX = load_memmap(
    filename='./../data/black-data/jan/trainX.memmap',
    dtype=dtype_trainX,
    shape=shape_trainX,
)

# Labels
dtype_trainY = np.int64  # or the correct dtype for your data
shape_trainY = (1977299,)  # replace with the correct shape
trainY = load_memmap(
    filename='./../data/black-data/jan/trainY.memmap',
    dtype=dtype_trainY,
    shape=shape_trainY,
)

# Sequence lengths
dtype_trainX_seqlengths = np.int64  # or the correct dtype for your data
shape_trainX_seqlengths = (1977299,)  # replace with the correct shape
trainX_seqlengths = load_memmap(
    filename='./../data/black-data/jan/trainX_seqlengths.memmap',
    dtype=dtype_trainX_seqlengths,
    shape=shape_trainX_seqlengths,
)

for jan-march

In [None]:
# Function to load a memmap file
def load_memmap(filename, dtype, shape):
    """Open an existing raw binary file as a read-only ``np.memmap``.

    ``dtype`` and ``shape`` are passed straight to ``np.memmap``; the file
    itself stores neither, so they must match what was used when writing.
    """
    read_only_view = np.memmap(filename, mode='r', shape=shape, dtype=dtype)
    return read_only_view

# jan-march dataset (files under ./../data/jan-march).
# Each array needs the dtype and shape it was written with.

# Move sequences
dtype_trainX_sequences = np.int64  # or the correct dtype for your data
shape_trainX_sequences = (3038976, 16)  # replace with the correct shape
trainX_sequences = load_memmap(
    filename='./../data/jan-march/trainX_sequences.memmap',
    dtype=dtype_trainX_sequences,
    shape=shape_trainX_sequences,
)

# Board inputs
dtype_trainX = np.int64  # or the correct dtype for your data
shape_trainX = (3038976, 12, 8, 8)  # replace with the correct shape
trainX = load_memmap(
    filename='./../data/jan-march/trainX.memmap',
    dtype=dtype_trainX,
    shape=shape_trainX,
)

# Labels
dtype_trainY = np.int64  # or the correct dtype for your data
shape_trainY = (3038976,)  # replace with the correct shape
trainY = load_memmap(
    filename='./../data/jan-march/trainY.memmap',
    dtype=dtype_trainY,
    shape=shape_trainY,
)

# Sequence lengths
dtype_trainX_seqlengths = np.int64  # or the correct dtype for your data
shape_trainX_seqlengths = (3038976,)  # replace with the correct shape
trainX_seqlengths = load_memmap(
    filename='./../data/jan-march/trainX_seqlengths.memmap',
    dtype=dtype_trainX_seqlengths,
    shape=shape_trainX_seqlengths,
)

for april-june

In [51]:
# Function to load a memmap file
def load_memmap(filename, dtype, shape):
    """Open an existing raw binary file as a read-only ``np.memmap``.

    ``dtype`` and ``shape`` are passed straight to ``np.memmap``; the file
    itself stores neither, so they must match what was used when writing.
    """
    read_only_view = np.memmap(filename, mode='r', shape=shape, dtype=dtype)
    return read_only_view

# apr-jun dataset (files under ./../data/apr-jun).
# Each array needs the dtype and shape it was written with.

# Move sequences
dtype_trainX_sequences = np.int64  # or the correct dtype for your data
shape_trainX_sequences = (2780980, 16)  # replace with the correct shape
trainX_sequences = load_memmap(
    filename='./../data/apr-jun/trainX_sequences.memmap',
    dtype=dtype_trainX_sequences,
    shape=shape_trainX_sequences,
)

# Board inputs
dtype_trainX = np.int64  # or the correct dtype for your data
shape_trainX = (2780980, 12, 8, 8)  # replace with the correct shape
trainX = load_memmap(
    filename='./../data/apr-jun/trainX.memmap',
    dtype=dtype_trainX,
    shape=shape_trainX,
)

# Labels
dtype_trainY = np.int64  # or the correct dtype for your data
shape_trainY = (2780980,)  # replace with the correct shape
trainY = load_memmap(
    filename='./../data/apr-jun/trainY.memmap',
    dtype=dtype_trainY,
    shape=shape_trainY,
)

# Sequence lengths
dtype_trainX_seqlengths = np.int64  # or the correct dtype for your data
shape_trainX_seqlengths = (2780980,)  # replace with the correct shape
trainX_seqlengths = load_memmap(
    filename='./../data/apr-jun/trainX_seqlengths.memmap',
    dtype=dtype_trainX_seqlengths,
    shape=shape_trainX_seqlengths,
)

Let's combine the jan-march and apr-jun memmaps into a single dataset.

In [52]:
def concatenate_memmaps(file1, shape1, file2, shape2, dtype, result_file):
    """Concatenate two raw memmap files along the first axis into ``result_file``.

    Args:
        file1: Path of the first input memmap.
        shape1: Shape of the array stored in ``file1``.
        file2: Path of the second input memmap.
        shape2: Shape of the array stored in ``file2``.
        dtype: Element dtype shared by both inputs and the result.
        result_file: Path of the output memmap; created or overwritten.

    Returns:
        The writable ``np.memmap`` backing ``result_file``, with shape
        ``(shape1[0] + shape2[0],) + shape1[1:]``.

    Raises:
        ValueError: If the trailing dimensions of the two shapes differ, or if
            ``result_file`` is the same path as one of the inputs.
    """
    shape1, shape2 = tuple(shape1), tuple(shape2)

    # Without this check NumPy may silently broadcast a mismatched second
    # array into the result instead of failing.
    if shape1[1:] != shape2[1:]:
        raise ValueError(
            f"Trailing dimensions must match to concatenate: {shape1[1:]} vs {shape2[1:]}"
        )
    # Opening result_file with mode='w+' would overwrite an input before it is read.
    if result_file in (file1, file2):
        raise ValueError("result_file must be different from both input files.")

    # Calculate the new shape
    new_shape = (shape1[0] + shape2[0],) + shape1[1:]

    # Open the inputs first so a missing or too-short input fails before
    # result_file is created.
    memmap1 = np.memmap(file1, dtype=dtype, mode='r', shape=shape1)
    memmap2 = np.memmap(file2, dtype=dtype, mode='r', shape=shape2)

    # Create a new memmap for the concatenated data
    concatenated = np.memmap(result_file, dtype=dtype, mode='w+', shape=new_shape)

    # Copy data from the original memmaps to the new memmap
    concatenated[:shape1[0]] = memmap1
    concatenated[shape1[0]:] = memmap2

    # Flush changes to ensure they're written to disk
    concatenated.flush()

    return concatenated

# Example usage for trainX_sequences: jan-march rows first, apr-jun rows after.
result_file_sequences = './../data/combined/trainX_sequences.memmap'
shape_jan_mar = (3038976, 16)
shape_apr_jun = (2780980, 16)
dtype = np.int64

concatenated_sequences = concatenate_memmaps(
    file1='./../data/jan-march/trainX_sequences.memmap',
    shape1=shape_jan_mar,
    file2='./../data/apr-jun/trainX_sequences.memmap',
    shape2=shape_apr_jun,
    dtype=dtype,
    result_file=result_file_sequences,
)

# Print the shape of the concatenated memmap to verify
print(concatenated_sequences.shape)


(5819956, 16)


In [53]:
# trainX concatenation
# NOTE: dtype_trainX is not defined in this cell; it must already exist from one
# of the loading cells.
result_file_trainX = './../data/combined/trainX.memmap'
shape_jan_mar_trainX = (3038976, 12, 8, 8)
shape_apr_jun_trainX = (2780980, 12, 8, 8)

concatenated_trainX = concatenate_memmaps(
    file1='./../data/jan-march/trainX.memmap',
    shape1=shape_jan_mar_trainX,
    file2='./../data/apr-jun/trainX.memmap',
    shape2=shape_apr_jun_trainX,
    dtype=dtype_trainX,
    result_file=result_file_trainX,
)


In [54]:
# trainY concatenation
# NOTE: dtype_trainY is not defined in this cell; it must already exist from one
# of the loading cells.
result_file_trainY = './../data/combined/trainY.memmap'
shape_jan_mar_trainY = (3038976,)
shape_apr_jun_trainY = (2780980,)

concatenated_trainY = concatenate_memmaps(
    file1='./../data/jan-march/trainY.memmap',
    shape1=shape_jan_mar_trainY,
    file2='./../data/apr-jun/trainY.memmap',
    shape2=shape_apr_jun_trainY,
    dtype=dtype_trainY,
    result_file=result_file_trainY,
)


In [55]:
# trainX_seqlengths concatenation
# NOTE: dtype_trainX_seqlengths is not defined in this cell; it must already
# exist from one of the loading cells.
result_file_seqlengths = './../data/combined/trainX_seqlengths.memmap'
shape_jan_mar_seqlengths = (3038976,)
shape_apr_jun_seqlengths = (2780980,)

concatenated_seqlengths = concatenate_memmaps(
    file1='./../data/jan-march/trainX_seqlengths.memmap',
    shape1=shape_jan_mar_seqlengths,
    file2='./../data/apr-jun/trainX_seqlengths.memmap',
    shape2=shape_apr_jun_seqlengths,
    dtype=dtype_trainX_seqlengths,
    result_file=result_file_seqlengths,
)


In [None]:
# Function to load a memmap file
def load_memmap(filename, dtype, shape):
    """Open an existing raw binary file as a read-only ``np.memmap``.

    ``dtype`` and ``shape`` are passed straight to ``np.memmap``; the file
    itself stores neither, so they must match what was used when writing.
    """
    read_only_view = np.memmap(filename, mode='r', shape=shape, dtype=dtype)
    return read_only_view

# Combined dataset (files under ./../data/combined): 3038976 + 2780980 = 5819956 rows.
# Each array needs the dtype and shape it was written with.

# Move sequences
dtype_trainX_sequences = np.int64  # or the correct dtype for your data
shape_trainX_sequences = (5819956, 16)  # replace with the correct shape
trainX_sequences = load_memmap(
    filename='./../data/combined/trainX_sequences.memmap',
    dtype=dtype_trainX_sequences,
    shape=shape_trainX_sequences,
)

# Board inputs
dtype_trainX = np.int64  # or the correct dtype for your data
shape_trainX = (5819956, 12, 8, 8)  # replace with the correct shape
trainX = load_memmap(
    filename='./../data/combined/trainX.memmap',
    dtype=dtype_trainX,
    shape=shape_trainX,
)

# Labels
dtype_trainY = np.int64  # or the correct dtype for your data
shape_trainY = (5819956,)  # replace with the correct shape
trainY = load_memmap(
    filename='./../data/combined/trainY.memmap',
    dtype=dtype_trainY,
    shape=shape_trainY,
)

# Sequence lengths
dtype_trainX_seqlengths = np.int64  # or the correct dtype for your data
shape_trainX_seqlengths = (5819956,)  # replace with the correct shape
trainX_seqlengths = load_memmap(
    filename='./../data/combined/trainX_seqlengths.memmap',
    dtype=dtype_trainX_seqlengths,
    shape=shape_trainX_seqlengths,
)

Experiment: let's build another form of the data (board states only, but including all of the last 8 board states).

In [37]:
# Build the "extended" dataset from grouped_df with a fixed window of 8.
# The helper is not defined in this notebook (presumably from the star-imported
# utils module — confirm). This rebinds trainX_sequences / trainX / trainY / vocab.
trainX_sequences, trainX, trainY, vocab = df_to_data_extended(grouped_df, fixed_window=True, fixed_window_size=8,sampling_rate=0.25)
# Pad the sequences; the call returns the padded sequences together with their lengths.
trainX_sequences, trainX_seqlengths  = pad_sequences(trainX_sequences)

: 

In [None]:
# Convert every piece of the dataset to a NumPy array before writing the memmaps.
# np.asarray (rather than np.array) avoids duplicating data that is already an
# ndarray, which matters for arrays of this size, and matches the conversion
# used for the other dataset in this notebook.
trainX = np.asarray(trainX)
trainX_sequences = np.asarray(trainX_sequences)
trainX_seqlengths = np.asarray(trainX_seqlengths)
trainY = np.asarray(trainY)

In [None]:
# Save each array as a memmap file (written in the order listed), then pickle
# the vocabulary next to them.
filenames = [
    save_as_memmap(array, path)
    for array, path in (
        (trainX_sequences, './../data/jan-march/trainX_1_sequences.memmap'),
        (trainX, './../data/jan-march/trainX_1.memmap'),
        (trainY, './../data/jan-march/trainY_1.memmap'),
        (trainX_seqlengths, './../data/jan-march/trainX_1_seqlengths.memmap'),
    )
]

with open('./../data/jan-march/vocab_1.pkl', 'wb') as outp:
    pickle.dump(vocab, outp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# # For trainX_sequences
# dtype_trainX_sequences = np.int64  # or the correct dtype for your data
# shape_trainX_sequences = (3038976, 16)  # replace with the correct shape
# trainX_sequences = load_memmap('./../data/jan-march/trainX_sequences.memmap', dtype_trainX_sequences, shape_trainX_sequences)

# # For trainX
# dtype_trainX = np.int64  # or the correct dtype for your data
# shape_trainX = (3038976, 12, 8, 8)  # replace with the correct shape
# trainX = load_memmap('./../data/jan-march/trainX.memmap', dtype_trainX, shape_trainX)

# # For trainY
# dtype_trainY = np.int64  # or the correct dtype for your data
# shape_trainY = (3038976,)  # replace with the correct shape
# trainY = load_memmap('./../data/jan-march/trainY.memmap', dtype_trainY, shape_trainY)

# # For trainX_seqlengths
# dtype_trainX_seqlengths = np.int64  # or the correct dtype for your data
# shape_trainX_seqlengths = (3038976,)  # replace with the correct shape
# trainX_seqlengths = load_memmap('./../data/jan-march/trainX_seqlengths.memmap', dtype_trainX_seqlengths, shape_trainX_seqlengths)