In [1]:
import pandas as pd 
import torch 

import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, Subset
import torch.optim as optim
from torch.optim.swa_utils import AveragedModel
import dask.dataframe as dd 
import numpy as np
import torch.nn as nn
import importlib
import pickle
import utils
import models
importlib.reload(utils)
from utils import *
importlib.reload(models)
from models import *

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Load the games table from CSV and echo it. The printed preview shows columns
# such as game_id, moves, white_elo, black_elo and white_active.
grouped_df = pd.read_csv(filepath_or_buffer="../data/haha-longer-apr-jun.csv")
print(grouped_df)

         game_id                                              moves  \
0       0Gws7Yna  e2e4 d7d5 e4d5 d8d5 b1c3 d5d8 d2d4 e7e6 g1f3 g...   
1       0JraozGk  e2e4 e7e6 g1f3 d7d5 e4d5 e6d5 b1c3 g8f6 h2h3 f...   
2       0b0GOLhX  e2e4 e7e5 d2d4 e5d4 d1d4 b8c6 d4e3 g8f6 b1c3 f...   
3       0cFU8tom  d2d4 d7d5 c2c4 g8f6 b1c3 d5c4 g1f3 f6d5 d1a4 d...   
4       0t0cIxvG  e2e4 g8f6 b1c3 e7e5 g1f3 b8c6 d2d4 f8b4 d4e5 f...   
...          ...                                                ...   
421830  zDJvok19  b1c3 d7d5 e2e3 e7e5 d2d4 e5e4 f2f3 c8f5 f3e4 d...   
421831  zOR2NZIz  e2e4 c7c5 f1c4 e7e6 d2d3 b8c6 c4b5 d8c7 b5c6 c...   
421832  zSvzvvNe  e2e4 c7c5 g1f3 e7e6 d2d4 c5d4 f3d4 a7a6 c2c4 g...   
421833  zfGfeqAo  e2e4 e7e6 g1f3 f8b4 c2c3 b4e7 d2d4 a7a6 e4e5 d...   
421834  zvkftdLZ  d2d4 g8f6 c1f4 g7g6 g1f3 f8g7 e2e3 d7d6 f1d3 e...   

        white_elo  black_elo  white_active  \
0            1529       1504          True   
1            1519       1506          True   
2        

In [13]:
# Turn the games table into training data with the star-imported project helper.
# NOTE(review): sampling_rate=0.25 presumably keeps a quarter of the positions —
# confirm against the helper's definition.
trainX_sequences, fens, trainX, trainY, vocab = df_to_data(
    grouped_df,
    fixed_window=True,
    sampling_rate=0.25,
)

# pad_sequences returns the padded sequences together with a second value that
# is stored as the per-sample sequence lengths.
trainX_sequences, trainX_seqlengths = pad_sequences(trainX_sequences)

In [None]:
# Materialise each output as a NumPy array, one name at a time, so that every
# array exposes .dtype and .shape for the saving step.
trainX = np.array(trainX)
trainX_sequences = np.array(trainX_sequences)
fens = np.array(fens)
trainX_seqlengths = np.array(trainX_seqlengths)
trainY = np.array(trainY)

This is the last step: saving our data to disk so that it's ready to load when we train.

In [8]:
def save_as_memmap(array, filename):
    """Write ``array`` to ``filename`` as a raw NumPy memmap and return the path.

    Parameters
    ----------
    array : array-like
        Data to persist. It is passed through ``np.asarray`` first, so plain
        Python lists are accepted as well as NumPy arrays.
    filename : str or path-like
        Destination file. It is opened with mode ``'w+'``, i.e. created or
        overwritten.

    Returns
    -------
    The ``filename`` that was passed in, for confirmation.

    Notes
    -----
    Only the raw element bytes are written; the dtype and shape are not stored
    in the file, so the caller has to supply them again when mapping the file
    for reading.
    """
    # Coerce first so that lists (e.g. when the np.array conversion cell was
    # skipped) do not fail on the .dtype / .shape lookups below.
    data = np.asarray(array)

    # Mode 'w+' creates or overwrites the file and sizes it for dtype * shape.
    memmap_array = np.memmap(filename, dtype=data.dtype, mode='w+', shape=data.shape)

    # Ellipsis assignment copies everything and also works for 0-d input,
    # where slicing with [:] would raise.
    memmap_array[...] = data

    # Flush memory changes to disk to ensure all data is written.
    memmap_array.flush()

    # Drop our reference so the underlying mapping can be released promptly.
    del memmap_array

    return filename

# Save each array as a memmap file; the list collects the written paths.
# NOTE(review): the source CSV is named "apr-jun" but the output directory is
# "jan-march" — confirm this is the intended destination.
filenames = [
    # The trailing comma on the next line was missing, which made the whole
    # list literal a SyntaxError.
    save_as_memmap(fens, './../data/jan-march/fens.memmap'),
    save_as_memmap(trainX_sequences, './../data/jan-march/trainX_sequences.memmap'),
    save_as_memmap(trainX, './../data/jan-march/trainX.memmap'),
    save_as_memmap(trainY, './../data/jan-march/trainY.memmap'),
    save_as_memmap(trainX_seqlengths, './../data/jan-march/trainX_seqlengths.memmap'),
]


In [19]:
# Persist the vocabulary object next to the memmap files, using the newest
# pickle protocol this interpreter supports.
with open('./../data/jan-march/vocab.pkl', mode='wb') as outp:
    pickle.dump(vocab, outp, protocol=pickle.HIGHEST_PROTOCOL)

Now that we've finished processing the data, let's load it back in (assuming we're starting from a fresh kernel).

In [3]:
# Function to load a memmap file
def load_memmap(filename, dtype, shape):
    """Map an existing raw array file into memory without allowing writes.

    ``dtype`` and ``shape`` are forwarded to ``np.memmap`` unchanged, so they
    must describe how the bytes in ``filename`` are to be interpreted.
    Returns the resulting ``np.memmap`` opened in mode ``'r'``.
    """
    mapped = np.memmap(filename, mode='r', dtype=dtype, shape=shape)
    return mapped

# load_memmap needs the dtype and shape spelled out for every file, so each
# array below comes with a hand-maintained (dtype, shape) pair. Update these
# whenever the saved data changes.

# Padded move sequences
dtype_trainX_sequences, shape_trainX_sequences = np.int64, (3038976, 16)
trainX_sequences = load_memmap(
    './../data/jan-march/trainX_sequences.memmap',
    dtype_trainX_sequences,
    shape_trainX_sequences,
)

# Board tensors
dtype_trainX, shape_trainX = np.int64, (3038976, 12, 8, 8)
trainX = load_memmap(
    './../data/jan-march/trainX.memmap',
    dtype_trainX,
    shape_trainX,
)

# Labels
dtype_trainY, shape_trainY = np.int64, (3038976,)
trainY = load_memmap(
    './../data/jan-march/trainY.memmap',
    dtype_trainY,
    shape_trainY,
)

# Sequence lengths
dtype_trainX_seqlengths, shape_trainX_seqlengths = np.int64, (3038976,)
trainX_seqlengths = load_memmap(
    './../data/jan-march/trainX_seqlengths.memmap',
    dtype_trainX_seqlengths,
    shape_trainX_seqlengths,
)

In [4]:
# Restore the vocabulary object from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by a trusted source.
with open('./../data/jan-march/vocab.pkl', mode='rb') as inp:
    vocab = pickle.load(inp)

In [5]:
class MultimodalDataset(Dataset):
    """Map-style dataset that serves four aligned arrays as tensors.

    Parameters
    ----------
    sequences, boards, lengths, labels
        Indexable containers (e.g. NumPy arrays or memmaps) that are all
        indexed with the same integer ``idx``. The dataset length is taken
        from ``sequences``.

    Each item is the tuple ``(board, sequence, length, label)`` where the board
    is ``torch.float32`` and the other three are ``torch.long``.
    """

    def __init__(self, sequences, boards, lengths, labels):
        self.sequences = sequences
        self.boards = boards
        self.lengths = lengths
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # torch.from_numpy() takes no ``dtype`` argument and only accepts
        # ndarrays, so it fails both on the keyword and on the NumPy scalars
        # that indexing a 1-D array yields. torch.tensor() copies the data and
        # casts it, and handles arrays and scalars alike.
        return (
            torch.tensor(self.boards[idx], dtype=torch.float32),
            torch.tensor(self.sequences[idx], dtype=torch.long),
            torch.tensor(self.lengths[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )
# Sanity check: number of vocabulary entries, then the shape and the raw
# contents of the first board sample, one per line.
print(
    len(vocab.id_to_move.keys()),
    torch.tensor(trainX[0]).shape,
    trainX[0],
    sep="\n",
)

8121
torch.Size([12, 8, 8])
[[[0 0 0 0 0 0 0 0]
  [1 1 0 0 1 0 1 1]
  [0 0 1 0 0 1 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]]

 [[1 0 0 0 0 0 0 1]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]]

 [[0 1 0 0 0 0 1 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]]

 [[0 0 0 0 0 1 0 0]
  [0 0 0 1 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]]

 [[0 0 0 1 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]]

 [[0 0 0 0 1 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0]]



Let's build a second form of the data: board states only, but each sample includes all of the last 8 board states.

In [37]:
# Build the extended dataset with the star-imported project helper, using a
# fixed window of 8.
# NOTE(review): sampling_rate=0.25 presumably keeps a quarter of the positions —
# confirm against the helper's definition.
trainX_sequences, trainX, trainY, vocab = df_to_data_extended(
    grouped_df,
    fixed_window=True,
    fixed_window_size=8,
    sampling_rate=0.25,
)

# pad_sequences returns the padded sequences together with a second value that
# is stored as the per-sample sequence lengths.
trainX_sequences, trainX_seqlengths = pad_sequences(trainX_sequences)

: 

In [None]:
# Materialise each output as a NumPy array, one name at a time, so that every
# array exposes .dtype and .shape for the saving step.
trainX = np.array(trainX)
trainX_sequences = np.array(trainX_sequences)
trainX_seqlengths = np.array(trainX_seqlengths)
trainY = np.array(trainY)

In [None]:
# Write the extended dataset under the *_1 file names (in the order listed)
# and collect the written paths.
filenames = [
    save_as_memmap(array, path)
    for array, path in (
        (trainX_sequences, './../data/jan-march/trainX_1_sequences.memmap'),
        (trainX, './../data/jan-march/trainX_1.memmap'),
        (trainY, './../data/jan-march/trainY_1.memmap'),
        (trainX_seqlengths, './../data/jan-march/trainX_1_seqlengths.memmap'),
    )
]

# Store the matching vocabulary alongside, using the newest pickle protocol.
with open('./../data/jan-march/vocab_1.pkl', mode='wb') as outp:
    pickle.dump(vocab, outp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# NOTE(review): this disabled loader is a copy of the loader for the first
# dataset — it points at the non-"_1" files and reuses their shapes. If it is
# meant to load the extended dataset saved above, the paths (and probably the
# shapes) need updating before it is re-enabled — confirm.
# # For trainX_sequences
# dtype_trainX_sequences = np.int64  # or the correct dtype for your data
# shape_trainX_sequences = (3038976, 16)  # replace with the correct shape
# trainX_sequences = load_memmap('./../data/jan-march/trainX_sequences.memmap', dtype_trainX_sequences, shape_trainX_sequences)

# # For trainX
# dtype_trainX = np.int64  # or the correct dtype for your data
# shape_trainX = (3038976, 12, 8, 8)  # replace with the correct shape
# trainX = load_memmap('./../data/jan-march/trainX.memmap', dtype_trainX, shape_trainX)

# # For trainY
# dtype_trainY = np.int64  # or the correct dtype for your data
# shape_trainY = (3038976,)  # replace with the correct shape
# trainY = load_memmap('./../data/jan-march/trainY.memmap', dtype_trainY, shape_trainY)

# # For trainX_seqlengths
# dtype_trainX_seqlengths = np.int64  # or the correct dtype for your data
# shape_trainX_seqlengths = (3038976,)  # replace with the correct shape
# trainX_seqlengths = load_memmap('./../data/jan-march/trainX_seqlengths.memmap', dtype_trainX_seqlengths, shape_trainX_seqlengths)