Import Games of Elo ~1100

In [3]:
import pandas as pd 
import torch 
from torch.utils.data import Dataset, DataLoader
import dask.dataframe as dd 

class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Extract data from the dataframe at the given index
        # Format the data as required for your model
        data = self.dataframe[idx]
        data_tensor = torch.tensor(data, dtype=torch.float32)
        return data_tensor


In [4]:
# Import CSV File (from Maia: http://csslab.cs.toronto.edu/datasets/#monthly_chess_csv)
# The CSV has 151,072,060 rows
data_types ={'clock': 'float32',
       'cp': 'object',
       'opp_clock': 'float32',
       'opp_clock_percent': 'float32'}
df = dd.read_csv('../data/lichess_db_standard_rated_2019-01.csv', blocksize='64e6', dtype= data_types, low_memory=False)

# Filter out quick games (Bullet and HyperBullet) and take out moves that happened in the last XX seconds (this won't affect how many games we import but the # of moves we look at)
condition_time_control = ~df['time_control'].isin(['Blitz', 'Bullet', 'HyperBullet'])
condition_clock = df['clock'] > 60
condition_plays = df['num_ply'] < 80
filtered_df = df[condition_time_control & condition_clock & condition_plays]

# Select Relevant Columns
selected_columns = ['game_id','white_elo','black_elo','move','white_active','board']
filtered_df = filtered_df[selected_columns]

# Filter only games of Elo 1100-1199
filtered_df = filtered_df[(filtered_df['white_elo'].between(1100, 1199)) & (filtered_df['black_elo'].between(1100, 1199))]

# Group Same Games Together 
def aggregate_moves(group):
    moves = ' '.join(group['move'])  # Concatenate moves into a single string
    white_elo = group['white_elo'].iloc[0]  # Get the first white_elo
    black_elo = group['black_elo'].iloc[0]  # Get the first black_elo
    white_active = group['white_active'].iloc[0]  # Get the first num_ply
    board = group['board'].iloc[0]  # Get the first num_ply
    return pd.Series({'moves': moves, 'white_elo': white_elo, 'black_elo': black_elo, 'white_active': white_active, 'board': board})

grouped_df = filtered_df.groupby('game_id').apply(aggregate_moves, meta={'moves': 'str', 'white_elo': 'int', 'black_elo': 'int', 'white_active': 'str', 'board': 'str'}).compute()

# This gives us 99,300 Games when we don't filter games with more than 80 half-moves
print(grouped_df)

                                                      moves  white_elo  \
game_id                                                                  
0TH6rnrg  e2e4 e7e5 d2d4 g8f6 d4e5 f6e4 c2c3 d8e7 g1f3 d...       1137   
0VoR1Pz3  d2d4 e7e6 c1f4 g8f6 e2e3 d7d6 f1d3 c7c5 d4c5 d...       1197   
0nTYGMRx  e2e4 e7e5 g1f3 f8c5 f3e5 g8f6 f1c4 e8g8 d2d3 d...       1185   
2BP6vWMj  d2d4 g8f6 b1c3 d7d6 d4d5 e7e6 e2e4 e6d5 e4d5 b...       1138   
32zLcujS  d2d4 d7d5 c2c4 c7c6 e2e3 g8f6 f1e2 b8d7 c4d5 c...       1131   
...                                                     ...        ...   
waNjx6pa  d2d4 d7d5 b1c3 e7e6 c1f4 f8d6 f4d6 d8d6 c3b5 d...       1140   
wmoOaDoG  e2e4 e7e5 g1f3 b8c6 d2d4 e5d4 f3d4 f8c5 f1b5 c...       1187   
xaKGPR97  e2e4 e7e5 d1h5 d7d5 h5e5 c8e6 f1b5 c7c6 b5a4 b...       1108   
yCNWC8pw  e2e4 e7e5 g1f3 b8c6 f1b5 d7d6 e1g1 g8f6 b1c3 a...       1147   
zEtSHtQz  e2e3 e7e5 b1c3 f8b4 a2a3 b4c3 b2c3 d7d6 c1b2 b...       1100   

          black_elo  white_active  \


In [5]:
print(grouped_df)

                                                      moves  white_elo  \
game_id                                                                  
0TH6rnrg  e2e4 e7e5 d2d4 g8f6 d4e5 f6e4 c2c3 d8e7 g1f3 d...       1137   
0VoR1Pz3  d2d4 e7e6 c1f4 g8f6 e2e3 d7d6 f1d3 c7c5 d4c5 d...       1197   
0nTYGMRx  e2e4 e7e5 g1f3 f8c5 f3e5 g8f6 f1c4 e8g8 d2d3 d...       1185   
2BP6vWMj  d2d4 g8f6 b1c3 d7d6 d4d5 e7e6 e2e4 e6d5 e4d5 b...       1138   
32zLcujS  d2d4 d7d5 c2c4 c7c6 e2e3 g8f6 f1e2 b8d7 c4d5 c...       1131   
...                                                     ...        ...   
waNjx6pa  d2d4 d7d5 b1c3 e7e6 c1f4 f8d6 f4d6 d8d6 c3b5 d...       1140   
wmoOaDoG  e2e4 e7e5 g1f3 b8c6 d2d4 e5d4 f3d4 f8c5 f1b5 c...       1187   
xaKGPR97  e2e4 e7e5 d1h5 d7d5 h5e5 c8e6 f1b5 c7c6 b5a4 b...       1108   
yCNWC8pw  e2e4 e7e5 g1f3 b8c6 f1b5 d7d6 e1g1 g8f6 b1c3 a...       1147   
zEtSHtQz  e2e3 e7e5 b1c3 f8b4 a2a3 b4c3 b2c3 d7d6 c1b2 b...       1100   

          black_elo  white_active  \


In [6]:
import numpy as np

Now that we have our raw data, we need to be able to make sense of chess moves. Meaning, we're transforming our entire world from chess moves into numerical tokens that will serve as indices into unique embeddings

In [7]:
# First, generate a mapping from each move to a unique embedding. In order to index into our matrix of 
# embeddings (matrix format so it's something we can tune), we'll also want a mapping from each move to a unique ID
class Vocabulary:
    def __init__(self):
        self.move_to_id = {"<UNK>": 0}
        self.id_to_move = {0: "<UNK>"}
        self.index = 1  # Start indexing from 1

    def add_move(self, move):
        if move not in self.move_to_id:
            self.move_to_id[move] = self.index
            self.id_to_move[self.index] = move
            self.index += 1

    def get_id(self, move):
        return self.move_to_id.get(move, self.move_to_id["<UNK>"])

    def get_move(self, id):
        return self.id_to_move.get(id, self.id_to_move[0])
    
vocab = Vocabulary()
for i,game in enumerate(grouped_df['moves']):
    moves = game.split()
    for move in moves: 
        vocab.add_move(move)

# We can just use nn.Embedding later when we pass the model a sequence of indices, but this is if we ever want to pre-train and have access to the matrix we've trained
def get_embedding_matrix(vocab, d_embed):
    n_embed = len(vocab.move_to_id)
    return np.random.normal(0, 1, (n_embed, d_embed))
# embedding_matrix = get_embedding_matrix(vocab, 64)


In [8]:
# Now let's turn our data into sequences of indices instead of chess moves

# Function to convert games to a list of lists in which each list represents the move sequence of a game
def df_to_list_of_games(df, vocab_map):
    sequences = []
    for game in df['moves']:
        moves = game.split()
        seq = [vocab_map.get_id(move) for move in moves]
        sequences.append(seq)
    return sequences

def df_to_subsequences_and_labels(df, vocab_map):
    subsequences = []
    next_moves = []

    for game in df['moves']:
        moves = game.split()
        encoded_moves = [vocab_map.get_id(move) for move in moves]

        for i in range(len(encoded_moves)-1):
            subseq = encoded_moves[0:i+1]
            label = encoded_moves[i+1]
            subsequences.append(subseq)
            next_moves.append(label)

    return subsequences, next_moves

# Function to pad move sequences & get their sequence lengths
def pad_sequences(sequences, max_len=None, pad_id=0):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)
    padded_sequences = np.full((len(sequences), max_len), pad_id, dtype=int)
    sequence_lengths = np.zeros(len(sequences), dtype=int)
    for i, seq in enumerate(sequences):
        length = len(seq)
        padded_sequences[i, :length] = seq[:length]
        sequence_lengths[i] = length
    return padded_sequences, sequence_lengths

class SequenceDataset(Dataset):
    def __init__(self, sequences, lengths, labels):
        self.sequences = sequences
        self.lengths = lengths
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.lengths[idx], self.labels[idx]

trainX, trainY = df_to_subsequences_and_labels(grouped_df, vocab)
trainX, trainX_seqlengths  = pad_sequences(trainX)

In [9]:
print(len(vocab.id_to_move.keys()))
print(len(trainX[140]))

1870
78


In [10]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [11]:
#device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
"""Note: The input to the Embedding module is a list of indices, and the output is the corresponding word embeddings."""
# Bi-LSTM Model for PyTorch
class RNNModel(nn.Module):
    def __init__(self, vocab, d_embed, d_hidden, d_out, dropout = 0.5, num_layers = 2, bidirectional = False, embedding_matrix = None):
        super(RNNModel, self).__init__()
        self.embeddings = nn.Embedding(len(vocab.move_to_id), d_embed, padding_idx=0)
        # self.embeddings = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(d_embed, d_hidden, dropout = dropout, bidirectional=bidirectional, num_layers = num_layers)
        self.fc = nn.Sequential(
            nn.Linear(2 * d_hidden,d_out)
        )

    def forward(self, x, seq_lengths):
        x = self.embeddings(x)
        # Sort x and seq_lengths in descending order
        # This is required for packing the sequence
        seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
        x = x[perm_idx]
        # Pack the sequence
        packed_input = pack_padded_sequence(x, seq_lengths, batch_first=True)
        # Pass the packed sequence through the LSTM
        packed_output, (hidden, cell) = self.lstm(packed_input)

        # Unpack the sequence
        output, _ = pad_packed_sequence(packed_output, batch_first=True,total_length = x.size()[1])
        _, unperm_idx = perm_idx.sort(0)
        #unperm_idx = unperm_idx.to(self.device)
        output = output.index_select(0, unperm_idx)
        #This takes all the outputs across the cells
        mean_pooled = torch.mean(output, dim=1)
        output = torch.cat((mean_pooled,hidden[-1]),dim=1)
        output = self.fc(output)
        return output

In [16]:
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Subset
import torch.optim as optim
from torch.optim.swa_utils import AveragedModel

Functions for training

In [14]:
def train_rnn(device, model, train_loader, val_loader, criterion, optimizer, num_epochs, vocab):
    train_loss_values = []
    train_error = []
    val_loss_values = []
    val_error = []
    swa_model = AveragedModel(model)
    swa_start = 1
    for epoch in range(num_epochs):
        train_correct = 0
        train_total = 0
        training_loss = 0.0
        # Training
        model.train()
        count = 0
        for sequences, lengths, labels in train_loader:
            count += 1
            sequences, lengths, labels = sequences.to(device), lengths.to(device), labels.to(device)
            # Forward Pass
            output = model(sequences, lengths)
            loss = criterion(output, labels)
            # Backpropogate & Optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # For logging purposes
            training_loss += loss.item()
            _, predicted = torch.max(output.data, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
            if count % 1000 == 0:
                print(f'Epoch {epoch+1}, Batch: {count}| Training Loss: {training_loss/count}')
        if epoch >= swa_start:
            swa_model.update_parameters(model)
        torch.optim.swa_utils.update_bn(train_loader, swa_model)
        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        validation_loss = 0.0
        if val_loader is not None:
            with torch.no_grad():
                for sequences, lengths, labels in val_loader:
                    sequences, lengths, labels = sequences.to(device), lengths.to(device), labels.to(device)
                    outputs = model(sequences, lengths)
                    _, predicted = torch.max(outputs.data, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()
                    loss = criterion(outputs, labels)
                    validation_loss += loss.item()
            val_loss_values.append(validation_loss / len(val_loader))
            val_error.append(100-100*val_correct/val_total)
        # Log Model Performance  
        train_loss_values.append(training_loss)
        train_error.append(100-100*train_correct/train_total)
        print(f'Epoch {epoch+1}, Training Loss: {training_loss/len(train_loader)}, Validation Error: {val_error[-1]}, Training Error: {train_error[-1]}')
        for op_params in optimizer.param_groups:
            op_params['lr'] = op_params['lr'] * 0.5
    return train_error,train_loss_values, val_error, val_loss_values, swa_model

Now let's train

In [29]:
dataset = SequenceDataset(trainX, trainX_seqlengths, trainY)
# Calculate split sizes
total_size = len(dataset)

# 0.2 gives us ~1,000,000 training samples (when we didn't filter games with more than 40 moves & filtered out moves with less than 30 seconds left)
train_size = int(0.4 * total_size)
val_size = int(train_size/10)

# Create subsets for training and validation
train_dataset = Subset(dataset, range(0, train_size))
val_dataset = Subset(dataset, range(train_size, train_size+val_size))

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [30]:
train_size

1288999

In [32]:
# Initialize model, loss function, and optimizer
d_hidden = 128
d_embed = 128
NUM_EPOCHS = 5
d_out = len(vocab.id_to_move.keys())
model = RNNModel(vocab,d_embed,d_hidden,d_out,num_layers=2) 
model = model.to(device)
criterion = nn.CrossEntropyLoss()
lr = 2e-3
weight_decay=0
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(model))

984142


In [25]:
# Train the model
train_error,train_loss_values, val_error, val_loss_value,swa_model = train_rnn(device, model, train_loader, val_loader, criterion, optimizer, NUM_EPOCHS, vocab)

# Plot the training error
plt.figure(figsize=(10, 5))
plt.plot(val_error, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.title('Validation Error')
plt.legend()
plt.show()
plt.savefig('validation_error_model_rnn.png')  # This will save the plot as an image

Epoch 1, Batch: 1000| Training Loss: 6.182356120109558
Epoch 1, Batch: 2000| Training Loss: 6.177685140609741
Epoch 1, Batch: 3000| Training Loss: 6.1761104799906414
Epoch 1, Batch: 4000| Training Loss: 6.172615287899971
Epoch 1, Batch: 5000| Training Loss: 6.174867381381988
Epoch 1, Batch: 6000| Training Loss: 6.174857710599899
Epoch 1, Batch: 7000| Training Loss: 6.174077647686005
Epoch 1, Batch: 8000| Training Loss: 6.172386170566082
Epoch 1, Batch: 9000| Training Loss: 6.167981874889797
Epoch 1, Batch: 10000| Training Loss: 6.166811403226853
Epoch 1, Batch: 11000| Training Loss: 6.165029389554804
Epoch 1, Batch: 12000| Training Loss: 6.164951903780302
Epoch 1, Batch: 13000| Training Loss: 6.165426612083729
Epoch 1, Batch: 14000| Training Loss: 6.164278510774885
Epoch 1, Batch: 15000| Training Loss: 6.165609864234924
Epoch 1, Batch: 16000| Training Loss: 6.164390026688576
Epoch 1, Batch: 17000| Training Loss: 6.163982957026538
Epoch 1, Batch: 18000| Training Loss: 6.162472054004669


KeyboardInterrupt: 