# Experimenting with transformers
Transformers remain as a promising replacement of RNNs due to their parallelizability. However, RNNs are unique in their hidden state which tends to be uniquely useful for games. 

Note: This notebook is greatly outdated. 

Note: More data is needed to test the usefulness of transformers.

In [1]:
import pandas as pd 
import torch 
from torch.utils.data import Dataset
import dask.dataframe as dd 
import numpy as np
import torch.nn as nn
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
grouped_df = pd.read_csv('haha_longer.csv')

Now that we have our raw data, we need to be able to make sense of chess moves. Meaning, we're transforming our entire world from chess moves into numerical tokens that will serve as indices into unique embeddings

In [3]:
# First, generate a mapping from each move to a unique embedding. In order to index into our matrix of 
# embeddings (matrix format so it's something we can tune), we'll also want a mapping from each move to a unique ID
class Vocabulary:
    def __init__(self):
        self.move_to_id = {"<UNK>": 0}
        self.id_to_move = {0: "<UNK>"}
        self.index = 1  # Start indexing from 1

    def add_move(self, move):
        if move not in self.move_to_id:
            self.move_to_id[move] = self.index
            self.id_to_move[self.index] = move
            self.index += 1

    def get_id(self, move):
        return self.move_to_id.get(move, self.move_to_id["<UNK>"])

    def get_move(self, id):
        return self.id_to_move.get(id, self.id_to_move[0])

# We can just use nn.Embedding later when we pass the model a sequence of indices, but this is if we ever want to pre-train and have access to the matrix we've trained
def get_embedding_matrix(vocab, d_embed):
    n_embed = len(vocab.move_to_id)
    return np.random.normal(0, 1, (n_embed, d_embed))
# embedding_matrix = get_embedding_matrix(vocab, 64)
    
# Now let's turn our data into sequences of indices instead of chess moves

# Function to convert games to a list of lists in which each list represents the move sequence of a game
def df_to_list_of_games(df, vocab_map):
    sequences = []
    for game in df['moves']:
        moves = game.split()
        seq = [vocab_map.get_id(move) for move in moves]
        sequences.append(seq)
    return sequences

def df_to_subsequences_and_labels(df, vocab_map):
    subsequences = []
    next_moves = []

    for game in df['moves']:
        moves = game.split()
        encoded_moves = [vocab_map.get_id(move) for move in moves]

        for i in range(len(encoded_moves)-1):
            subseq = encoded_moves[0:i+1]
            label = encoded_moves[i+1]
            subsequences.append(subseq)
            next_moves.append(label)

    return subsequences, next_moves

# Function to pad move sequences & get their sequence lengths
def pad_sequences(sequences, max_len=None, pad_id=0):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)
    padded_sequences = np.full((len(sequences), max_len), pad_id, dtype=int)
    sequence_lengths = np.zeros(len(sequences), dtype=int)
    for i, seq in enumerate(sequences):
        length = len(seq)
        padded_sequences[i, :length] = seq[:length]
        sequence_lengths[i] = length
    return padded_sequences, sequence_lengths

class SequenceDataset(Dataset):
    def __init__(self, sequences, lengths, labels):
        self.sequences = sequences
        self.lengths = lengths
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.lengths[idx], self.labels[idx]

In [4]:
vocab = Vocabulary()
for i,game in enumerate(grouped_df['moves']):
    moves = game.split()
    for move in moves: 
        vocab.add_move(move)
        
trainX, trainY = df_to_subsequences_and_labels(grouped_df, vocab)
trainX, trainX_seqlengths  = pad_sequences(trainX)

print(len(vocab.id_to_move.keys()))
print(len(trainX[140]))

In [72]:
class TransformerModel(nn.Module):
    def __init__(self, vocab, d_embed, nhead, d_hidden, d_out, num_layers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(len(vocab.move_to_id), d_embed)
        self.pos_encoder = PositionalEncoding(d_embed, dropout)
        transformer_layers = nn.TransformerEncoderLayer(d_embed, nhead, d_hidden, dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layers, num_layers)
        self.d_embed = d_embed
        self.decoder = nn.Linear(d_embed, d_out)

    def forward(self, x, seq_lengths):
        mask = self.create_mask(seq_lengths, x.size(1))
        x = self.embedding(x) * math.sqrt(self.d_embed)
        x = self.pos_encoder(x)
        output = self.transformer_encoder(x, src_key_padding_mask=mask)
        output = output.mean(dim=1)
        #last_token_output = output[:, -1, :]  # shape (batch_size, d_embed)
        output = self.decoder(output)
        return output
    
    def create_mask(self, seq_lengths, max_len):
        batch_size = seq_lengths.size(0)
        # Create a mask of shape (batch_size, max_len) with all zeros
        mask = torch.zeros(batch_size, max_len)

        # Iterate over each element in seq_lengths to set `-inf` for padding
        for i in range(batch_size):
            length = seq_lengths[i]
            mask[i, length:] = float('-inf')

        return mask



class PositionalEncoding(nn.Module):
    def __init__(self, d_embed, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_embed)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_embed, 2).float() * (-math.log(10000.0) / d_embed))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(1)].transpose(0, 1)
        return self.dropout(x)

In [6]:
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Subset
import torch.optim as optim
from torch.optim.swa_utils import AveragedModel

Functions for training

In [70]:
# Function to calculate top-3 accuracy
def top_3_accuracy(y_true, y_pred):
    top3 = torch.topk(y_pred, 3, dim=1).indices
    correct = top3.eq(y_true.view(-1, 1).expand_as(top3))
    return correct.any(dim=1).float().mean().item()

def train_transformer(device, model, train_loader, val_loader, criterion, optimizer, num_epochs, learn_decay):
    train_loss_values = []
    train_error = []
    val_loss_values = []
    val_error = []
    val_3_accuracy = []
    swa_model = AveragedModel(model)
    swa_start = 1
    for epoch in range(num_epochs):
        train_correct = 0
        train_total = 0
        training_loss = 0.0
        # Training
        model.train()
        count = 0
        for sequences, lengths, labels in train_loader:
            count += 1
            sequences, lengths, labels = sequences.to(device), lengths.to(device), labels.to(device)
            # Forward Pass
            output = model(sequences, lengths)
            loss = criterion(output, labels)
            # Backpropogate & Optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # For logging purposes
            training_loss += loss.item()
            _, predicted = torch.max(output.data, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
            if count % 1000 == 0:
                print(f'Epoch {epoch+1}, Batch: {count}| Training Loss: {training_loss/count}')
        if epoch >= swa_start:
            swa_model.update_parameters(model)
        torch.optim.swa_utils.update_bn(train_loader, swa_model)
        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        validation_loss = 0.0
        if val_loader is not None:
            with torch.no_grad():
                val_correct = 0
                val_total = 0
                val_top3_correct = 0
                validation_loss = 0

                for sequences, lengths, labels in val_loader:
                    sequences, lengths, labels = sequences.to(device), lengths.to(device), labels.to(device)
                    outputs = model(sequences, lengths)
                    _, predicted = torch.max(outputs.data, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()
                    val_top3_correct += top_3_accuracy(labels, outputs) * labels.size(0)
                    loss = criterion(outputs, labels)
                    validation_loss += loss.item()

                val_loss_values.append(validation_loss / len(val_loader))
                val_accuracy = 100 * val_correct / val_total
                val_top3_accuracy = 100 * val_top3_correct / val_total
                val_error.append(100 - val_accuracy)
                val_3_accuracy.append(val_top3_accuracy)

        # Log Model Performance  
        train_loss_values.append(training_loss)
        train_error.append(100-100*train_correct/train_total)
        print(f'Epoch {epoch+1}, Training Loss: {training_loss/len(train_loader)}, Validation Error: {val_error[-1]}, Validation Top-3 Accuracy: {val_3_accuracy[-1]}, Training Error: {train_error[-1]}')
        for op_params in optimizer.param_groups:
            op_params['lr'] = op_params['lr'] * learn_decay
    return train_error,train_loss_values, val_error, val_loss_values, swa_model

Now let's train

In [8]:
dataset = SequenceDataset(trainX, trainX_seqlengths, trainY)
# Calculate split sizes
total_size = len(dataset)

# 0.2 gives us ~1,000,000 training samples (when we didn't filter games with more than 40 moves & filtered out moves with less than 30 seconds left)
train_size = int(0.4 * total_size)
val_size = int(train_size/10)

# Create subsets for training and validation
train_dataset = Subset(dataset, range(0, train_size))
val_dataset = Subset(dataset, range(train_size, train_size+val_size))

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [48]:
train_size

1288999

# Experiments

Experiment 1

In [73]:
# We're scaling the model size so let's bring in more data as well
train_size = int(0.75 * total_size)
val_size = int(total_size/20)

# Create subsets for training and validation
train_dataset = Subset(dataset, range(0, train_size))
val_dataset = Subset(dataset, range(train_size, train_size + val_size))
print(train_size)

# Reload the data with particular batch size
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, loss function, and optimizer
d_hidden = 128
d_embed = 128
NUM_EPOCHS = 5
d_out = len(vocab.id_to_move.keys())
nhead = 2
model = TransformerModel(vocab, d_embed, nhead, d_hidden, d_out, num_layers=2,dropout=0.1) 
model = model.to(device)
criterion = nn.CrossEntropyLoss()
lr = 2e-3
weight_decay=1e-7
learn_decay = 0.65 # This causes the LR to be 2e-5 by epoch 10
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(model))

2416874
679758


In [74]:
# Train the model
train_error,train_loss_values, val_error, val_loss_value,swa_model = train_transformer(device, model, train_loader, val_loader, criterion, optimizer, NUM_EPOCHS, learn_decay)

# Plot the training error
plt.figure(figsize=(10, 5))
plt.plot(val_error, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.title('Validation Error')
plt.legend()
plt.show()
plt.savefig('validation_error_model_rnn.png')  # This will save the plot as an image

Epoch 1, Batch: 1000| Training Loss: 6.000442171096802
Epoch 1, Batch: 2000| Training Loss: 5.922892831325531
Epoch 1, Batch: 3000| Training Loss: 5.880267251491547
Epoch 1, Batch: 4000| Training Loss: 5.8274934012889865
Epoch 1, Batch: 5000| Training Loss: 5.766720522594452
Epoch 1, Batch: 6000| Training Loss: 5.710842233260473
Epoch 1, Batch: 7000| Training Loss: 5.662404630592891
Epoch 1, Batch: 8000| Training Loss: 5.619151238799095


In [None]:
# We're scaling the model size so let's bring in more data as well
train_size = int(0.75 * total_size)
val_size = int(total_size/20)

# Create subsets for training and validation
train_dataset = Subset(dataset, range(0, train_size))
val_dataset = Subset(dataset, range(train_size, train_size + val_size))
print(train_size)

# Reload the data with particular batch size
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, loss function, and optimizer
d_hidden = 128
d_embed = 128
NUM_EPOCHS = 5
d_out = len(vocab.id_to_move.keys())
nhead = 2
model = TransformerModel(vocab, d_embed, nhead, d_hidden, d_out, num_layers=2,dropout=0.1) 
model = model.to(device)
criterion = nn.CrossEntropyLoss()
lr = 1e-3
weight_decay=1e-7
learn_decay = 0.7 # This causes the LR to be 2e-5 by epoch 10
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(model))

In [None]:
# Train the model
train_error,train_loss_values, val_error, val_loss_value,swa_model = train_transformer(device, model, train_loader, val_loader, criterion, optimizer, NUM_EPOCHS, learn_decay)

# Plot the training error
plt.figure(figsize=(10, 5))
plt.plot(val_error, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.title('Validation Error')
plt.legend()
plt.show()
plt.savefig('validation_error_model_rnn.png')  # This will save the plot as an image