In [55]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer
from models.CustomNet import CustomNet
from tqdm import tqdm

In [56]:
def resize_word_list(word_list, max_len=5):
    """Resize a list of words into a fixed length, truncating or padding as necessary"""
    resized_list = []
    for word in word_list:
        # Truncate or pad word to fixed length
        if len(word) > max_len:
            resized_word = word[:max_len]
        else:
            resized_word = word + '0'*(max_len-len(word))
        resized_list.append(resized_word)
    return resized_list

In [57]:
# Define dataset class
word_column = 'Word'
input_columns = ['Month', 'Day', 'WeekNum', 'Contest number']
target_columns = ['Number of reported results', 'Number in hard mode', '1 try', '2 tries', '3 tries', '4 tries', '5 tries', '6 tries', '7 or more tries (X)']
class TrainingDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        word = row['Word']

        # Resize every word into 5 tokens, if word is longer than 5 tokens, truncate it, if word is shorter than 5 tokens, pad it with 0
        word = resize_word_list([word])[0]

        month = row['Month']
        day = row['Day']
        weeknum = row['WeekNum']
        contest_num = row['Contest number']
        targets = row[['Number of reported results', 'Number in hard mode', '1 try', '2 tries', '3 tries', '4 tries', '5 tries', '6 tries', '7 or more tries (X)']].values.astype(float)

        # Tokenize the input word
        input_ids = self.tokenizer.encode(word, add_special_tokens=True, return_tensors='pt', max_length=5, truncation=True)
        # fixed length of 5 tokens
        if input_ids.shape[1] < 5:
            input_ids = torch.cat((input_ids, torch.zeros((1, 5-input_ids.shape[1])).long()), dim=1)

        # attention_mask = torch.ones(5)
        attention_mask = torch.ones(input_ids.shape)

        # Combine inputs into a dictionary
        inputs = {
            'input_ids': input_ids.squeeze(0),
            'attention_mask': attention_mask.squeeze(0),
            'input_numbers': torch.tensor([[month, day, weeknum, contest_num]]).float()
        }

        # # Print inputs and targets shapes
        # print(f"Input IDs shape: {inputs['input_ids'].shape}")
        # print(f"Attention mask shape: {inputs['attention_mask'].shape}")
        # print(f"Input numbers shape: {inputs['input_numbers'].shape}")
        # print(f"Targets shape: {targets.shape}")


        return inputs, targets

In [58]:
# Define function to train model
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    epoch_count = 0

    with tqdm(total=len(dataloader)) as pbar:
        for batch_inputs, batch_targets in dataloader:
            # Move batch to device
            batch_inputs = {key: value.to(device) for key, value in batch_inputs.items()}
            batch_targets = batch_targets.to(device).float()

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            batch_predictions = model(**batch_inputs)
            loss = criterion(batch_predictions, batch_targets)

            # Backward pass and update weights
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            epoch_count += 1
            pbar.update(1)
            pbar.set_description(f"Batch loss: {loss.item():.4f}")

            if epoch_count % 10 == 0:
                torch.save(model.state_dict(), "checkpoints/"+"wordle"+"_"+str(epoch_count)+"batches")

    return total_loss / len(dataloader)

In [59]:
# Load data and tokenizer
data = pd.read_csv('Data_V1.2.csv')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Resize every word into 5 tokens, if word is longer than 5 tokens, truncate it, if word is shorter than 5 tokens, pad it with 0
data[word_column] = resize_word_list(data[word_column])

# Create dataset and dataloader
dataset = TrainingDataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

# Create model and move to device
model = CustomNet('roberta-base', num_numbers=4)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [60]:
# Train model
num_epochs = 1000
for epoch in range(num_epochs):
    # Train for one epoch and get average loss
    train_loss = train(model, dataloader, optimizer, criterion, device)

    # Print the loss for the epoch
    print(f"Epoch {epoch+1} loss: {train_loss:.4f}")

  0%|          | 0/180 [00:00<?, ?it/s]


RuntimeError: Tensors must have same number of dimensions: got 2 and 3

In [None]:
# Save model
torch.save(model.state_dict(), "checkpoints/"+"wordle"+"_"+str(num_epochs)+"final")