In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam
from torch.nn import L1Loss
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import OneHotEncoder
import itertools
import random
from tqdm import trange
from torch.nn.utils.rnn import pad_sequence
from scipy.special import comb
from transform_encoder import InputEmbedding, PositionEmbedding, LayerNormalization
from transform_encoder import  FeedForwardSec, MultiHeadAttentionSec, SkipConnection,EncoderSec, Encoder
# import ipdb

In [2]:
cuda_available = torch.cuda.is_available()

# If CUDA is available, set the default device to GPU
if cuda_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


In [3]:
full_dictionary_location = "words_250000_train.txt"

def build_dictionary(dictionary_file_location):
    text_file = open(dictionary_file_location,"r")
    full_dictionary = text_file.read().splitlines()
    text_file.close()
    return list(full_dictionary)

words_list = build_dictionary(full_dictionary_location)[:]
chars = sorted(list(set(''.join(words_list) + "_")))
vocab_size = len(chars)
chars = chars[1:]
chars.append("_")
alphabet_chars = sorted(list(set(''.join(words_list))))
alphabet_size = len(alphabet_chars)
char_to_num_input = {char: i for i, char in enumerate(chars)}
char_to_num_output = {char: i for i, char in enumerate(alphabet_chars)}

In [4]:
from itertools import combinations
from tqdm import tqdm
import random
from math import comb

class WordsDataset(Dataset):
    def __init__(self, words, char_to_index, max_missing=3):
        self.words = words
        self.char_to_index = char_to_index
        self.vocab_size = len(char_to_index) - 1  # Exclude "_" from target vocab size
        self.max_missing = max_missing
        self.samples = self.prepare_dataset()

    def prepare_dataset(self):
        samples = []
        max_combinations_per_word = 1000
        for word in tqdm(self.words, desc="Preparing dataset"):
            unique_chars = list(set(word))  # Get unique characters to handle repeats consistently
            word_len = len(word)

            if word_len == 1:
                encoded_sample, target = self.create_sample_target(word, [0])
                samples.append((encoded_sample, target, [0]))
            else:
                generated_combinations = set()
                # Attempt to generate up to max_combinations_per_word unique combinations
                for _ in range(max_combinations_per_word):
                    num_chars_to_mask = random.randint(1, min(len(unique_chars), max_combinations_per_word))
                    chars_to_mask = random.sample(unique_chars, num_chars_to_mask)

                    # Create a unique identifier for this combination of characters
                    combination_id = ''.join(sorted(chars_to_mask))
                    if combination_id not in generated_combinations:
                        indices_to_mask = [i for i, char in enumerate(word) if char in chars_to_mask]
                        if len(indices_to_mask) > word_len * 0.7:
                            continue  # Skip this combination

                        encoded_sample, target = self.create_sample_target(word, indices_to_mask)
                        samples.append((encoded_sample, target, indices_to_mask))
                        generated_combinations.add(combination_id)

        return samples

    def create_sample_target(self, word, missing_indices):
        sample = [self.char_to_index[char] if i not in missing_indices else self.char_to_index["_"] for i, char in enumerate(word)]
        target = [self.char_to_index[word[i]] for i in range(len(word))]
        return np.array(sample), np.array(target)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample, target, missing_indices = self.samples[idx]
        sample_tensor = torch.tensor(sample, dtype=torch.long)
        target_tensor = torch.tensor(target, dtype=torch.long)
        exist_mask = torch.zeros(len(sample), dtype=torch.bool)
        for i in missing_indices:
            exist_mask[i] = 1
        return sample_tensor, target_tensor, exist_mask


In [5]:
full_dataset = WordsDataset(words_list, char_to_num_input)


Preparing dataset: 100%|██████████| 227300/227300 [34:17<00:00, 110.50it/s]  


In [6]:
# Define the size of the splits
train_size = int(0.95 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Randomly split the dataset into training and validation datasets
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

In [9]:
import pickle

def save_dataset(dataset, file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(dataset, f)

# Assuming `dataset` is an instance of WordsDataset
save_dataset(full_dataset, 'words_dataset.pkl')

In [5]:
import pickle
def load_dataset(file_path):
    with open(file_path, 'rb') as f:
        dataset = pickle.load(f)
    return dataset

# Load the dataset
full_dataset = load_dataset('words_dataset.pkl')

In [None]:
class Predictor(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(Predictor, self).__init__()
        # Linear layer to project from d_model to vocab_size
        self.linear = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x, exist_mask=None):
        # Apply the linear layer to project from d_model to vocab_size
        logits = self.linear(x)
        
        # If exist_mask is provided and needs to be used in a specific way, handle accordingly
        # For example, if you need to focus on masked positions for a specific computation
        # Adjust the handling of exist_mask here as per the requirements
        
        # Without modifying logits based on exist_mask, simply return log softmax of logits
        return F.log_softmax(logits, dim=-1)
    
    
class ImprovedPredictor(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(ImprovedPredictor, self).__init__()
        self.linear = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x, exist_mask=None):
        # Apply the linear layer to project from d_model to vocab_size
        logits = self.linear(x)

        if exist_mask is not None:
            # Ensure exist_mask is broadcastable to logits shape
            exist_mask = exist_mask.unsqueeze(-1).expand_as(logits)

            # Apply a large negative value to positions not to be predicted (where exist_mask == 0)
            # This effectively removes them from consideration in the max pooling step
            modified_logits = logits.masked_fill_(exist_mask == 0, -1e9)
            
            # Apply max pooling across the sequence length dimension (dim=1)
            # Keepdim=True maintains the original number of dimensions
            pooled_logits, _ = torch.max(modified_logits, dim=1, keepdim=True)
            
            # Normalize pooled logits across the vocabulary dimension to get probabilities
            probs = F.softmax(pooled_logits, dim=-1)
        else:
            # If no mask is provided, compute softmax probabilities directly from logits
            probs = F.softmax(logits, dim=-1)

        return probs
    
    
class RefinedPredictor(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(RefinedPredictor, self).__init__()
        # Linear layer to project from d_model to vocab_size
        self.linear = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        # Apply the linear layer to get logits
        logits = self.linear(x)

        # Apply max pooling across the sequence length dimension (dim=1)
        # This step leverages information from the entire sequence
        pooled_logits, _ = torch.max(logits, dim=1, keepdim=True)

        # Reshape pooled logits to ensure the output shape is [batch_size, vocab_size]
        # This is crucial for compatibility with the expected output format
        reshaped_pooled_logits = pooled_logits.squeeze(1)

        # Normalize pooled logits across the vocabulary dimension to get probabilities
        probs = F.softmax(reshaped_pooled_logits, dim=-1)

        return probs
    
class RefinedPredictor_v2(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(RefinedPredictor_v2, self).__init__()
        # Linear layer to project from d_model to vocab_size
        self.linear = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        # Apply the linear layer to get logits
        logits = self.linear(x)

        # Apply average pooling across the sequence length dimension (dim=1)
        # This step aggregates information from the entire sequence by averaging
        pooled_logits = torch.mean(logits, dim=1, keepdim=True)

        # Reshape pooled logits to ensure the output shape is [batch_size, vocab_size]
        reshaped_pooled_logits = pooled_logits.squeeze(1)

        # Normalize pooled logits across the vocabulary dimension to get probabilities
        probs = F.softmax(reshaped_pooled_logits, dim=-1)

        return probs
    


class RefinedPredictor_v2_logit(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(RefinedPredictor_v2, self).__init__()
        # Linear layer to project from d_model to vocab_size
        self.linear = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        # Apply the linear layer to get logits
        logits = self.linear(x)

        # Apply average pooling across the sequence length dimension (dim=1)
        # This step aggregates information from the entire sequence by averaging
        pooled_logits = torch.mean(logits, dim=1, keepdim=True)

        # Reshape pooled logits to ensure the output shape is [batch_size, vocab_size]
        reshaped_pooled_logits = pooled_logits.squeeze(1)

        # Return the reshaped pooled logits directly
        return reshaped_pooled_logits



In [None]:


def calculate_focused_loss(logits, targets, exist_mask):
    # This is a placeholder for the concept; actual implementation may vary
    # Only select logits and targets where exist_mask == 1
    
    # print("logits have shape:", logits.shape)
    # print("targets have shape:", targets.shape)
    relevant_logits = logits[exist_mask.unsqueeze(-1).expand_as(logits)].view(-1, logits.size(-1))
    relevant_targets = targets[exist_mask]
    
    # print("relevant_logits have shape:",relevant_logits.shape)
    # print("relevant_targets have shape:", relevant_targets.shape)
    # ipdb.set_trace()
    # Calculate loss on these positions only
    loss = F.cross_entropy(relevant_logits, relevant_targets)
    return loss

def calculate_improved_loss(logits, targets, exist_mask):
    # logits: Model predictions, shape [batch_size, seq_len, vocab_size]
    # targets: Target character indices, shape [batch_size, seq_len]
    # exist_mask: Mask indicating missing character positions, shape [batch_size, seq_len]

    batch_size, seq_len, vocab_size = logits.size()
    
    # Flatten to work with indices directly
    logits_flattened = logits.view(-1, vocab_size)
    targets_flattened = targets.view(-1)
    exist_mask_flattened = exist_mask.view(-1)
    
    # Filter logits and targets for missing positions only
    missing_logits = logits_flattened[exist_mask_flattened]
    missing_targets = targets_flattened[exist_mask_flattened]
    
    # Convert targets to one-hot encoded form to represent true distribution
    true_dist = torch.zeros_like(missing_logits).scatter_(1, missing_targets.unsqueeze(1), 1)
    
    # Max pooling across the missing positions for each character
    max_logits, _ = torch.max(missing_logits, dim=0, keepdim=True)
    max_logits_expanded = max_logits.expand_as(missing_logits)
    
    # Normalize the max pooled logits to get a probability distribution
    pred_prob = F.softmax(max_logits_expanded, dim=-1)
    
    # Calculate KL Divergence between normalized predictions and true distribution
    loss = F.kl_div(F.log_softmax(pred_prob, dim=-1), true_dist, reduction='batchmean')

    return loss


def kl_div_loss(output_probs, targets, exist_mask):
    """
    Calculate the KL divergence loss between model output probabilities and target distribution
    based on the frequency of missing characters.

    Parameters:
    - output_probs: Tensor [batch_size, vocab_size] - probabilities for each character from the model.
    - targets: Tensor [batch_size, seq_len] - indices of all characters in each sequence.
    - exist_mask: Tensor [batch_size, seq_len] - indicates positions of missing characters (1 for missing).

    Returns:
    - Loss value as a scalar tensor.
    """
    batch_size, seq_len = targets.size()
    vocab_size = output_probs.size(1)
    device = output_probs.device
    
    # Initialize a tensor to store the target distributions
    target_distributions = torch.zeros(batch_size, vocab_size, device=device)
    
    for i in range(batch_size):
        # Identify the indices of missing characters for this sequence
        missing_indices = exist_mask[i].bool()
        missing_chars = targets[i, missing_indices]
        
        # Build the frequency distribution of missing characters
        for char_idx in missing_chars:
            target_distributions[i, char_idx] += 1
        
        # Normalize the distribution to sum to 1
        target_distributions[i] /= target_distributions[i].sum()
    
    # Calculate the KL divergence loss
    loss = F.kl_div(output_probs.log(), target_distributions, reduction='batchmean')
    return loss




def create_presence_distribution(targets, mask):
    """
    Creates a presence/absence distribution tensor based on characters selected by the mask.

    Parameters:
    - targets: Tensor of shape [batch_size, seq_len] with character indices.
    - mask: Tensor of shape [batch_size, seq_len] indicating characters to consider (1) or ignore (0).

    Returns:
    - A tensor of shape [batch_size, 26] where each element is 1 if the character is present and 0 otherwise.
    """
    device = targets.device
    batch_size, seq_len = targets.size()
    vocab_size = 26  # Fixed size for the alphabet

    # Initialize the distribution tensor
    target_distribution = torch.zeros(batch_size, vocab_size, dtype=torch.float, device=device)
    
    # Apply the mask to filter targets
    filtered_targets = targets * mask
    
    # Flatten the filtered targets and create a corresponding batch index tensor
    flat_filtered_targets = filtered_targets.flatten()
    flat_batch_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand(-1, seq_len).flatten()
    
    # Filter out zeros added by masking
    valid_indices = flat_filtered_targets.nonzero().squeeze()
    valid_targets = flat_filtered_targets[valid_indices]
    valid_batch_indices = flat_batch_indices[valid_indices]
    
    # Accumulate counts in the target distribution tensor
    target_distribution.index_put_((valid_batch_indices, valid_targets), 
                                   torch.ones_like(valid_targets, dtype=torch.float), 
                                   accumulate=True)
    
    # Convert counts to presence (1) or absence (0)
    presence_distribution = (target_distribution > 0).float()

    return presence_distribution





def cross_entropy_costum(output_probs, targets, exist_mask):
    """
    Calculate the KL divergence loss between model output probabilities and target distribution
    based on the frequency of missing characters, avoiding explicit loops.

    Parameters:
    - output_probs: Tensor [batch_size, vocab_size] - probabilities for each character from the model.
    - targets: Tensor [batch_size, seq_len] - indices of all characters in each sequence.
    - exist_mask: Tensor [batch_size, seq_len] - indicates positions of missing characters (1 for missing).

    Returns:
    - Loss value as a scalar tensor.
    """
    
    frequency = create_presence_distribution(targets, exist_mask)
    
    loss = F.binary_cross_entropy(output_probs, frequency)

    
    return loss



In [None]:

def kl_div_loss_fast(output_probs, targets, exist_mask):
    """
    Calculate the KL divergence loss between model output probabilities and target distribution
    based on the frequency of missing characters, avoiding explicit loops.

    Parameters:
    - output_probs: Tensor [batch_size, vocab_size] - probabilities for each character from the model.
    - targets: Tensor [batch_size, seq_len] - indices of all characters in each sequence.
    - exist_mask: Tensor [batch_size, seq_len] - indicates positions of missing characters (1 for missing).

    Returns:
    - Loss value as a scalar tensor.
    """
    batch_size, seq_len = targets.size()
    vocab_size = output_probs.size(1)
    device = output_probs.device

    # Create a flat index for batch elements
    batch_indices = torch.arange(batch_size).unsqueeze(1).expand(-1, seq_len).reshape(-1)
    # Flatten targets and mask to use for filtering missing characters
    flat_targets = targets.reshape(-1)
    flat_mask = exist_mask.reshape(-1)

    # Filter to get indices and targets of missing characters only
    missing_indices = batch_indices[flat_mask]
    missing_targets = flat_targets[flat_mask]

    # Initialize tensor to store the target distributions
    target_distributions = torch.zeros(batch_size, vocab_size, device=device)

    # Use scatter_add_ to accumulate the counts of each character directly into the target distribution
    target_distributions.index_put_((missing_indices, missing_targets), torch.ones_like(missing_targets, dtype=torch.float), accumulate=True)

    # Normalize each row to sum to 1 to get proper distributions
    target_distributions /= target_distributions.sum(dim=1, keepdim=True)

    # Calculate the KL divergence loss
    loss = F.kl_div(output_probs.log(), target_distributions, reduction='batchmean')

    return loss


In [None]:
class HangmanPredictor(nn.Module):
    def __init__(self, vocab_size, d_model, output_size, n_enc_layers=4):
        super(HangmanPredictor, self).__init__()
        # Initialize embeddings and encoder as before
        self.input_embedding = InputEmbedding(d_model=d_model, vocab_size=vocab_size)
        self.position_embedding = PositionEmbedding(d_model=d_model, dropout=0.1, max_len=1000)

        self_attention_layer = MultiHeadAttentionSec(d_model, h = 4, dropout=0.1)
        feed_forward_layer = FeedForwardSec(d_model, d_ff = 512, dropout=0.1)
        encoder_layer = EncoderSec(self_attention_layer, feed_forward_layer, dropout=0.1)
        self.encoder = Encoder(nn.ModuleList([encoder_layer for _ in range(n_enc_layers)]))

        self.predictor = RefinedPredictor(d_model = d_model, vocab_size = output_size)

    def forward(self, x, mask=None, exist_mask=None):
        # Apply input and position embeddings
        x = self.input_embedding(x)
        x = self.position_embedding(x)
        
        # Pass through the encoder layers
        x = self.encoder(x, mask)
        
        # Generate predictions using the predictor
        predictions = self.predictor(x)
        return predictions
    
    @property
    def device(self):
        # Return the device of the model's parameters
        return next(self.parameters()).device

In [None]:


def train(model, data_loader, optimizer, epochs=10):
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0.0
        progress_bar = tqdm(data_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=True)
        for sample, target, exist_mask in progress_bar:
            optimizer.zero_grad()
            
            # Adjust dimensions if necessary
            sample = sample.to(model.device)
            target = target.to(model.device)
            exist_mask = exist_mask.to(model.device)
            
            # Forward pass
            output = model(sample)
            # print("The shape of output is:", output.shape)
            
            # Compute custom loss
            # loss = calculate_focused_loss(output, target, exist_mask)
            loss = calculate_improved_loss(output, target, exist_mask)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}")




In [None]:


def train_v2(model, train_loader, val_loader, optimizer, epochs=10):
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=True)
        
        for sample, target, exist_mask in progress_bar:
            optimizer.zero_grad()
            
            # Adjust dimensions if necessary
            sample = sample.to(model.device)
            target = target.to(model.device)
            exist_mask = exist_mask.to(model.device)
            
            # Forward pass through the model to get output probabilities
            output_probs = model(sample)
            
            # Ensure the output is compatible with the KL divergence loss expectations
            # This might require you to adjust either the model's output or how you handle it here,
            # depending on the output format of your ImprovedPredictor.
            
            # Compute the KL divergence loss
            loss = kl_div_loss_fast(output_probs, target, exist_mask)
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            # Accumulate loss
            total_loss += loss.item()
        
        # Calculate average loss for the epoch
        avg_train_loss = total_loss / len(train_loader)
        
        # Validation Phase
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():  # No need to track gradients during validation
            val_progress_bar = tqdm(val_loader, desc=f'Epoch {epoch+1}/{epochs} [Val]', leave=True)
            for sample, target, exist_mask in val_progress_bar:
                sample = sample.to(model.device)
                target = target.to(model.device)
                exist_mask = exist_mask.to(model.device)
                
                output_probs = model(sample)
                val_loss = kl_div_loss_fast(output_probs, target, exist_mask)
                
                total_val_loss += val_loss.item()
                
        avg_val_loss = total_val_loss / len(val_loader)
        
        print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

        
        


In [None]:
from torch.nn.utils.rnn import pad_sequence

def custom_collate_fn(batch):
    samples, targets, exist_masks = zip(*batch)

    # Pad samples and exist_masks to have the same length
    # samples_padded = pad_sequence([torch.tensor(s, dtype=torch.long) for s in samples], batch_first=True, padding_value=0)
    # targets_padded = pad_sequence([torch.tensor(t, dtype=torch.long) for t in targets], batch_first=True, padding_value=-1)  # Using -1 as padding value for targets
    samples_padded = pad_sequence([s.clone().detach() for s in samples], batch_first=True, padding_value=0)
    targets_padded = pad_sequence([t.clone().detach() for t in targets], batch_first=True, padding_value=-1)


    # Since exist_mask indicates positions that should be predicted, we can treat it similarly to targets
    #exist_masks_padded = pad_sequence([torch.tensor(em, dtype=torch.bool) for em in exist_masks], batch_first=True, padding_value=False)  # False indicates non-missing positions
    exist_masks_padded = pad_sequence([em.clone().detach() for em in exist_masks], batch_first=True, padding_value=False)
    return samples_padded, targets_padded, exist_masks_padded

batch_size = 3200
# Use the custom collate function in your DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn, num_workers = 4, pin_memory =True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=custom_collate_fn)


In [None]:
# Model
vocab_size = 27  # Including "_"
d_model = 128
output_size = 26
# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HangmanPredictor(vocab_size=vocab_size, d_model=d_model, output_size=output_size,n_enc_layers=4)

In [15]:
# model.load_state_dict(torch.load("attention_model_parameters_v1.pth")) # d_model = 256 d_ff 1024
# model.load_state_dict(torch.load("attention_model_parameters_v2.pth")) # d_model = 256 d_ff = 1024
model.load_state_dict(torch.load("attention_model_parameters_v3.pth")) # d_model = 128 d_ff = 512

<All keys matched successfully>

In [16]:
# Move the model to the device
model = model.to(device)


# Optimizer
optimizer = Adam(model.parameters(), lr=0.001)

# Train
train_v2(model=model, train_loader=train_loader, val_loader= val_loader,optimizer=optimizer, epochs=10)

Epoch 1/10:   0%|          | 0/11718 [00:10<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 170.00 MiB (GPU 0; 5.38 GiB total capacity; 3.97 GiB already allocated; 0 bytes free; 4.07 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [22]:
# torch.save(model.state_dict(), "attention_model_parameters_v2.pth")
torch.save(model.state_dict(), "attention_model_parameters_v3.pth")

In [18]:
def predict_one_missing_character(hangman_model, guessing_word, excluded_chars, char_to_index, index_to_char):
    hangman_model.eval()  # Ensure the model is in evaluation mode

    # Convert the guessing word into tensor
    input_indices = [char_to_index.get(char, char_to_index['_']) for char in guessing_word]
    input_tensor = torch.tensor([input_indices], dtype=torch.long).to(hangman_model.device)

    # Predict probabilities
    with torch.no_grad():
        output_probs = hangman_model(input_tensor)[0]  # Assuming the output is in the desired format
        # print(output_probs.shape)
        # print(output_probs)
    
    # Apply softmax to convert logits into probabilities
    # probs = F.softmax(output_probs, dim=-1)

    # Identify the index of the first missing character
    missing_idx = guessing_word.find('_')
    if missing_idx == -1:
        return None  # No missing character found
    
    # Mask out excluded characters by setting their probabilities to 0
    for char in excluded_chars:
        if char in char_to_index:
            output_probs[char_to_index[char]] = 0
    print(output_probs)
    # Find the most probable character for the missing position
    max_value_index = torch.argmax(output_probs)
    # print(index_to_char)
    # print(max_value_index)
    print(char_to_index)
    predicted_char = index_to_char[max_value_index.item()]

    return predicted_char
# Example usage, assuming char_to_index and index_to_char are defined properly
char_to_index = char_to_num_input = {char: i for i, char in enumerate(chars)}
index_to_char = {index: char for char, index in char_to_index.items()}
guessing_word = "m_gi_"
excluded_chars = []
predicted_word = predict_one_missing_character(model, guessing_word, excluded_chars, char_to_index, index_to_char)
print(predicted_word)


tensor([2.5458e-04, 1.3037e-03, 1.6567e-02, 9.9969e-03, 4.4593e-02, 2.5904e-04,
        5.2739e-05, 4.9735e-02, 1.1763e-04, 3.8575e-05, 1.6386e-03, 2.8374e-02,
        9.7916e-05, 2.6587e-01, 1.2098e-01, 5.9776e-03, 1.1408e-05, 1.8605e-01,
        2.7630e-02, 7.6370e-02, 1.1172e-01, 3.2902e-04, 6.5527e-04, 1.4593e-03,
        4.3444e-02, 6.4802e-03])
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, '_': 26}
n


In [17]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Attention parameter v1 is use refined predictor d_model = 256 d_ff = 1024

In [None]:
# Attention parameter v2 is use refined predictor_v2 and cross entrophy d_model = 256, d_ff = 1024

In [None]:
# Attention parameter v3 d_model = 128, d_ff = 512