# Synthetic Dataset

> Generating synthetic datasets of character-level and sentence-level neural activity

In [None]:
# | default_exp synthetic

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

In [None]:
#| export
from typing import List, Set, Tuple
import random
import json

import torch
import torch.nn.functional as F
from torchtyping import TensorType

from neuraltext.dataset import get_vocabs
from neuraltext.utils import mat2dict

### Neural Template

In [None]:
#| export
def load_snippet(character: str, path: str = "./data/snippets/t5.2020.01.13_snippets.mat"):
    "Return the entry stored for `character` in the snippet library loaded from `path`."
    known_characters = get_vocabs().keys()
    assert character in known_characters, f"Vocab {character} not found"

    # TODO: don't hardcode >
    # The library is indexed with "greaterThan" instead of the literal ">" symbol.
    if character == ">":
        character = "greaterThan"
    library = mat2dict(path)
    return library[character]

### Synthetic Sentences

In [None]:
#| export
def generate_random_sentence(
    length: int, # The number of words in the sentence
    vocab_path: str = "./data/english/1000-english-words.json" # The path to the vocabulary
) -> str:
    """Build a space-separated sentence of `length` distinct random words.

    The vocabulary file is a JSON object whose keys are stringified indices
    ("0", "1", ...) and whose values are the words. Words are drawn without
    replacement, so `random.sample` raises `ValueError` when `length` exceeds
    the vocabulary size (or is negative).
    """
    # Read with an explicit encoding so decoding does not depend on the
    # platform's default locale.
    with open(vocab_path, "r", encoding="utf-8") as f:
        words = json.load(f)
    idxs = random.sample(range(len(words)), length)
    # JSON object keys are always strings, hence the str() on each index.
    return " ".join(words[str(idx)] for idx in idxs)

Here is the explanation for the code above:
1. We first load the snippet library that we created in Step 2.
2.  We then create a list of words that we will use to generate our synthetic sentences.  We use the 'rare' word list
    file to increase the frequency of words with rare letters ('z', 'x', 'j', 'q'). The rare word file contains the
    indices of the words in 'wordListFile' with rare letters.
3.  We then generate our synthetic neural data by calling the 'makeSyntheticDataFromRawSnippets' function.
4.  We then combine the character probabilities with the character start signal.
5.  We then cut off the first part of the data so the RNN starts off "hot" randomly in the middle of text.
6.  We then bin the data if 'binSize' is greater than 1.
7.  We then create an error mask that doesn't penalize the RNN for errors that occur before the first character starts.
8.  We then save the synthetic data to a file.

""" Here is the explanation for the code above:
1. First, we define the function named makeSyntheticDataFromRawSnippets
2. Then we define the arguments it takes in.
3. Next, we define the variables that we will use in the function.
4. Then we start a for loop that will iterate over all of the sentences we want to generate.
5. Then we define the variables that we will use inside of the for loop.
6. Then we generate this sentence one character at a time.
7. Then we pick a new word if needed.
8. Then we pick the character snippet to use for the current character.
9. Then we copy the snippet for the current character.
10. Then we linearly time warp the current snippet to add more variability.
11. Then we randomly add in 'blank' pauses with some probability.
12. Then we generate probability targets for this character.
13. Then we fill in the data tensors for this character.
14. Finally, we advance pointer to the next character. """


""" Here is the explanation for the code above:
1. We define a function called makeSyntheticDataFromRawSnippets that takes as input a set of character definitions, a set of snippets, the number of sentences to make, the number of steps in each sentence, and a list of words to use. It also takes some additional arguments that control the behavior of the function. 
2. The function makes a tensor that will hold the neural data, another tensor that will hold the character probabilities, and another tensor that will hold the character start signals. 
3. The function then loops through the number of sentences, and loops through each character in the sentence, picking a snippet for each character, and then adding the snippet to the neural data tensor and the character probability tensor. 
4. The function returns the neural data tensor, the character probability tensor, and the character start signal tensor. """

In [None]:
#| export
class SyntheticSentence:
    "Turn a sentence into concatenated neural snippets plus per-character targets."

    def __init__(self):
        # Mapping from character to integer label, as returned by `get_vocabs`.
        self.vocabs = get_vocabs()

    def word2idx(self, x: str) -> int:
        "Return the vocabulary label of character `x` (raises `KeyError` if unknown)."
        return self.vocabs[x]

    @staticmethod
    def _extract_unique_characters(sentence: str) -> List[str]:
        "Return the distinct non-space characters of `sentence` as a sorted list."
        # Declared static: the helper never touches instance state, and without
        # the decorator an instance call would bind `self` to `sentence`.
        return sorted({char for char in sentence if char != ' '})

    def _generate_character_probs(self, sentence: str) -> TensorType["seq_len", "n_vocabs"]:
        "One-hot encode every character of `sentence` over the vocabulary."
        n_vocabs = len(self.vocabs)

        # TODO: don't hardcode
        # Spaces are represented by the ">" vocabulary entry.
        fixed_sentence = sentence.replace(" ", ">")
        labels = [self.word2idx(x) for x in fixed_sentence]
        # Force an integer dtype: an empty list would otherwise become a float
        # tensor, which `F.one_hot` rejects.
        labels = torch.tensor(labels, dtype=torch.long)

        return F.one_hot(labels, n_vocabs)

    def _generate_character_signal(self, sentence) -> TensorType["seq_len"]:
        "Return a vector with 1.0 for every character except the last, which is 0.0."
        seq_len = len(sentence)
        x = torch.ones(seq_len)
        if seq_len:  # an empty sentence has no last element to clear
            x[-1] = 0
        return x

    def generate(self, sentence: str) -> Tuple[
        TensorType["n_steps", "n_channels"], # neural data
        TensorType["seq_len", "n_vocabs"], # character probabilities
        TensorType["seq_len"] # character signals
    ]:
        """Generate synthetic neural data and targets for `sentence`.

        The sentence is lower-cased, spaces are mapped to ">", and one snippet
        per character is concatenated along dim 0. An empty sentence yields
        empty tensors.
        """
        sentence = sentence.lower()

        # Targets depend only on the whole sentence, so build them once up
        # front; this also keeps them defined when the sentence is empty.
        probs = self._generate_character_probs(sentence)
        signals = self._generate_character_signal(sentence)

        templates = {}  # per-character cache so repeated characters load once
        # Seed with the same empty float tensor the accumulation used to start
        # from, so the result for an empty sentence is unchanged.
        chunks = [torch.tensor([])]
        for character in sentence:
            character = ">" if character == " " else character
            if character not in templates:
                # Only the first entry of the stored snippets is used as the
                # template (NOTE(review): presumably the first trial - confirm).
                templates[character] = torch.tensor(load_snippet(character)[0][0])
            chunks.append(templates[character])

        # Single concatenation instead of growing the tensor on every step.
        neural_data = torch.cat(chunks, dim=0)
        return neural_data, probs, signals