# Simple Language Model
Let's say our training text is "sample text" and `window_size = 3`.

We split the text into the following (input --> target) pairs:
- "sam" --> "p"
- "amp" --> "l"
- "mpl" --> "e"
- "ple" --> " "
- "le " --> "t"
- "e t" --> "e"
- " te" --> "x"
- "tex" --> "t"

Each character gets its own embedding. Given `window_size` consecutive characters (three in this example), we predict the next character - making this equivalent to a classification problem over the characters of the vocabulary.

In [1]:
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm import tqdm

### Custom Dataset

In [None]:
class CharacterDataset(Dataset):
    """Custom Dataset
    
    Parameters
    ----------
    text : str
        Input text that will be used to create the entire database.
    
    window_size : int
        Number of characters to use as input features. Default = 3
        
    vocab_size : int
        Number of characters in the vocabulary. Note that the 
        last character is always reserved for a special "~" 
        out-of-vocabulary character (<UNK> token). Default = 50
    
    Attributes
    ----------
    ch2ix : defaultdict
        Mapping from the character to the position of that
        character in the vocabulary. Note that all characters
        that are not in the vocabulary will get mapped into
        the index `vocab_size - 1`.
    
    ix2ch : dict
        Mapping from the character position in the vocabulary
        to the actual character.
    
    vocabulary : list
        List of all characters. `len(vocabulary) == vocab_size`.
    
    """

    def __init__(self, text, window_size = 3, vocab_size = 50) -> None:
        self.text = text.replace("\n", " ") # Replacing next line to a space
        self.window_size =window_size
        self.ch2ix = defaultdict(lambda: vocab_size - 1)