Turning long strings of text into machine learning models for prediction is all about predicting the next character from a context of previous characters.

Given a string, and a context window length, a string is transformed like this:

'Hello world!', 5

'Hello' -> ' '
'ello ' -> 'w'
'llo w' -> 'o'

So the first order of business is building a transformer to build strings in such a fashion.

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
import html
import numpy as np

class CharacterEncoder(BaseEstimator, TransformerMixin):
    '''
    Transform strings into context and target character sequence numbers, using the
    ordinal value of each character.

    Each string is lowercased, stripped of surrounding whitespace and terminated with a
    null character. A window of context_length characters then slides over it one
    character at a time: every window is a context sample, and the character that
    follows the window is the target to predict.
    '''
    def __init__(self, context_length=16, maximum_ordinal=2**16):
        '''
        Parameters
        ----------
        context_length : int
            This number of characters will be used as a context to predict future characters.
        maximum_ordinal : int
            Limit total memory use in case you run into very high unicode characters.
            Ordinals above this value are replaced by this value.
        '''
        self.context_length = context_length
        self.maximum_ordinal = maximum_ordinal

    def fit(self, strings, y=None, **kwargs):
        '''
        No need to fit.

        Parameters
        ----------
        strings : iterable
            Ignored.
        y : object
            Ignored, accepted so that fit_transform(X, y) can be called.

        Returns
        -------
        CharacterEncoder
            This encoder, unchanged.
        '''
        return self

    def transform(self, strings):
        '''
        Transform an iterable source of strings into a dense matrix
        of character identifiers.

        Each sample will be a string snippet of context_length characters.
        A string that is not longer than context_length once the null terminator
        has been added contributes no samples.

        Parameters
        ----------
        strings : iterable
            An iterable of source strings. A single string is accepted as well.

        Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y) 2 dimensional [sample_index, character], with a 32 bit character identifier, and
            a one dimensional [sample_index] with a 32 bit character identifier to predict.
        '''
        # forgive passing a single string, str subclasses included
        if isinstance(strings, str):
            strings = [strings]
        # buffer up contexts and targets, we'll be predicting target characters
        # from context strings
        contexts = []
        targets = []
        for string in strings:
            # lowercase and stripped of surrounding whitespace, makes the model more compact;
            # null character termination for each string
            string = string.lower().strip() + chr(0)
            # numerical encoding of character values, done once per string rather than
            # once per overlapping window
            ordinals = [min(ord(character), self.maximum_ordinal) for character in string]
            for start in range(len(ordinals) - self.context_length):
                stop = start + self.context_length
                contexts.append(ordinals[start:stop])
                targets.append(ordinals[stop])
        # blocks of memory to hold character ordinals, allocated up front so the
        # shapes are right even when there are no samples at all
        X = np.zeros((len(contexts), self.context_length), dtype=np.int32)
        Y = np.zeros(len(targets), dtype=np.int32)
        if contexts:
            X[:] = contexts
            Y[:] = targets
        return X, Y


In [2]:
# widen numpy's printed line width so each 16 character context fits on one row
np.set_printoptions(linewidth=100)
# a bare string is accepted in place of an iterable of strings; the result is an (X, Y) tuple
CharacterEncoder().fit_transform('Transform a string into context and target character sequence numbers.')

(array([[116, 114,  97, 110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105],
        [114,  97, 110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110],
        [ 97, 110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103],
        [110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32],
        [115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105],
        [102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110],
        [111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116],
        [114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111],
        [109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111,  32],
        [ 32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111,  32,  99],
        [ 97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111,  32,  99, 111],
        [ 

With sequences in hand -- we have two basic approaches to vectorizing:
* Normalization, which uses the same amount of memory
* One Hot, which makes another dimension in the matrix, using more memory

In this encoder, we will be using both -- normalized inputs and one hot encoded outputs.


In [3]:
class LanguageModelVectorizer(BaseEstimator, TransformerMixin):
    '''
    Base language model uses a CharacterEncoder to create character ordinals
    and then applies a one hot transformation in order to create vectors.
    '''
    def __init__(self, context_length=16, maximum_ordinal=2**16):
        '''
        Parameters
        ----------
        context_length : int
            This number of characters will be used as a context to predict future characters.
        maximum_ordinal : int
            Limit total memory use in case you run into very high unicode characters.
            This is also the length of the one hot axis.
        '''
        # keep the constructor arguments as attributes of the same name, which is
        # what BaseEstimator.get_params (and with it repr and clone) reads back
        self.context_length = context_length
        self.maximum_ordinal = maximum_ordinal
        # built once here; transform and inverse_transform read their settings from it
        self.sequencer = CharacterEncoder(context_length, maximum_ordinal)

    def fit(self, strings, y=None, **kwargs):
        '''
        Nothing to fit.

        Parameters
        ----------
        strings : iterable
            Ignored.
        y : object
            Ignored, accepted so that fit_transform(X, y) can be called.

        Returns
        -------
        LanguageModelVectorizer
            This vectorizer, unchanged.
        '''
        return self

    def transform(self, strings):
        '''
        Transform strings into a dense one hot (X, Y) pairing.

        Parameters
        ----------
        strings : iterable
            An iterable of source strings.

        Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y) of boolean arrays: X is 3 dimensional
            [sample_index, character, ordinal] and Y is 2 dimensional
            [sample_index, ordinal], with a single True per character marking its ordinal.
        '''
        # character sequence numbers
        X, Y = self.sequencer.transform(strings)
        width = self.sequencer.maximum_ordinal
        # the encoder clamps ordinals to maximum_ordinal inclusive, which is one past
        # the end of the one hot axis -- fold those into the last slot
        X = np.minimum(X, width - 1)
        Y = np.minimum(Y, width - 1)
        # one hot encoding; builtin bool because the np.bool alias is missing
        # from several numpy releases
        x = np.zeros(X.shape + (width,), dtype=bool)
        y = np.zeros(Y.shape + (width,), dtype=bool)
        samples = np.arange(X.shape[0])
        positions = np.arange(X.shape[1])
        # broadcast (sample, position) index pairs against the ordinal matrix
        x[samples[:, None], positions[None, :], X] = True
        y[samples, Y] = True
        return x, y

    def inverse_transform(self, X):
        '''
        Given a matrix of one hot encodings, reverse the transformation and return
        the decoded characters joined into a single string.

        Parameters
        ----------
        X : np.ndarray
            Two dimensional [sample_index, ordinal] one hot encodings, such as the
            Y returned by transform.

        Returns
        -------
        str
            One character per sample, in sample order.
        '''
        ordinals = X.argmax(-1)
        # a plain join also copes with zero samples, where np.vectorize raises
        return ''.join(chr(ordinal) for ordinal in ordinals)

First with the normalized model, this is as simple as division by the maximum ordinal the encoder allows. It uses no more memory than sequence numbers; the trick is reading back predictions -- which will be floating point, with the need to round to decode.

In [4]:
# one hot encode the sample sentence: X holds the contexts, Y the characters to predict
X, Y = LanguageModelVectorizer().fit_transform('Transform a string into context and target character sequence numbers.')

In [5]:
# display the one hot encoded contexts and targets
X, Y

(array([[[False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         ..., 
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False]],
 
        [[False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         ..., 
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False]],
 
        [[False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         ..., 
         [False, False, False, ..., False, False, False],
         [False, Fals

One hot outputs will eventually be used to turn back into strings, let's make sure we can get characters from one-hots.

In [6]:
# decode the one hot targets back into text; the first context_length characters of the
# sentence only ever serve as context, so they are absent from the result
LanguageModelVectorizer().inverse_transform(Y)

'ng into context and target character sequence numbers.'

Let's see how much memory we used.

In [7]:
# total bytes held by the one hot context and target arrays
X.nbytes + Y.nbytes

61276160

And now, an alternate approach to encoding -- normalization to create a dense character encoding.

In [8]:
class NormalizedLanguageModelVectorizer(BaseEstimator, TransformerMixin):
    '''
    Create a language model with normalized representations of characters on the
    range of 0-1. Not a one hot encoding, but a dense encoding of character values.
    '''

    def __init__(self, context_length=16, maximum_ordinal=2**16):
        '''
        Parameters
        ----------
        context_length : int
            This number of characters will be used as a context to predict future characters.
        maximum_ordinal : int
            Limit total memory use in case you run into very high unicode characters.
            Ordinals are divided by this value to normalize them.
        '''
        # keep the constructor arguments as attributes of the same name, which is
        # what BaseEstimator.get_params (and with it repr and clone) reads back
        self.context_length = context_length
        self.maximum_ordinal = maximum_ordinal
        # built once here; transform and inverse_transform read their settings from it
        self.sequencer = CharacterEncoder(context_length, maximum_ordinal)

    def fit(self, strings, y=None, **kwargs):
        '''
        Nothing to fit.

        Parameters
        ----------
        strings : iterable
            Ignored.
        y : object
            Ignored, accepted so that fit_transform(X, y) can be called.

        Returns
        -------
        NormalizedLanguageModelVectorizer
            This vectorizer, unchanged.
        '''
        return self

    def transform(self, strings):
        '''
        Transform strings into a dense, normalized (X, Y) pairing.

        Parameters
        ----------
        strings : iterable
            An iterable of source strings.

        Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y) 2 dimensional [sample_index, character], and a one dimensional
            [sample_index] of characters to predict. Both hold floating point values:
            the character ordinal divided by maximum_ordinal.
        '''
        X, Y = self.sequencer.transform(strings)
        scale = self.sequencer.maximum_ordinal
        return X / scale, Y / scale

    def inverse_transform(self, X):
        '''
        Given normalized character values, reverse the transformation and return
        the decoded characters joined into a single string.

        Parameters
        ----------
        X : np.ndarray
            One dimensional [sample_index] normalized values, such as the Y returned
            by transform or floating point predictions of it.

        Returns
        -------
        str
            One character per sample, in sample order.
        '''
        scale = self.sequencer.maximum_ordinal
        scaled = np.asarray(X) * scale
        ordinals = scaled.round().astype(np.int32)
        # predictions may land slightly outside the encoded range; keep them inside
        # what the encoder produces and what chr accepts
        ordinals = np.clip(ordinals, 0, min(scale, 0x10FFFF))
        # a plain join also copes with zero samples, where np.vectorize raises
        return ''.join(chr(ordinal) for ordinal in ordinals)

In [9]:
# encode the same sentence with the normalized vectorizer instead of one hot
X, Y = NormalizedLanguageModelVectorizer().fit_transform('Transform a string into context and target character sequence numbers.')

In [10]:
# display the normalized contexts and targets
X, Y

(array([[ 0.00177002,  0.0017395 ,  0.0014801 ,  0.00167847,  0.00175476,  0.0015564 ,  0.00169373,
          0.0017395 ,  0.00166321,  0.00048828,  0.0014801 ,  0.00048828,  0.00175476,  0.00177002,
          0.0017395 ,  0.00160217],
        [ 0.0017395 ,  0.0014801 ,  0.00167847,  0.00175476,  0.0015564 ,  0.00169373,  0.0017395 ,
          0.00166321,  0.00048828,  0.0014801 ,  0.00048828,  0.00175476,  0.00177002,  0.0017395 ,
          0.00160217,  0.00167847],
        [ 0.0014801 ,  0.00167847,  0.00175476,  0.0015564 ,  0.00169373,  0.0017395 ,  0.00166321,
          0.00048828,  0.0014801 ,  0.00048828,  0.00175476,  0.00177002,  0.0017395 ,  0.00160217,
          0.00167847,  0.00157166],
        [ 0.00167847,  0.00175476,  0.0015564 ,  0.00169373,  0.0017395 ,  0.00166321,  0.00048828,
          0.0014801 ,  0.00048828,  0.00175476,  0.00177002,  0.0017395 ,  0.00160217,  0.00167847,
          0.00157166,  0.00048828],
        [ 0.00175476,  0.0015564 ,  0.00169373,  0.00173

In [11]:
# scale the normalized targets back up and decode them into text
NormalizedLanguageModelVectorizer().inverse_transform(Y)

'ng into context and target character sequence numbers.'

And again a look at the memory used.

In [None]:
# total bytes held by the normalized context and target arrays
X.nbytes + Y.nbytes