Training a character-level language model on long strings of text comes down to one task: predicting the next character from a context of the characters that precede it.

Given a string and a context window length, the string is sliced into (context -> next character) pairs like this:

'Hello world!', 5

'Hello' -> ' '
'ello ' -> 'w'
'llo w' -> 'o'

So the first order of business is building a transformer that slices strings into context/target pairs in this fashion.

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
import html
import numpy as np

class LanguageModelSequencer(BaseEstimator, TransformerMixin):
    '''
    Transform strings into context and target character sequence numbers, using the ordinal
    value of each character.

    Every run of `context_length` consecutive characters becomes one context row, and the
    character that immediately follows the run becomes the target for that row.
    '''
    def __init__(self, context_length=16, maximum_ordinal=2**16):
        '''
        Parameters
        ----------
        context_length : int
            This number of characters will be used as a context to predict future characters.
        maximum_ordinal : int
            Limit total memory use in case you run into very high unicode characters. Any
            character whose ordinal is larger is encoded as this value instead.
        '''
        self.context_length = context_length
        self.maximum_ordinal = maximum_ordinal
        # character analyzer built by scikit-learn; kept as an attribute for existing callers,
        # transform() itself slices the raw strings directly
        self.wordbreaker = CountVectorizer(lowercase=True, ngram_range=(1,1), analyzer='char').build_analyzer()

    def fit(self, strings, y=None, **kwargs):
        '''
        No need to fit, the encoding is stateless.

        `y` is accepted and ignored so that `fit_transform(strings, y)` works.
        '''
        return self

    def transform(self, strings):
        '''
        Transform an iterable source of strings into dense matrices
        of character identifiers.

        Parameters
        ----------
        strings : iterable
            An iterable of source strings. A single string is accepted as well.

        Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y). X is 2 dimensional [sample_index, character_position] and holds the
            32 bit character identifiers of each context. Y is 1 dimensional [sample_index] and
            holds the 32 bit character identifier to predict for each context. Strings that are
            not longer than `context_length` contribute no samples.

        Raises
        ------
        ValueError
            If `context_length` is negative.
        '''
        # forgive passing a single string
        if isinstance(strings, str):
            strings = [strings]
        width = self.context_length
        if width < 0:
            raise ValueError('context_length must not be negative')
        cap = self.maximum_ordinal
        # column offsets of one window, added to each window start to index every context at once
        offsets = np.arange(width)
        context_blocks = []
        target_blocks = []
        for string in strings:
            sample_count = len(string) - width
            if sample_count <= 0:
                # too short to hold a full context plus a target
                continue
            # encode each character once, rather than once per window it appears in
            codes = np.array([min(ord(character), cap) for character in string], dtype=np.int32)
            window_index = np.arange(sample_count)[:, np.newaxis] + offsets
            context_blocks.append(codes[window_index])
            # the target of the window starting at j is the character at j + width
            target_blocks.append(codes[width:])
        if not context_blocks:
            return np.zeros((0, width), dtype=np.int32), np.zeros(0, dtype=np.int32)
        return np.concatenate(context_blocks), np.concatenate(target_blocks)
                


In [2]:
# Widen numpy's printed line width so a full context row can be shown on a single line.
np.set_printoptions(linewidth=100)
# Sequence a sample sentence with the default settings; the cell displays the (X, Y) ordinals.
LanguageModelSequencer().fit_transform('Transform a string into context and target character sequence numbers.')

(array([[ 84, 114,  97, 110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105],
        [114,  97, 110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110],
        [ 97, 110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103],
        [110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32],
        [115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105],
        [102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110],
        [111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116],
        [114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111],
        [109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111,  32],
        [ 32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111,  32,  99],
        [ 97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111,  32,  99, 111],
        [ 

With sequences in hand, we have two basic approaches to vectorizing:
* Normalization, which keeps one number per character, so the arrays keep the same shape as the sequences
* One Hot, which adds another dimension to the matrix, using much more memory


In [3]:
class BaseLanguageModel(BaseEstimator, TransformerMixin):
    '''
    Base language model uses a LanguageModelSequencer to create character ordinals
    and then applies a final transformation in order to create vectors.

    Subclasses supply the final transformation.
    '''
    def __init__(self, context_length=16, maximum_ordinal=2**16):
        '''
        Parameters
        ----------
        context_length : int
            This number of characters will be used as a context to predict future characters.
        maximum_ordinal : int
            Limit total memory use in case you run into very high unicode characters.
        '''
        # stored under the constructor argument names so the settings can be read back
        # from the instance, which is what BaseEstimator.get_params() looks for
        self.context_length = context_length
        self.maximum_ordinal = maximum_ordinal
        # pass the requested context length through instead of a fixed 16
        self.sequencer = LanguageModelSequencer(context_length=context_length, maximum_ordinal=maximum_ordinal)

    def fit(self, strings, y=None, **kwargs):
        '''
        Nothing to fit.

        `y` and extra keyword arguments are accepted and ignored, matching
        LanguageModelSequencer.fit and allowing `fit_transform(strings, y)`.
        '''
        return self
    

First, the normalized model: this is as simple as dividing each sequence number by the maximum ordinal. It keeps the same shape as the sequence numbers (one value per character, now stored as floating point). The trick is reading back predictions, which will be floating point and need to be scaled back up and rounded to decode.

In [4]:
class NormalizedLanguageModel(BaseLanguageModel):
    '''
    Create a language model with normalized representations of characters on the
    range of 0-1.
    '''

    def transform(self, strings):
        '''
        Transform strings into a dense (X, Y) pairing

        Parameters
        ----------
        strings : iterable
            An iterable of source strings.

        Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y). X is 2 dimensional [sample_index, character], Y is one dimensional
            [sample_index]. Both hold floating point values: the character ordinal divided by
            the sequencer's maximum ordinal.
        '''
        X, Y = self.sequencer.transform(strings)
        scale = self.sequencer.maximum_ordinal
        return X / scale, Y / scale

    def inverse_transform(self, X):
        '''
        Given a matrix of numbers, reverse the transformation and return a matrix of characters.

        Parameters
        ----------
        X : array-like
            Normalized values, for example the output of `transform` or model predictions.

        Returns
        -------
        np.ndarray
            An array of single character strings with the same shape as X.
        '''
        limit = self.sequencer.maximum_ordinal
        scaled = np.asarray(X) * limit
        # predictions can stray slightly outside 0-1; clip before casting so chr() never
        # sees a negative number or something beyond the last unicode code point (0x10FFFF)
        ordinals = np.clip(np.rint(scaled), 0, min(limit, 0x10FFFF)).astype(np.int32)
        # an explicit output type lets vectorize handle empty input, where it has no
        # first element to infer the type from
        decode = np.vectorize(chr, otypes=[str])
        return decode(ordinals)

In [5]:
# Encode the sample sentence with the normalized model, keeping X and Y for the cells below.
X, Y = NormalizedLanguageModel().fit_transform('Transform a string into context and target character sequence numbers.')

In [6]:
# Display the normalized contexts and targets.
X, Y

(array([[ 0.00128174,  0.0017395 ,  0.0014801 ,  0.00167847,  0.00175476,  0.0015564 ,  0.00169373,
          0.0017395 ,  0.00166321,  0.00048828,  0.0014801 ,  0.00048828,  0.00175476,  0.00177002,
          0.0017395 ,  0.00160217],
        [ 0.0017395 ,  0.0014801 ,  0.00167847,  0.00175476,  0.0015564 ,  0.00169373,  0.0017395 ,
          0.00166321,  0.00048828,  0.0014801 ,  0.00048828,  0.00175476,  0.00177002,  0.0017395 ,
          0.00160217,  0.00167847],
        [ 0.0014801 ,  0.00167847,  0.00175476,  0.0015564 ,  0.00169373,  0.0017395 ,  0.00166321,
          0.00048828,  0.0014801 ,  0.00048828,  0.00175476,  0.00177002,  0.0017395 ,  0.00160217,
          0.00167847,  0.00157166],
        [ 0.00167847,  0.00175476,  0.0015564 ,  0.00169373,  0.0017395 ,  0.00166321,  0.00048828,
          0.0014801 ,  0.00048828,  0.00175476,  0.00177002,  0.0017395 ,  0.00160217,  0.00167847,
          0.00157166,  0.00048828],
        [ 0.00175476,  0.0015564 ,  0.00169373,  0.00173

Here is the inverse transform to reconstruct characters from numbers.

In [7]:
# Decode the contexts; a fresh default instance works because decoding only depends on
# maximum_ordinal, and the instance that produced X used the same default.
NormalizedLanguageModel().inverse_transform(X)

array([['T', 'r', 'a', 'n', 's', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i'],
       ['r', 'a', 'n', 's', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n'],
       ['a', 'n', 's', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g'],
       ['n', 's', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' '],
       ['s', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i'],
       ['f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n'],
       ['o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't'],
       ['r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't', 'o'],
       ['m', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't', 'o', ' '],
       [' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't', 'o', ' ', 'c'],
       ['a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't', 'o', ' ', 'c', 'o'],
       [' ', 's', 't'

In [8]:
# Decode the one dimensional targets the same way.
NormalizedLanguageModel().inverse_transform(Y)

array(['n', 'g', ' ', 'i', 'n', 't', 'o', ' ', 'c', 'o', 'n', 't', 'e', 'x', 't', ' ', 'a', 'n',
       'd', ' ', 't', 'a', 'r', 'g', 'e', 't', ' ', 'c', 'h', 'a', 'r', 'a', 'c', 't', 'e', 'r',
       ' ', 's', 'e', 'q', 'u', 'e', 'n', 'c', 'e', ' ', 'n', 'u', 'm', 'b', 'e', 'r', 's', '.'],
      dtype='<U1')

Now for the one hot encoded model. This will use more memory, but reading the output is as simple as predicting classes with softmax: just pick the character with the largest predicted value!

First, let's see how much memory the normalized model used, so we have something to compare against.

In [9]:
# Total bytes held by X and Y, which at this point are the normalized arrays.
X.nbytes + Y.nbytes

7344

In [10]:
from keras.utils import to_categorical
class OneHotLanguageModel(BaseLanguageModel):
    '''
    Create a language model with one hot representations in a character matrix.
    '''

    def transform(self, strings):
        '''
        Transform strings into a dense (X, Y) pairing

        Parameters
        ----------
        strings : iterable
            An iterable of source strings.

        Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y) 3 dimensional [sample_index, character_position, one_hot],
            a 2 dimensional [sample_index, one_hot]. The one hot axis has
            `maximum_ordinal` entries.
        '''
        class_count = self.sequencer.maximum_ordinal
        X, Y = self.sequencer.transform(strings)
        # the sequencer caps ordinals at maximum_ordinal itself, which is one past the last
        # valid one hot index; fold those into the final class instead of failing
        X = np.minimum(X, class_count - 1)
        Y = np.minimum(Y, class_count - 1)
        # to_categorical may flatten the dimensions, so reshape to our sample/character/one_hot goal
        # this is a lot faster than making another loop
        X_target_shape = X.shape + (class_count,)
        X = to_categorical(X, class_count).reshape(X_target_shape)
        Y = to_categorical(Y, class_count)
        return X, Y

    def inverse_transform(self, X):
        '''
        Given a matrix of one hot encodings, reverse the transformation and return a matrix of characters.

        Parameters
        ----------
        X : array-like
            One hot encodings or per-class scores, with the classes on the last axis.

        Returns
        -------
        np.ndarray
            An array of single character strings, shaped like X without its last axis.
        '''
        # the position of the largest value on the last axis is the character ordinal
        ordinals = np.asarray(X).argmax(-1).astype(np.int32)
        # an explicit output type lets vectorize handle empty input, where it has no
        # first element to infer the type from
        decode = np.vectorize(chr, otypes=[str])
        return decode(ordinals)

Using TensorFlow backend.
  return f(*args, **kwds)


In [11]:
# Encode the same sample sentence with the one hot model, replacing the normalized X and Y.
X, Y = OneHotLanguageModel().fit_transform('Transform a string into context and target character sequence numbers.')

In [12]:
# Display the one hot contexts and targets.
X, Y

(array([[[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]],
 
        [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]],
 
        [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]],
 
        ..., 
        [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0

In [13]:
# Decode the one hot contexts back into characters by taking the argmax of the last axis.
OneHotLanguageModel().inverse_transform(X)

array([['T', 'r', 'a', 'n', 's', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i'],
       ['r', 'a', 'n', 's', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n'],
       ['a', 'n', 's', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g'],
       ['n', 's', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' '],
       ['s', 'f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i'],
       ['f', 'o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n'],
       ['o', 'r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't'],
       ['r', 'm', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't', 'o'],
       ['m', ' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't', 'o', ' '],
       [' ', 'a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't', 'o', ' ', 'c'],
       ['a', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'i', 'n', 't', 'o', ' ', 'c', 'o'],
       [' ', 's', 't'

In [14]:
# Decode the one hot targets the same way.
OneHotLanguageModel().inverse_transform(Y)

array(['n', 'g', ' ', 'i', 'n', 't', 'o', ' ', 'c', 'o', 'n', 't', 'e', 'x', 't', ' ', 'a', 'n',
       'd', ' ', 't', 'a', 'r', 'g', 'e', 't', ' ', 'c', 'h', 'a', 'r', 'a', 'c', 't', 'e', 'r',
       ' ', 's', 'e', 'q', 'u', 'e', 'n', 'c', 'e', ' ', 'n', 'u', 'm', 'b', 'e', 'r', 's', '.'],
      dtype='<U1')

And again, check the memory used.

In [15]:
# Total bytes held by X and Y, which are now the one hot arrays.
X.nbytes + Y.nbytes

481296384