Building a predictive machine learning model from long strings of text comes down to predicting the next character from a context of preceding characters.

Given a string and a context window length, the string is split into (context -> next character) pairs like this:

'Hello world!', 5

'Hello' -> ' '
'ello ' -> 'w'
'llo w' -> 'o'

So the first order of business is building a transformer that slices strings into context/target pairs in this fashion.

In [46]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
import html
import numpy as np

class LanguageModelSequencer(BaseEstimator, TransformerMixin):
    """
    Transform strings into context and target character sequence numbers, using the
    ordinal value (Unicode code point) of each character.

    A window of ``context_length`` characters slides over each string one character
    at a time; every window becomes a context row, and the character immediately
    following the window becomes the target to predict.
    """

    def __init__(self, context_length=16, maximum_ordinal=2**16):
        """
        Parameters
        ----------
        context_length : int
            This number of characters will be used as a context to predict future characters.
        maximum_ordinal : int
            Upper bound for character identifiers: any ordinal above this value is
            replaced by this value. Limits total memory use in case you run into
            very high unicode characters.
        """
        self.context_length = context_length
        self.maximum_ordinal = maximum_ordinal
        # NOTE(review): this character analyzer is not used by any method of this
        # class visible here (no lowercasing is applied in `transform`). It is kept
        # so that code reading `wordbreaker` keeps working -- confirm whether it
        # can be removed.
        self.wordbreaker = CountVectorizer(lowercase=True, ngram_range=(1,1), analyzer='char').build_analyzer()

    def fit(self, strings, y=None, **kwargs):
        """
        No need to fit; this transformer is stateless.

        Parameters
        ----------
        strings : iterable
            Ignored.
        y : object, optional
            Ignored. Accepted so that ``fit(X, y)`` / ``fit_transform(X, y)``
            calls do not fail.

        Returns
        -------
        LanguageModelSequencer
            This instance, unchanged.
        """
        return self

    def transform(self, strings):
        """
        Transform an iterable source of strings into a dense matrix
        of character identifiers.

        Parameters
        ----------
        strings : iterable
            An iterable of source strings to vectorize. A single string is
            also accepted and treated as a one-element iterable.

        Returns
        -------
        (np.ndarray, np.ndarray)
            A tuple (X, Y): X is 2 dimensional [sample_index, character] with 32 bit
            character identifiers, and Y is one dimensional [sample_index] with the
            32 bit character identifier to predict. Strings that are not longer
            than ``context_length`` contribute no samples.
        """
        # forgive passing a single string (including str subclasses)
        if isinstance(strings, str):
            strings = [strings]
        width = self.context_length
        # ord() never exceeds 0x10FFFF, so capping the limit there changes no
        # result but keeps an oversized limit from overflowing the integer arrays.
        cap = min(self.maximum_ordinal, 0x10FFFF)
        window_offsets = np.arange(width)
        context_blocks = []
        target_blocks = []
        for string in strings:
            sample_count = len(string) - width
            if sample_count <= 0:
                # too short to yield even one (context, target) pair
                continue
            ordinals = np.fromiter(map(ord, string), dtype=np.int64, count=len(string))
            ordinals = np.minimum(ordinals, cap)
            # Row r gathers positions r .. r + width - 1: one sliding window per row.
            window_starts = np.arange(sample_count)[:, np.newaxis]
            context_blocks.append(ordinals[window_starts + window_offsets])
            # The target of window r is the character at position r + width.
            target_blocks.append(ordinals[width:])
        if not context_blocks:
            # np.concatenate rejects an empty list; return correctly shaped empties.
            return np.zeros((0, width), dtype=np.int32), np.zeros(0, dtype=np.int32)
        X = np.concatenate(context_blocks).astype(np.int32)
        Y = np.concatenate(target_blocks).astype(np.int32)
        return X, Y
                


In [48]:
# Widen NumPy's print width so each 16-value context row (the default
# context_length) is not wrapped across several lines in the output below.
np.set_printoptions(linewidth=100)
# Demo on a single sentence: a bare string is accepted and treated as one source
# string. The result is the (X, Y) tuple of context rows and next-character targets.
LanguageModelSequencer().fit_transform('Transform a string into context and target character sequence numbers.')

(array([[ 84, 114,  97, 110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105],
        [114,  97, 110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110],
        [ 97, 110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103],
        [110, 115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32],
        [115, 102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105],
        [102, 111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110],
        [111, 114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116],
        [114, 109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111],
        [109,  32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111,  32],
        [ 32,  97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111,  32,  99],
        [ 97,  32, 115, 116, 114, 105, 110, 103,  32, 105, 110, 116, 111,  32,  99, 111],
        [ 