### 1. Installa i pacchetti e i file necessari

In [None]:
# Install the pinned keras-nlp release used by this notebook.
!pip install keras-nlp==0.4.1
# Download the two files opened by name later in the notebook:
# the .h5 weights file and the vocabulary text file.
!wget https://www.dropbox.com/s/kuxjrdz9kwxdovg/gpt_v15000_l3_h4_e100.h5
!wget https://www.dropbox.com/s/yze44qacqgqmd1u/vocab_15000.txt

In [None]:
import  os
import  keras_nlp
import  string
import  numpy         as      np
import  tensorflow    as      tf
from    tensorflow    import  keras

### 2. Crea il tokenizer

In [None]:
def set_tokenizer( fname="vocab_15000.txt", sequence_length=128 ):
    """Build a lowercasing WordPiece tokenizer from a vocabulary file.

    Args:
        fname: path of the vocabulary file; its content is split on
            whitespace and every resulting item is one vocabulary token.
        sequence_length: value forwarded to the tokenizer as
            `sequence_length`.

    Returns:
        The configured `keras_nlp.tokenizers.WordPieceTokenizer`.

    Raises:
        OSError: if the vocabulary file cannot be opened.
    """
    # Explicit encoding: without it the decoding depends on the platform
    # locale and non-ASCII vocabulary entries may be read incorrectly.
    with open( fname, 'r', encoding="utf-8" ) as f:
        vocab   = f.read().split()
    return keras_nlp.tokenizers.WordPieceTokenizer(
            vocabulary      = vocab,
            sequence_length = sequence_length,
            lowercase       = True
    )

In [None]:
# text -> array of token ids
def to_tokens( text, tokenizer ):
    """Tokenize the lowercased `text` and drop the zero ids at both ends."""
    ids = tokenizer( text.lower() ).numpy()
    # 'fb' trims zeros from the front and from the back only;
    # zeros in the middle of the sequence are kept.
    return np.trim_zeros( ids, 'fb' )

# array of token ids -> text
def from_tokens( tokens, tokenizer ):
    """Detokenize `tokens` and return the `.numpy()` value of the result."""
    return tokenizer.detokenize( tokens ).numpy()

### 3. Testa il tokenizer

In [None]:
# Build the tokenizer once; the cells below reuse this instance.
tokenizer = set_tokenizer()

In [None]:
# Tokenize a sample sentence and print the resulting token ids.
s = "Why you don't you say something about it? It's wonderful!"
t = to_tokens( s, tokenizer )
print( t )

In [None]:
# Convert the ids in `t` back to text and decode the result as UTF-8.
from_tokens( t, tokenizer ).decode( "utf-8" )

In [None]:
# Detokenize the consecutive ids 300..319 to inspect a slice of the vocabulary.
from_tokens( np.arange( 300, 320 ), tokenizer ).decode( "utf-8" )

### 4. Crea il modello di linguaggio

In [None]:
def create_model(
        n_layers         = 3,
        vocab_size       = 15000,
        sequence_length  = 128,
        embedding_dim    = 256,
        num_heads        = 4,
        intermediate_dim = 256
):
    """Build the decoder-only language model.

    The network is a token-and-position embedding, followed by `n_layers`
    `TransformerDecoder` layers and a final `Dense` layer with `vocab_size`
    units and no activation, so the output holds one raw score per
    vocabulary entry for every input position.

    The defaults are the architecture this notebook loads weights into;
    weights saved for a different architecture will not load.

    Args:
        n_layers: number of stacked decoder layers.
        vocab_size: size of the vocabulary (embedding rows and output units).
        sequence_length: `sequence_length` given to the embedding layer.
        embedding_dim: dimension of the embedding vectors.
        num_heads: attention heads of every decoder layer.
        intermediate_dim: `intermediate_dim` of every decoder layer.

    Returns:
        An uncompiled `keras.Model` taking int32 token ids of shape
        (batch, length).
    """
    inputs      = keras.layers.Input( shape=(None,), dtype=tf.int32 )

    embedding   = keras_nlp.layers.TokenAndPositionEmbedding(
            vocabulary_size = vocab_size,
            sequence_length = sequence_length,
            embedding_dim   = embedding_dim,
            mask_zero       = True
    )
    x = embedding( inputs )

    # Layers keep their auto-generated names, exactly as before.
    for _ in range( n_layers ):
        decoder = keras_nlp.layers.TransformerDecoder(
            num_heads           = num_heads,
            intermediate_dim    = intermediate_dim
        )
        x = decoder( x )

    outputs     = keras.layers.Dense( vocab_size )( x )
    return keras.Model( inputs=inputs, outputs=outputs )

### 5. Inizializza il modello

In [None]:
def init():
    """Create the model and load its weights from the local .h5 file.

    Returns:
        The model built by `create_model()` after `load_weights` has run.
    """
    gpt = create_model()
    gpt.load_weights( "gpt_v15000_l3_h4_e100.h5" )
    return gpt

In [None]:
# Instantiate the model with its weights loaded; the completion cells below reuse it.
model = init()

### 6. Usa il modello per fare completion

In [None]:
def get_prompt( tokenizer, prompt=None ):
    """Turn a text prompt into a 1-D tensor of token ids.

    A space is inserted before every punctuation character, the text is
    lowercased and split on whitespace, and each resulting word is looked
    up with `tokenizer.token_to_id`.

    Args:
        tokenizer: object exposing `token_to_id( str )`.
        prompt: text to convert. When it is None, or contains no words,
            the result holds only the id of the "[BOS]" token.

    Returns:
        A tensor with one id per word of the prompt.
    """
    # The None check must come before any string processing: iterating
    # over None raises TypeError.
    if prompt is None:
        words   = []
    else:
        spaced  = ''.join(
            ' ' + c if c in string.punctuation else c
            for c in prompt
        )
        words   = spaced.lower().split()

    if words:
        prompt_list = [ tokenizer.token_to_id( w ) for w in words ]
    else:
        # No usable text: start the generation from the begin-of-sequence token.
        prompt_list = [ tokenizer.token_to_id( "[BOS]" ) ]
    return tf.convert_to_tensor( prompt_list )

In [None]:
def next_token( model, tokenizer, prompt=None ):
    """Return the last word of the model's greedy prediction for `prompt`.

    The prompt is converted to ids, given a leading batch axis and fed to
    the model. The argmax over the last axis of the output is detokenized,
    and the last space-separated word of that text is returned.
    """
    batch       = get_prompt( tokenizer, prompt )[ tf.newaxis, : ]
    scores      = model( batch )
    best_ids    = tf.argmax( scores, axis=-1 ).numpy()
    # Only one sequence is in the batch, hence index 0.
    decoded     = from_tokens( best_ids, tokenizer )[ 0 ].decode( "utf-8" )
    return decoded.rsplit( ' ', 1 )[ -1 ]

In [None]:
def basic_completion( model, tokenizer, prompt=None, max_length=100 ):
    """Greedily extend `prompt` one predicted word at a time.

    Args:
        model: language model passed through to `next_token`.
        tokenizer: tokenizer passed through to `next_token`.
        prompt: starting text. When it is None the text starts from the
            word predicted for an empty prompt.
        max_length: the loop stops as soon as the text has at least this
            many characters (not tokens); the last appended word may make
            the result longer.

    Returns:
        The prompt followed by the generated words, separated by spaces.
    """
    if prompt is None:
        # len( None ) would raise TypeError, so seed the text with the
        # first predicted word instead.
        text = next_token( model, tokenizer, prompt=None )
    else:
        text = prompt
    while len( text ) < max_length:
        word = next_token( model, tokenizer, prompt=text )
        text = text + ' ' + word
    return text

In [None]:
def top_p_completion( model, tokenizer, prompt=None, max_length=100 ):
    """Complete `prompt` with top-p search (p = 0.5) over the model output.

    Args:
        model: language model called on the token ids generated so far.
        tokenizer: used to convert the prompt to ids and the result to text.
        prompt: starting text, converted with `get_prompt`.
        max_length: forwarded to `keras_nlp.utils.top_p_search`.

    Returns:
        The `.numpy()` value of the detokenized output tokens.
    """
    def last_position_scores( inputs ):
        # Keep only the scores at the last position of the current
        # sequence, which are used to pick the next token.
        last = inputs.shape[ 1 ] - 1
        return model( inputs )[ :, last, : ]

    generated = keras_nlp.utils.top_p_search(
        last_position_scores,
        get_prompt( tokenizer, prompt ),
        max_length      = max_length,
        p               = 0.5,
        from_logits     = True
    )
    return tokenizer.detokenize( generated ).numpy()

### 7. Testa la completion

In [None]:
# Predict a single next word for the sample prompt.
prompt = "There is something in here that we"
next_token( model, tokenizer, prompt=prompt )

In [None]:
# Extend the sample prompt by repeatedly appending the predicted next word.
prompt = "There is something in here that we"
basic_completion( model, tokenizer, prompt=prompt )

In [None]:
# Extend the sample prompt with top-p search and decode the result as UTF-8.
prompt = "There is something in here that we"
top_p_completion( model, tokenizer, prompt=prompt ).decode( "utf-8" )