In [None]:
# Report the GPU (if any) visible to this runtime using the NVIDIA command-line tool.
!nvidia-smi

In [None]:
# upload private_key.pem and authorized_keys to /content

In [None]:
# Fetch the project repository; later cells expect it at /content/poetry2021.
!git clone https://github.com/wojtekcz/poetry2021.git

In [None]:
# Run the repository's Colab fine-tuning setup script. The path is relative, so
# this must be executed from the directory the repository was cloned into.
# NOTE(review): what the script installs/configures is not visible here — see the script.
!poetry2021/scripts/setup_colab_finetuning.sh

In [None]:
# !SSH_RELAY_HOST=wcz@bekaes.beanflows.com SSH_RELAY_PORT=8888 bash <(curl -s https://raw.githubusercontent.com/wojtekcz/poetry2021/master/colab_ssh/colab_ssh_server.sh)

In [None]:
from pathlib import Path

In [None]:
import sys
# Put the cloned repository's src/ directory first on the import path so that
# project packages can be imported (presumably where `preprocessing`, imported
# in a later cell, lives).
sys.path.insert(0, '/content/poetry2021/src')

In [None]:
from preprocessing.text_tokenizer import TextTokenizer
from transformers import (
    AutoTokenizer,
    AutoModel,
    GPT2LMHeadModel,
    GPT2Tokenizer
)

import torch

In [None]:
# Root of the pan_tadeusz7 data inside the cloned repository, plus the three
# entries beneath it that the following code reads.
data_path = Path('/content/poetry2021/data/pan_tadeusz7')
dataset_path, vocab_path, tokenizer_path = (
    data_path.joinpath(entry) for entry in ('dataset', 'vocab.json', 'tokenizer')
)

# Build the project's text tokenizer on the dataset directory and load its
# saved vocabulary.
text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(vocab_path)

# Add the end-of-text marker, mapped to the number of entries the vocabulary
# held before the insertion.
vocab = text_tokenizer.vocab
vocab_count = len(vocab)
vocab['<|endoftext|>'] = vocab_count

In [None]:
# Load the Hugging Face tokenizer stored under tokenizer_path and print its
# representation for a quick sanity check.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
print(tokenizer)

In [None]:
# Directory holding the language-model checkpoints, and the specific
# checkpoint ('model1000') that is loaded further down.
models_path = Path('/content/poetry2021/data/pan_tadeusz7_lm_models')
model_path = models_path.joinpath('model1000')

In [None]:
# Use the GPU whenever PyTorch reports that CUDA is available.
USE_GPU = torch.cuda.is_available()
# Uncomment the next line to force CPU execution:
# USE_GPU = False
print(f'USE_GPU={USE_GPU}')

In [None]:
def to_gpu(x, *args, **kwargs):
    """Return ``x.cuda(*args, **kwargs)`` when ``USE_GPU`` is true, else ``x`` unchanged.

    Any extra positional or keyword arguments are forwarded to ``x.cuda``.
    """
    if not USE_GPU:
        return x
    return x.cuda(*args, **kwargs)

In [None]:
# Load the GPT-2 language-model checkpoint at model_path and move it to the
# GPU when USE_GPU is enabled.
model = GPT2LMHeadModel.from_pretrained(str(model_path))
model = to_gpu(model)
# Only a cell's last expression is displayed, so a bare `model.device` here
# would show nothing; print it explicitly instead.
print(f'device={model.device}')

# Switch the model to evaluation mode before generating text. The trailing
# semicolon keeps the notebook from echoing the returned module.
model.eval();

In [None]:
def print_eval(generated):
    """Print generated model output after converting it back to readable text.

    ``generated`` is split on single spaces; the pieces are passed through
    ``text_tokenizer.syl2str`` (with an empty delimiter), then
    ``text_tokenizer.decode_caps`` and ``text_tokenizer.fix_punctuation``,
    and the result is printed. Nothing is returned.
    """
    # Disabled diagnostic: print(f'bad_words: {bad_words(generated)}')
    syllables = generated.split(' ')
    joined = text_tokenizer.syl2str(syllables, delim='')
    readable = text_tokenizer.fix_punctuation(text_tokenizer.decode_caps(joined))
    print(readable)
    # Disabled HTML rendering alternative:
    # display(HTML(text_tokenizer.format_html(readable)))


def evaluate(prime_str, max_length=100, temperature=0.8, seed=None):
    """Generate a continuation of ``prime_str`` with the loaded language model.

    Args:
        prime_str: Prompt text; converted to tokens with
            ``text_tokenizer.str2syl2tok`` and then encoded by ``tokenizer``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature forwarded to ``model.generate``.
        seed: Optional integer. When given, PyTorch's random number generator
            is seeded before generation so a sampled result can be repeated
            (on the same setup). ``None`` (the default) leaves the RNG state
            untouched, matching the previous behaviour.

    Returns:
        The first sequence returned by ``model.generate``, decoded to a string
        with ``tokenizer.decode``.
    """
    if seed is not None:
        # Generation samples (do_sample=True), so it is stochastic unless seeded.
        torch.manual_seed(seed)

    prime_tok = text_tokenizer.str2syl2tok(prime_str)
    prime_tok_str = " ".join(prime_tok)
    # Drop the last encoded id — presumably a terminator the tokenizer appends,
    # so the model continues the prompt rather than seeing it as finished.
    # TODO confirm against the tokenizer's post-processing.
    ids = tokenizer.encode(prime_tok_str, return_tensors="pt")[:, :-1]
    preds = model.generate(ids.to(model.device), max_length=max_length,
                           temperature=temperature,
                           num_beams=10, early_stopping=True,
                           no_repeat_ngram_size=2,
                           do_sample=True,
                           top_k=50,
                           top_p=0.92
                           )
    return tokenizer.decode(preds[0])


# Generate text from a fixed prompt with a generation length limit of
# max_length, print the readable form, then show the raw generated string as
# the cell's output.
max_length = 500
gen1 = evaluate('chwycił na taśmie przypięty', max_length=max_length, temperature=1.0)
print_eval(gen1)
gen1