In [None]:
# Show the NVIDIA GPU(s) visible to this runtime (driver version, memory, utilisation).
!nvidia-smi

In [None]:
# upload private_key.pem and authorized_keys to /content

In [None]:
# Clone the poetry2021 repository into the current working directory.
!git clone https://github.com/wojtekcz/poetry2021.git

In [None]:
# Run the Colab fine-tuning setup script shipped with the cloned repository.
!poetry2021/scripts/setup_colab_finetuning.sh

In [None]:
# !SSH_RELAY_HOST=wcz@bekaes.beanflows.com SSH_RELAY_PORT=8888 bash <(curl -s https://raw.githubusercontent.com/wojtekcz/poetry2021/master/colab_ssh/colab_ssh_server.sh)

In [None]:
from pathlib import Path

In [None]:
import sys

# Put the cloned repository's source folder first on the module search path,
# ahead of every existing entry.
sys.path[:0] = ['/content/poetry2021/src']

In [None]:
from preprocessing.text_tokenizer import TextTokenizer
from transformers import (
    AutoTokenizer,
    AutoModel,
    GPT2LMHeadModel,
    GPT2Tokenizer
)

import torch

In [None]:
# Locations of the corpus folder, the vocabulary file and the saved tokenizer.
data_path = Path('/content/poetry2021/data/pan_tadeusz7')
dataset_path, vocab_path, tokenizer_path = (
    data_path / 'dataset',
    data_path / 'vocab.json',
    data_path / 'tokenizer',
)

text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(vocab_path)

# Register the end-of-text marker under an id equal to the vocabulary size
# measured before the marker is added.
vocab = text_tokenizer.vocab
vocab_count = len(vocab)
vocab['<|endoftext|>'] = vocab_count

In [None]:
# Load the tokenizer stored under tokenizer_path and print its summary.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
print(tokenizer)

In [None]:
# preprocess dataset

In [None]:
from pathlib import Path
from preprocessing.stemmer import Stemmer
from preprocessing.line_chunker import LineChunker, flatten
from preprocessing.text_processor import TextProcessor
from preprocessing.text_tokenizer import TextTokenizer
from typing import List
from pprint import pprint
from tqdm import tqdm

In [None]:
# Input corpus and vocabulary locations for the preprocessing steps.
# dataset_path and vocab_path are rebuilt from data_path here.
dataset_path = data_path / 'dataset'
fn_corpus_char = dataset_path / 'pan_tadeusz.txt'
vocab_path = data_path / 'vocab.json'
# Value passed as `stem_delim` to the stemmer and tokenizer calls; its exact
# meaning is defined by those helpers — TODO confirm.
stem_delim = '++ --'

In [None]:
def print_head(file_path, n_lines=10):
    """Print the first ``n_lines`` newline-separated lines of ``file_path``.

    Args:
        file_path: a ``pathlib.Path``-like object exposing ``read_text()``.
        n_lines: how many leading lines to show (default 10).
    """
    all_lines = file_path.read_text().split('\n')
    head = all_lines[:n_lines]
    print('\n'.join(head))


class DatasetPreprocessor:
    """Drives the corpus preprocessing steps used in this notebook.

    Steps: caps tokenization, syllable stemming and sampling of fixed-length
    token chunks. The caps step is delegated to a ``TextProcessor`` built from
    ``dataset_path`` and the supplied tokenizer.
    """

    def __init__(self, dataset_path: Path, tokenizer: TextTokenizer):
        """Keep the given tokenizer and build a ``TextProcessor`` around it.

        Args:
            dataset_path: dataset folder handed to ``TextProcessor``.
            tokenizer: an already constructed ``TextTokenizer``; it is injected
                rather than created here as ``TextTokenizer(dataset_path)``.
        """
        self.tokenizer = tokenizer
        self.processor = TextProcessor(dataset_path, self.tokenizer)

    def tokenize_caps(self, fn_corpus_char: Path, fn_corpus_caps: Path, verbose: bool = True):
        """Run ``TextProcessor.do_caps_file`` from ``fn_corpus_char`` to ``fn_corpus_caps``.

        Args:
            fn_corpus_char: source corpus file.
            fn_corpus_caps: destination file for the caps-tokenized corpus.
            verbose: when true, print a header and the head of the result.
                Defaults to ``True``; the earlier ``verbose: True`` was only an
                annotation, which made the argument mandatory.
        """
        if verbose:
            print(f'\nTokenizacja wielkich liter: {fn_corpus_caps.name}')
        self.processor.do_caps_file(fn_corpus_char, fn_corpus_caps)
        if verbose:
            print_head(fn_corpus_caps)

    def stem_corpus(self, fn_corpus_caps: Path, fn_corpus_syl: Path, stem_delim: str, verbose: bool = True):
        """Run ``Stemmer.stem_file`` from ``fn_corpus_caps`` to ``fn_corpus_syl``.

        Args:
            fn_corpus_caps: caps-tokenized input file.
            fn_corpus_syl: destination file for the syllabified corpus.
            stem_delim: delimiter forwarded to ``Stemmer.stem_file``.
            verbose: when true, print a header and the head of the result.
                Defaults to ``True`` (previously an annotation, not a default).
        """
        if verbose:
            print(f'\nPodział korpusu na sylaby "{stem_delim}"')
        Stemmer.stem_file(fn_corpus_caps, fn_corpus_syl, stem_delim=stem_delim)
        if verbose:
            print_head(fn_corpus_syl)

    # Disabled: vocabulary creation is not part of this notebook's flow; the
    # vocabulary is loaded from disk by the caller instead.
    # def load_and_create_vocab(self, fn_corpus_syl: Path, vocab_path: Path) -> List[str]:
    #     # Load into memory and tokenize
    #     if fn_corpus_syl.is_dir():
    #         file_tok = flatten([self.processor.load_and_tokenize_file(x, repl_unk=False) for x in fn_corpus_syl.glob('*.txt')])
    #     else:
    #         file_tok = self.processor.load_and_tokenize_file(fn_corpus_syl, repl_unk=False)

    #     # create & save vocab
    #     self.tokenizer.create_vocab(file_tok)
    #     self.tokenizer.save_vocab(vocab_path)
    #     return file_tok

    def create_sampled_file(self, file_tok: List[str], fn_corpus_sampled: Path, min_n_samples: int, max_n_samples=None, chunk_len=100):
        """Write randomly sampled token chunks to ``fn_corpus_sampled``, one per line.

        The sample count starts at ``len(file_tok) // chunk_len``, is raised to
        at least ``min_n_samples`` and, when ``max_n_samples`` is given, capped
        at that value.

        Args:
            file_tok: tokenized corpus.
            fn_corpus_sampled: output file; each line is one chunk joined with spaces.
            min_n_samples: lower bound on the number of sampled chunks.
            max_n_samples: optional upper bound (``None`` means no cap).
            chunk_len: chunk length handed to ``LineChunker``.
        """
        print(f"\nLet's make dataset with more than minimum {min_n_samples} samples")
        line_chunker = LineChunker(file_tok=file_tok, chunk_len=chunk_len)
        n_samples = len(file_tok) // chunk_len
        # First report: the count implied by corpus size alone.
        print(f'n_samples: {n_samples}')
        n_samples = max(min_n_samples, n_samples)
        if max_n_samples is not None:
            n_samples = min(max_n_samples, n_samples)
        print(f'chunk_len: {chunk_len}')
        # Second report: the count after applying the min/max bounds.
        print(f'n_samples: {n_samples}')

        sampled_chunks = [" ".join(line_chunker.random_chunk()) for _ in tqdm(range(n_samples))]
        fn_corpus_sampled.write_text("\n".join(sampled_chunks))
        print(fn_corpus_sampled)




In [None]:
# One output folder per preprocessing stage, all under the dataset folder.
caps_path, syl_path, sampled_path = (dataset_path / sub for sub in ('caps', 'syl', 'sampled'))
for stage_dir in (caps_path, syl_path, sampled_path):
    stage_dir.mkdir(parents=True, exist_ok=True)

print('Files to preprocess:')
paths = [fn_corpus_char]
# paths = [x for x in dataset_path.glob("**/*.txt")]
pprint(paths)

preprocessor = DatasetPreprocessor(dataset_path, tokenizer=text_tokenizer)

# Caps-tokenize each input file, then stem the caps output into syllables.
for char_path in paths:
    print(f'tokenizing caps and stemming: {char_path.name}')
    base_name = char_path.stem
    corpus_caps_path = caps_path / f'{base_name}.caps1.txt'
    corpus_syl_path = syl_path / f'{base_name}.syl1.txt'

    preprocessor.tokenize_caps(char_path, corpus_caps_path, verbose=False)
    preprocessor.stem_corpus(corpus_caps_path, corpus_syl_path, stem_delim=stem_delim, verbose=False)

In [None]:
# Round-trip check of the text tokenizer on a short sample.
text = 'LITWO! Ojczyzno moja!\nTy jesteś jak zdrowie.\nIle cię trzeba cenić ble ble '
print(f'\nTesting tokenizer: {text}')
# Forward direction: raw text -> token list.
text_tok = text_tokenizer.str2syl2tok(text, stem_delim=stem_delim)
print(text_tok)

# Backward direction, printed stage by stage: tokens joined with the default
# delimiter, then joined without a delimiter and caps-decoded (first 300 chars),
# then punctuation-fixed (first 400 chars), then the HTML-formatted form.
print(text_tokenizer.syl2str(text_tok, stem_delim=stem_delim))
text_decoded = text_tokenizer.decode_caps(text_tokenizer.syl2str(text_tok, delim='', stem_delim=stem_delim))[:300]
print(text_decoded)
e_str = text_tokenizer.fix_punctuation(text_decoded)[:400]
print(e_str)
print(text_tokenizer.format_html(e_str))

In [None]:
# Sample dataset

In [None]:
# Load and tokenize the syllabified corpus.
# repl_unk=True presumably replaces out-of-vocabulary tokens — TODO confirm.
fn_corpus_syl = syl_path / 'pan_tadeusz.syl1.txt'
file_tok = preprocessor.processor.load_and_tokenize_file(fn_corpus_syl, repl_unk=True)

min_n_samples = 10000  # alternative value: 50000
max_n_samples = None
chunk_len = 100  # alternative value: 400
# NOTE: max_n_samples is interpolated as-is, so with None the file is named
# 'dataset.sampled1.None.txt'.
fn_corpus_sampled = sampled_path / f'dataset.sampled1.{max_n_samples}.txt'
# NOTE(review): chunks are drawn via LineChunker.random_chunk() and no seed is
# set in this cell, so the sampled file likely differs between runs — confirm.
preprocessor.create_sampled_file(file_tok, fn_corpus_sampled, min_n_samples=min_n_samples, max_n_samples=max_n_samples, chunk_len=chunk_len)

In [None]:
# fine tune model

In [None]:
# evaluate model

In [None]:
# Folder holding the language-model checkpoints, and the checkpoint to evaluate.
models_path = Path('/content/poetry2021/data/pan_tadeusz7_lm_models')
model_path = models_path / 'model1000'

In [None]:
# Use CUDA whenever PyTorch reports it as available; uncomment the override
# below to force CPU execution.
USE_GPU = torch.cuda.is_available()
# USE_GPU = False
print(f'USE_GPU={USE_GPU}')

In [None]:
def to_gpu(x, *args, **kwargs):
    """Return ``x.cuda(*args, **kwargs)`` when ``USE_GPU`` is set, else ``x`` unchanged."""
    if not USE_GPU:
        return x
    return x.cuda(*args, **kwargs)

In [None]:
# Load the checkpoint from model_path and move it to the GPU when USE_GPU is set.
model = GPT2LMHeadModel.from_pretrained(str(model_path))
model = to_gpu(model)
# NOTE: a bare expression that is not the last one in the cell is not displayed,
# so this line shows nothing.
model.device

# Switch the model to evaluation mode before generating; the trailing ';'
# suppresses the cell's output.
model.eval();

In [None]:
def print_eval(generated):
    """Print a generated token string as readable text.

    ``generated`` is split on single spaces, joined back without a delimiter
    via ``text_tokenizer.syl2str``, caps-decoded, punctuation-fixed and printed.
    """
    # print(f'bad_words: {bad_words(generated)}')
    syllables = generated.split(' ')
    joined = text_tokenizer.syl2str(syllables, delim='')
    readable = text_tokenizer.fix_punctuation(text_tokenizer.decode_caps(joined))
    print(readable)
    # display(HTML(text_tokenizer.format_html(readable)))


def evaluate(prime_str, max_length=100, temperature=0.8, num_beams=10,
             early_stopping=True, no_repeat_ngram_size=2, do_sample=True,
             top_k=50, top_p=0.92):
    """Generate a continuation of ``prime_str`` with the module-level ``model``.

    The prompt is tokenized with ``text_tokenizer.str2syl2tok``, joined with
    spaces and encoded by ``tokenizer``; the encoded ids minus their last
    position are passed to ``model.generate`` together with the sampling and
    beam-search options given as arguments.

    Returns:
        The first generated sequence decoded by ``tokenizer.decode``.
    """
    prompt = " ".join(text_tokenizer.str2syl2tok(prime_str))
    encoded = tokenizer.encode(prompt, return_tensors="pt")
    # Drop the last position of every row — presumably a token the tokenizer
    # appends on encode; TODO confirm.
    input_ids = encoded[:, :-1].to(model.device)

    generation_options = dict(
        max_length=max_length,
        temperature=temperature,
        num_beams=num_beams,
        early_stopping=early_stopping,
        no_repeat_ngram_size=no_repeat_ngram_size,
        do_sample=do_sample,
        top_k=top_k,
        top_p=top_p,
    )
    sequences = model.generate(input_ids, **generation_options)
    return tokenizer.decode(sequences[0])


# Generation length limit passed as `max_length` to evaluate().
max_length = 500
# Alternative prompt, kept for reference:
# gen1 = evaluate('chwycił na taśmie przypięty', max_length=max_length, temperature=1.0)
# print_eval(gen1)
# gen1

In [None]:
# Generate from the prompt 'Litwo! Ojczyzno' and print the decoded result.
print_eval(evaluate('Litwo! Ojczyzno', max_length=max_length, temperature=1.0))