<a href="https://colab.research.google.com/github/trybalad/BERT/blob/master/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Instalacja potrzebnych zależności

In [None]:
!pip install spacy
!python -m spacy download pl_core_news_lg

Jeśli trenujemy na GPU

In [None]:
!pip install tensorflow-gpu

Jeśli wykorzystujemy CPU

In [None]:
!pip install tensorflow

Import potrzebnych plików i przygotowanie ustawień modelu

In [None]:
from keras_bert.data_generator import DataGenerator
from keras_bert.model import create_model
from keras_bert.tokenizer import Tokenizer
from keras_bert.training import train_model


# --- Architecture settings handed to create_model() further down ---
encoder_num = 4
heads = 4
embedding_dim = 512
ff_dim = 512

# Sequence length shared by the data generator, the model and the training call.
max_len = 32

# --- Checkpointing for the pre-training run ---
load_checkpoint = False
checkpoint_file_path = "./data/checkpoint_notebook.ckpt"

Utworzenie słownika na bazie dokumentu i zapisanie go w pliku

In [None]:
# Create a vocabulary from the corpus document and store it in a file.
# NOTE(review): the result is written to ./data/vocab.txt, whereas the cells
# that load a vocabulary read ./data/counted_vocab.txt — confirm how that
# second file is produced.
tokenizer = Tokenizer()
print("Preparing vocab.")

corpus_path = "./data/corpus_clean.txt"
vocab_path = './data/vocab.txt'
tokenizer.prepare_vocab(corpus_path, vocab_path)

print("Vocab of size:", tokenizer.vocab_size, "created.")

Wczytanie słownika z pliku

In [None]:
# Load an existing vocabulary file into a fresh tokenizer.
print("Reading vocab.")

vocab_path = './data/counted_vocab.txt'
tokenizer = Tokenizer()
tokenizer.read_vocab(vocab_path)
# presumably enables id -> token lookups as well; verify in Tokenizer
tokenizer.change_to_reversible()

print("Vocab of size:", tokenizer.vocab_size, "loaded.")

Utworzenie generatora danych treningowych

In [None]:
# Build the generator that feeds pre-training batches from the corpus file.
print("Creating data generator")
data_generator = DataGenerator(
    "./data/corpus_clean.txt",
    max_len,
    tokenizer,
    batch_size=64,
    create_nsr_output=True,
)
print("Data generator prepared.")

Stworzenie modelu

In [None]:
# Instantiate the encoder from the hyper-parameters defined in the config cell.
# NOTE(review): the positional order here is (encoder_num, heads), while the
# fine-tuning cells pass (heads, encoder_num). Both values are 4 today, so the
# result is the same — confirm the real signature before changing either value.
print("Preparing model")
sequence_encoder = create_model(
    tokenizer.vocab_size,
    max_len,
    embedding_dim,
    encoder_num,
    heads,
    ff_dim,
)
print("Model created.")

Rozpoczęcie procesu treningu wstępnego

In [None]:
# Launch pre-training; checkpoint location and resume flag come from the config cell.
print("Start training.")
train = train_model(
    sequence_encoder,
    max_len,
    tokenizer,
    data_generator,
    epochs=100,
    checkpoint_file_path=checkpoint_file_path,
    load_checkpoint=load_checkpoint,
)

Testing the model: fraction of masked/replaced words that are predicted correctly

In [None]:
from keras_bert.model import create_model
from keras_bert.tokenizer import Tokenizer
import numpy as np
from keras_bert.prepare_data import create_tokens, create_masks, create_ids, create_segments, translate_ids, \
    create_pretrain_data
from keras_bert.training import prepare_pretrain_model_from_checkpoint
import codecs

# Measures how often the pre-trained model restores a token that was
# masked/replaced by create_pretrain_data(). The corpus is processed one line
# at a time and a running ratio (correct / altered tokens) is printed every
# 100 lines, followed by the final ratio.
print("Test on validation data to compare with google-Keras metric.")

print("Reading vocab.")
validation_tokenizer = Tokenizer()
validation_tokenizer.read_vocab('./data/counted_vocab.txt')
validation_tokenizer.change_to_reversible()
print("Vocab of size:", validation_tokenizer.vocab_size, "loaded.")

# Architecture settings; presumably they must match the ones the checkpoint
# was trained with — verify against the training run.
max_len = 32
embedding_dim = 512
ff_dim = 512
heads = 4
encoder_num = 4
# NOTE(review): this checkpoint name differs from the one written by the
# pre-training cell (checkpoint_notebook.ckpt) — confirm it is the intended file.
old_checkpoint = "./data/checkpoint_test11.ckpt"
text_file = "./data/corpus_clean.txt"

print("Preparing model.")
sequence_encoder = create_model(validation_tokenizer.vocab_size, max_len, embedding_dim, encoder_num, heads, ff_dim)
model = prepare_pretrain_model_from_checkpoint(sequence_encoder, validation_tokenizer,  load_checkpoint=True, old_checkpoint=old_checkpoint)
print("Model created.")

print("Starting test.")
count = 0        # corpus lines processed
sum_points = 0   # tokens that were masked/replaced
sum_correct = 0  # altered tokens the model predicted correctly

# The context manager closes the corpus file even if prediction fails midway.
with codecs.open(text_file, 'r', 'utf-8') as file:
    # Iterating the file advances exactly one line per pass, regardless of how
    # many token sequences create_tokens() returns for that line (including none).
    for message in file:
        count += 1
        tokens = create_tokens([message], validation_tokenizer, max_len)
        mlm_tokens = create_pretrain_data(tokens, validation_tokenizer)
        ids = create_ids(mlm_tokens, max_len, validation_tokenizer)
        mask = create_masks(mlm_tokens, max_len)
        segments = create_segments(mlm_tokens, max_len)

        result = model.predict(x=[np.array(ids), np.array(segments), np.array(mask)])
        prediction = translate_ids(result, validation_tokenizer)

        for i in range(len(tokens)):
            for j in range(len(tokens[i])):
                # Only positions altered by the masking step are scored.
                if tokens[i][j] != mlm_tokens[i][j]:
                    sum_points += 1
                    # NOTE(review): prediction is indexed by position only, which
                    # assumes a single sequence per predict() call — confirm
                    # against translate_ids().
                    if tokens[i][j] == prediction[j]:
                        sum_correct += 1

        if sum_points != 0 and count % 100 == 0:
            print(count, "\t", (sum_correct / sum_points))

# Guard against an empty corpus or a run in which nothing was masked.
if sum_points != 0:
    print(sum_correct / sum_points)
else:
    print("No masked/replaced tokens were evaluated.")

Fine-Tune process -- AR

In [None]:
from keras_bert.tuning.tuning_data_generator import TuningDataGenerator
from keras_bert.model import create_model
from keras_bert.tokenizer import Tokenizer
from keras_bert.tuning.fine_tuning import fine_tune

# --- Architecture settings (same values as the pre-training cells) ---
encoder_num = 4
heads = 4
embedding_dim = 512
ff_dim = 512
max_len = 32

# --- Task settings for the "ar" fine-tuning run ---
learn_type = "ar"
epochs = 10
text_file = "./data/ar/train.tsv"
old_checkpoint = "./data/checkpoint_test10.ckpt"  # checkpoint loaded before tuning
new_checkpoint = "./data/ar/checkpoint_ar.ckpt"   # checkpoint path for this run

print("Reading vocab.")
tokenizer = Tokenizer()
tokenizer.read_vocab('./data/counted_vocab.txt')
tokenizer.change_to_reversible()
print("Vocab of size:", tokenizer.vocab_size, "loaded.")

data_generator = TuningDataGenerator(
    text_file,
    max_len,
    tokenizer,
    batch_size=64,
    tuning_type=learn_type,
)
print("Data generator prepared.")

# NOTE(review): the positional order here is (heads, encoder_num), while the
# pre-training cells pass (encoder_num, heads). Both are 4, so the result is
# the same today — confirm the real signature.
sequence_encoder = create_model(
    tokenizer.vocab_size,
    max_len,
    embedding_dim,
    heads,
    encoder_num,
    ff_dim,
)
print("Model created.")

# Start training.
train = fine_tune(
    sequence_encoder,
    tokenizer,
    max_len,
    data_generator,
    epochs=epochs,
    checkpoint_file_path=new_checkpoint,
    load_checkpoint=True,
    old_checkpoint=old_checkpoint,
    learn_type=learn_type,
)

Fine-Tune process -- CBD

In [None]:
from keras_bert.tuning.tuning_data_generator import TuningDataGenerator
from keras_bert.model import create_model
from keras_bert.tokenizer import Tokenizer
from keras_bert.tuning.fine_tuning import fine_tune

# --- Architecture settings (same values as the pre-training cells) ---
encoder_num = 4
heads = 4
embedding_dim = 512
ff_dim = 512
max_len = 32

# --- Task settings for the "cbd" fine-tuning run ---
learn_type = "cbd"
epochs = 10
text_file = "./data/klej_cbd/train.tsv"
old_checkpoint = "./data/checkpoint_test10.ckpt"        # checkpoint loaded before tuning
new_checkpoint = "./data/klej_cbd/checkpoint_cbd.ckpt"  # checkpoint path for this run

print("Reading vocab.")
tokenizer = Tokenizer()
tokenizer.read_vocab('./data/counted_vocab.txt')
tokenizer.change_to_reversible()
print("Vocab of size:", tokenizer.vocab_size, "loaded.")

data_generator = TuningDataGenerator(
    text_file,
    max_len,
    tokenizer,
    batch_size=64,
    tuning_type=learn_type,
)
print("Data generator prepared.")

# NOTE(review): the positional order here is (heads, encoder_num), while the
# pre-training cells pass (encoder_num, heads). Both are 4, so the result is
# the same today — confirm the real signature.
sequence_encoder = create_model(
    tokenizer.vocab_size,
    max_len,
    embedding_dim,
    heads,
    encoder_num,
    ff_dim,
)
print("Model created.")

# Start training.
train = fine_tune(
    sequence_encoder,
    tokenizer,
    max_len,
    data_generator,
    epochs=epochs,
    checkpoint_file_path=new_checkpoint,
    load_checkpoint=True,
    old_checkpoint=old_checkpoint,
    learn_type=learn_type,
)

Fine-Tune process -- CDSC

In [None]:
from keras_bert.tuning.tuning_data_generator import TuningDataGenerator
from keras_bert.model import create_model
from keras_bert.tokenizer import Tokenizer
from keras_bert.tuning.fine_tuning import fine_tune

# --- Architecture settings (same values as the pre-training cells) ---
encoder_num = 4
heads = 4
embedding_dim = 512
ff_dim = 512
max_len = 32

# --- Task settings for the "cdsc" fine-tuning run ---
learn_type = "cdsc"
epochs = 10
text_file = "./data/klej_cdsc-e/train.tsv"
old_checkpoint = "./data/checkpoint_test10.ckpt"            # checkpoint loaded before tuning
new_checkpoint = "./data/klej_cdsc-e/checkpoint_cdsc.ckpt"  # checkpoint path for this run

print("Reading vocab.")
tokenizer = Tokenizer()
tokenizer.read_vocab('./data/counted_vocab.txt')
tokenizer.change_to_reversible()
print("Vocab of size:", tokenizer.vocab_size, "loaded.")

data_generator = TuningDataGenerator(
    text_file,
    max_len,
    tokenizer,
    batch_size=64,
    tuning_type=learn_type,
)
print("Data generator prepared.")

# NOTE(review): the positional order here is (heads, encoder_num), while the
# pre-training cells pass (encoder_num, heads). Both are 4, so the result is
# the same today — confirm the real signature.
sequence_encoder = create_model(
    tokenizer.vocab_size,
    max_len,
    embedding_dim,
    heads,
    encoder_num,
    ff_dim,
)
print("Model created.")

# Start training.
train = fine_tune(
    sequence_encoder,
    tokenizer,
    max_len,
    data_generator,
    epochs=epochs,
    checkpoint_file_path=new_checkpoint,
    load_checkpoint=True,
    old_checkpoint=old_checkpoint,
    learn_type=learn_type,
)

Fine-Tune process -- DYK

In [None]:
from keras_bert.tuning.tuning_data_generator import TuningDataGenerator
from keras_bert.model import create_model
from keras_bert.tokenizer import Tokenizer
from keras_bert.tuning.fine_tuning import fine_tune

# --- Architecture settings (same values as the pre-training cells) ---
encoder_num = 4
heads = 4
embedding_dim = 512
ff_dim = 512
max_len = 32

# --- Task settings for the "dyk" fine-tuning run ---
learn_type = "dyk"
epochs = 10
text_file = "./data/klej_dyk/train.tsv"
old_checkpoint = "./data/checkpoint_test10.ckpt"        # checkpoint loaded before tuning
new_checkpoint = "./data/klej_dyk/checkpoint_dyk.ckpt"  # checkpoint path for this run

print("Reading vocab.")
tokenizer = Tokenizer()
tokenizer.read_vocab('./data/counted_vocab.txt')
tokenizer.change_to_reversible()
print("Vocab of size:", tokenizer.vocab_size, "loaded.")

data_generator = TuningDataGenerator(
    text_file,
    max_len,
    tokenizer,
    batch_size=64,
    tuning_type=learn_type,
)
print("Data generator prepared.")

# NOTE(review): the positional order here is (heads, encoder_num), while the
# pre-training cells pass (encoder_num, heads). Both are 4, so the result is
# the same today — confirm the real signature.
sequence_encoder = create_model(
    tokenizer.vocab_size,
    max_len,
    embedding_dim,
    heads,
    encoder_num,
    ff_dim,
)
print("Model created.")

# Start training.
train = fine_tune(
    sequence_encoder,
    tokenizer,
    max_len,
    data_generator,
    epochs=epochs,
    checkpoint_file_path=new_checkpoint,
    load_checkpoint=True,
    old_checkpoint=old_checkpoint,
    learn_type=learn_type,
)

In [None]:
from keras_bert.tuning.tuning_data_generator import TuningDataGenerator
from keras_bert.model import create_model
from keras_bert.tokenizer import Tokenizer
from keras_bert.tuning.fine_tuning import load_pretrained_model

# Evaluation of the fine-tuned "dyk" model on the dev split.

# --- Architecture settings (same values as the pre-training cells) ---
encoder_num = 4
heads = 4
embedding_dim = 512
ff_dim = 512
max_len = 32

# --- Task settings ---
learn_type = "dyk"
text_file = "./data/klej_dyk/dev.tsv"
# Same path the "dyk" fine-tuning cell uses as its new checkpoint.
old_checkpoint = "./data/klej_dyk/checkpoint_dyk.ckpt"

print("Reading vocab.")
tokenizer = Tokenizer()
tokenizer.read_vocab('./data/counted_vocab.txt')
tokenizer.change_to_reversible()
print("Vocab of size:", tokenizer.vocab_size, "loaded.")

data_generator = TuningDataGenerator(
    text_file,
    max_len,
    tokenizer,
    batch_size=64,
    tuning_type=learn_type,
)
print("Data generator prepared.")

# NOTE(review): the positional order here is (heads, encoder_num), while the
# pre-training cells pass (encoder_num, heads). Both are 4, so the result is
# the same today — confirm the real signature.
sequence_encoder = create_model(
    tokenizer.vocab_size,
    max_len,
    embedding_dim,
    heads,
    encoder_num,
    ff_dim,
)
print("Model created.")

model = load_pretrained_model(sequence_encoder, old_checkpoint, learn_type)

# Evaluate. Left as the last expression so the notebook displays the result.
model.evaluate(x=data_generator)

