In [15]:
import pandas as pd

# Export the `text` column of the CSV (presumably a Thai Wikipedia dump,
# judging by the file name) as one newline-separated plain-text corpus.
df = pd.read_csv('src/thwiki.csv')

# Rows without text cannot be joined into the corpus, so drop them first.
df = df[df.text.notna()]
txt = '\n'.join(df.text.values)

# Write explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which may be unable to represent Thai text.
# The `with` block closes the file itself, so no explicit close() is needed.
with open('src/thwiki.txt', 'w', encoding='utf-8') as f:
    f.write(txt)

In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [16]:
%%time 
# Train a byte-level BPE tokenizer from scratch on the exported corpus.
# The special tokens are passed to the trainer so they are part of the
# resulting vocabulary.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=['src/thwiki.txt'],
    vocab_size=20_000,
    min_frequency=2,
    special_tokens=special_tokens,
)

CPU times: user 8min 1s, sys: 46.9 s, total: 8min 48s
Wall time: 5min 1s


In [32]:
# Persist the trained vocabulary and merge rules into the ThaiBert directory.
# Create the directory first so a fresh Restart & Run All does not depend on
# it already existing on disk.
# NOTE(review): newer `tokenizers` releases expose directory saving as
# `save_model(...)`; this call matches the version used to produce the
# recorded output — confirm before upgrading.
Path("ThaiBert").mkdir(parents=True, exist_ok=True)
tokenizer.save("ThaiBert")

['ThaiBert/vocab.json', 'ThaiBert/merges.txt']

In [33]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Rebuild the tokenizer from the vocabulary and merges files on disk, so the
# following cells work from the saved artefacts rather than the in-memory
# training result.
vocab_file = "./ThaiBert/vocab.json"
merges_file = "./ThaiBert/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

In [34]:
# Attach a post-processor built from the "</s>" and "<s>" tokens and their
# ids; the encoded sample in the next cell shows every sequence wrapped as
# <s> ... </s>.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [35]:
# Smoke test: encode a mixed Thai/English sentence and inspect the tokens.
# Non-ASCII tokens are displayed in the byte-level alphabet, so the Thai
# pieces are not human-readable here; the decode cell shows the round trip.
test_txt = "สวัสดีชาวโลก วันนี้เราจะมาเทรน model BERT กันโดยใช้ library Huggingface"
encoding = tokenizer.encode(test_txt)
encoding.tokens

['<s>',
 'à¸ªà¸§',
 'à¸±',
 'à¸ªà¸Ķ',
 'à¸µ',
 'à¸Ĭà¸²à¸§',
 'à¹Ĥà¸¥à¸ģ',
 'Ġà¸§',
 'à¸±',
 'à¸Ļà¸Ļ',
 'à¸µà¹ī',
 'à¹Ģà¸£à¸²à¸Īà¸°',
 'à¸¡à¸²',
 'à¹Ģà¸Ĺà¸£',
 'à¸Ļ',
 'Ġmod',
 'el',
 'ĠB',
 'ER',
 'T',
 'Ġà¸ģ',
 'à¸±',
 'à¸Ļà¹Ĥà¸Ķà¸¢à¹ĥà¸Ĭ',
 'à¹ī',
 'Ġli',
 'br',
 'ary',
 'ĠH',
 'ug',
 'g',
 'ing',
 'face',
 '</s>']

In [36]:
# Round-trip check: decoding the ids should give back the original sentence
# (wrapped in the <s> / </s> markers added by the post-processor).
token_ids = tokenizer.encode(test_txt).ids
tokenizer.decode(token_ids)

'<s>สวัสดีชาวโลก วันนี้เราจะมาเทรน model BERT กันโดยใช้ library Huggingface</s>'

In [37]:
import json
# Write the model and tokenizer configuration files next to the saved
# vocabulary so the ./ThaiBert directory can be passed as both
# --config_name and --tokenizer_name to the training script.
config = {
    "architectures": [
        "RobertaForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    # presumably 512 tokens plus the 2 offset positions RoBERTa reserves —
    # TODO confirm against the model implementation in use
    "max_position_embeddings": 514,
    "model_type": "roberta",
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
    # Take the size from the loaded tokenizer instead of a hard-coded number:
    # the tokenizer was trained with vocab_size=20_000, so the previous
    # literal 25_000 did not match the vocabulary actually produced.
    "vocab_size": tokenizer.get_vocab_size(),
}
with open("./ThaiBert/config.json", 'w') as fp:
    json.dump(config, fp)

# Matches the max_length=512 truncation enabled on the tokenizer.
tokenizer_config = {
    "max_len": 512
}
with open("./ThaiBert/tokenizer_config.json", 'w') as fp:
    json.dump(tokenizer_config, fp)

In [40]:
# Command line for the masked-language-model training script, built as a
# single string with no newlines. Each argument is one list entry so it is
# easy to tweak.
_cli_parts = [
    "python run_language_modeling.py",
    "--train_data_file ./src/thwiki.txt",
    "--output_dir ./ThaiBertModel",
    "--model_type roberta",
    "--mlm",
    "--config_name ./ThaiBert",
    "--tokenizer_name ./ThaiBert",
    "--do_train",
    "--learning_rate 1e-4",
    "--num_train_epochs 5",
    "--save_total_limit 2",
    "--save_steps 2000",
    "--per_gpu_train_batch_size 16",
]

# Five spaces between (and around) the parts reproduces the spacing of the
# flattened multi-line form: one space per newline plus a 4-space indent.
_gap = " " * 5
cmd = _gap.join(["", *_cli_parts, ""])