In [1]:
# Check which TensorFlow packages (and versions) are installed
!pip3 list | grep tensorflow

tensorflow                         1.15.2
tensorflow-estimator               1.15.1
tensorflow-gpu                     1.15.2
tensorflow-serving-api             1.15.0


In [2]:
# Check the GPUs with nvidia-smi
!nvidia-smi

Fri Oct 29 23:21:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  On   | 00000000:00:06.0 Off |                    0 |
| N/A   31C    P0    27W / 250W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:00:07.0 Off |                    0 |
| N/A   29C    P0    26W / 250W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [3]:
# Install tokenizers and clone google-research/bert
!pip install -q tokenizers
# Skip the clone when ./bert already exists so the cell can be re-run
# (git clone refuses to write into an existing, non-empty directory).
!test -d bert || git clone -q https://github.com/google-research/bert

# Tensorflow GPU 확인

In [None]:
import tensorflow as tf
from tensorflow.python.client import device_lib

# Show every device TensorFlow can see, then report whether a GPU is usable.
local_devices = device_lib.list_local_devices()
print(local_devices)

gpu_available = tf.test.is_gpu_available()
print('GPU 사용여부: ', gpu_available)

# 토크나이저 학습

- 전체 데이터 사용
- vocab_size = 30000

In [5]:
from tokenizers import BertWordPieceTokenizer

In [6]:
# Train a cased (lowercase=False) WordPiece tokenizer on ./data/dataset.txt
# and save the trained model files into ./data.
tokenizer = BertWordPieceTokenizer(lowercase=False)
tokenizer.train(
    ['./data/dataset.txt'],
    vocab_size=30000,
    limit_alphabet=3000,
    show_progress=True,
)
tokenizer.save_model('./data')

# 사전학습 데이터 생성하기

In [1]:
# -p: do not fail when the directory already exists, so the cell can be re-run
!mkdir -p ./shards

In [2]:
# Split the training dataset into shards
# -a 3       : use 3-character suffixes on the output file names
# -l 4800000 : put 4,800,000 lines in each output file
# -d         : use numeric suffixes instead of alphabetic ones
# The two trailing arguments are the input file and the output file-name prefix.
!split -a 3 -l 4800000 -d ./data/dataset.txt ./shards/shard_

In [None]:
# -p: do not fail when the directory already exists, so the cell can be re-run
!mkdir -p ./pretraining_data

In [None]:
# Convert every shard into a TFRecord file, running up to 4 conversions in parallel.
# -I already runs one command per input line; -n 1 conflicts with it
# (GNU xargs ignores the earlier option and may print a warning), so it is omitted.
!ls ./shards/ | xargs -P 4 -I{} python bert/create_pretraining_data.py --input_file=./shards/{} --output_file=./pretraining_data/{}.tfrecord --vocab_file=./data/vocab.txt --do_lower_case=False --max_seq_length=128 --max_predictions_per_seq=20 --masked_lm_prob=0.15 --random_seed=12345 --dupe_factor=5

# BERT MODEL CONFIG 파일 생성

In [3]:
import json

In [6]:
# Directory that receives the BERT config file and the pretraining output.
MODEL_DIR = "bert_model"
# Unlike MkDir, MakeDirs succeeds when the directory already exists (and also
# creates missing parents), so this cell can be re-run safely.
tf.gfile.MakeDirs(MODEL_DIR)

In [7]:
# Model hyperparameters, written to <MODEL_DIR>/config.json; the pretraining
# command reads that file through --bert_config_file.
# vocab_size matches the vocab_size the tokenizer was trained with (30000).
bert_base_config = dict(
    attention_probs_dropout_prob=0.1,
    directionality="bidi",
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=12,
    pooler_fc_size=768,
    pooler_num_attention_heads=12,
    pooler_num_fc_layers=3,
    pooler_size_per_head=128,
    pooler_type="first_token_transform",
    type_vocab_size=2,
    vocab_size=30000,
    model_type="bert-base",
    architectures=["BertForMaskedLM"],
)

# Serialize the configuration as indented JSON.
with open(MODEL_DIR + "/config.json", "w") as config_file:
    json.dump(bert_base_config, config_file, indent=2)

# BERT 사전학습

In [None]:
# Trial run: pretrain for 10,000 steps on GPU 0.
# The --input_file glob is quoted so the shell passes the pattern through
# untouched (some shells abort on an unmatched glob); run_pretraining.py is
# expected to expand the pattern itself.
# NOTE(review): the input path points at ../pretrain_test/src/pretraining_data,
# not the ./pretraining_data directory this notebook writes its TFRecords to —
# confirm this is intended.

!CUDA_VISIBLE_DEVICES=0 python bert/run_pretraining.py --input_file='../pretrain_test/src/pretraining_data/*.tfrecord' --output_dir=./bert_model --do_train=True --do_eval=True --bert_config_file=./bert_model/config.json --train_batch_size=32 --max_seq_length=128 --max_predictions_per_seq=20 --num_train_steps=10000 --num_warmup_steps=10 --save_checkpoints_steps=1000 --learning_rate=2e-5