Fine tuning eduge dataset for news classification.

In [1]:
TEXT = "Монгол бахархлын өдөр буюу Их эзэн Чингис хааны мэндэлсэн өдөр өчигдөр тохиов Эрдэмтэд Чингис хааны мэндэлсэн өдрийг билгийн тооллын гуравдугаар жарны усан морин жил буюу 1162 оны өвлийн тэргүүн сарын шинийн 1-ний өдөр хэмээн тогтоосон байдаг"

MODEL            = 'model-32k'
MODEL_CHECKPOINT = 'model.ckpt-1000000'
MODEL_BUCKET     = 'gs://mongolian-bert-models/model-32k-1000000'

if False:
  # for non public GCloud bucket
  from google.colab import auth
  auth.authenticate_user()

import getpass
GITHUB_USERNAME = getpass.getpass(prompt='Хэн бээ   : ')
GITHUB_PASSWORD = getpass.getpass(prompt='Нууц үгээ : ')

Хэн бээ   : ··········
Нууц үгээ : ··········


In [0]:
import os
from os.path import exists, join, basename, splitext

is_on_colab = True
project_path = 'mongolian-bert'
try:
  import colab
except ModuleNotFoundError:
  is_on_colab = False
  project_path = '../../mongolian-bert'
  
import sys
sys.path.append(project_path)

In [0]:
if is_on_colab:
  # we are on Colab, clone our project
  if not exists(project_path):
    print("checking out")
    !git clone -q --recursive https://$GITHUB_USERNAME:$GITHUB_PASSWORD@github.com/tugstugi/mongolian-bert.git

In [4]:
!pip install -r ./mongolian-bert/requirements.txt



In [5]:
from tokenization_sentencepiece import FullTokenizer
model_path = join(project_path, MODEL)
tokenizer  = FullTokenizer(model_file=join(model_path, 'mn_cased.model'), vocab_file=join(model_path, 'mn_cased.vocab'), do_lower_case=False)

Loaded a trained SentencePiece model.


In [6]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.16.5.146:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 14370875309814402812),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 10299567020744913576),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 4024638110382937364),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 780150789284767718),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 10932476537314582073),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 11846432860433492589),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 5429083560763888891),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 6367874777643169032),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 12444070535845

In [7]:
BERT_PRETRAINED_DIR = MODEL_BUCKET
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
!gsutil ls $BERT_PRETRAINED_DIR

***** BERT pretrained directory: gs://mongolian-bert-models/model-32k-1000000 *****
gs://mongolian-bert-models/model-32k-1000000/checkpoint
gs://mongolian-bert-models/model-32k-1000000/eval_results.txt
gs://mongolian-bert-models/model-32k-1000000/events.out.tfevents.1553340038.mongolian-bert
gs://mongolian-bert-models/model-32k-1000000/events.out.tfevents.1553514967.mongolian-bert
gs://mongolian-bert-models/model-32k-1000000/graph.pbtxt
gs://mongolian-bert-models/model-32k-1000000/model.ckpt-1000000.data-00000-of-00001
gs://mongolian-bert-models/model-32k-1000000/model.ckpt-1000000.index
gs://mongolian-bert-models/model-32k-1000000/model.ckpt-1000000.meta
gs://mongolian-bert-models/model-32k-1000000/eval/


In [8]:
BUCKET = 'mongolian-bert-models'
TASK   = 'eduge'
OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, TASK)
print(OUTPUT_DIR)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

gs://mongolian-bert-models/eduge
***** Model output directory: gs://mongolian-bert-models/eduge *****


In [0]:
sys.path.append("./mongolian-bert/bert")
import modeling
import optimization
import run_classifier
import tokenization

In [0]:
VOCAB_FILE      = os.path.join("./mongolian-bert/{}".format(MODEL), 'mn_cased.vocab')
CONFIG_FILE     = os.path.join("./mongolian-bert/{}".format(MODEL), 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, MODEL_CHECKPOINT)
DO_LOWER_CASE   = False

In [11]:
import os
if not os.path.exists("eduge.csv.gz"):
  !wget https://github.com/tugstugi/mongolian-nlp/raw/master/datasets/eduge.csv.gz
  !gunzip eduge.csv.gz

--2019-03-26 17:40:32--  https://github.com/tugstugi/mongolian-nlp/raw/master/datasets/eduge.csv.gz
Resolving github.com (github.com)... 192.30.253.112, 192.30.253.113
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tugstugi/mongolian-nlp/master/datasets/eduge.csv.gz [following]
--2019-03-26 17:40:32--  https://raw.githubusercontent.com/tugstugi/mongolian-nlp/master/datasets/eduge.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73951267 (71M) [application/octet-stream]
Saving to: ‘eduge.csv.gz’


2019-03-26 17:40:33 (146 MB/s) - ‘eduge.csv.gz’ saved [73951267/73951267]

gzip: eduge.csv already exists; do you wish to overwrite (y or n)? y


In [12]:
import pandas as pd
df = pd.read_csv("eduge.csv")
df = df.rename(columns=lambda x: x.strip())
df.columns.values
labels = df['label'].unique().tolist()
labels


['урлаг соёл',
 'эдийн засаг',
 'эрүүл мэнд',
 'хууль',
 'улс төр',
 'спорт',
 'технологи',
 'боловсрол',
 'байгал орчин']

In [0]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)

In [0]:
DATA_COLUMN  = 'news'
LABEL_COLUMN = 'label'
label_list   = labels

In [0]:
train_InputExamples = train.apply(lambda x: run_classifier.InputExample(guid=None,
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label  = x[LABEL_COLUMN]), axis = 1)
test_InputExamples  = test.apply(lambda x: run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label  = x[LABEL_COLUMN]), axis = 1)

In [0]:
LEARNING_RATE          = 2e-5
NUM_TRAIN_EPOCHS       = 3.0
WARMUP_PROPORTION      = 0.1
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS     = 100
TRAIN_BATCH_SIZE       = 32
EVAL_BATCH_SIZE        = 8
PREDICT_BATCH_SIZE     = 4
ITERATIONS_PER_LOOP    = 1000
NUM_TPU_CORES          = 1
MAX_SEQ_LENGTH         = 128

In [17]:
train_features = run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features  = run_classifier.convert_examples_to_features(test_InputExamples , label_list, MAX_SEQ_LENGTH, tokenizer)

num_train_steps  = int(len(train_features) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

INFO:tensorflow:Writing example 0 of 60528
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] ▁ - Энэ ▁томилгоог ▁мөнгөний ▁үнэр ▁дагадаг аас ▁дэд ▁сайдуудын ▁суудлыг ▁үнэлэмж гүй , ▁үнэ ▁цэнэ гүй ▁болгож ▁орхисон ▁нь ▁харамсалтай - ▁Төрийн ▁албан ▁тушаал , ▁тэр ▁тусмаа ▁дэд ▁сайдуудын ▁томилгоо ▁ерөнхийдөө ▁хувь ▁хүний ▁мэдлэг ▁чадвар , ▁мэргэжил ▁болов с ­ рол , ▁мэргэшил , ▁туршлага ▁гэхээсээ ▁илүү ▁бялуу ▁хуваарилалт ▁хэлбэрээр ▁явагддаг ▁болоод ▁удаж ▁байна . ▁Нийгэм , ▁эдийн ▁засгийг ▁хямралаас ▁гаргах ▁үүрэг тэйгээр ▁парламентад ▁суудалтай ▁бүх ▁нам ▁хамтарч ▁байгуулсан ▁Ч . Сайханбилэгийн ▁танхим ▁ч ▁энэ ▁жишгээс ▁гаж сан гүй . ▁Шийдлийн ▁Засгийн ▁газар ▁15 ▁яам , ▁19 ▁сайд , ▁17 ▁дэд ▁сайдтай . ▁Сайд ууд ▁эзгүй д ▁гадаад , ▁дотоодын ▁арга ▁хэмжээнд ▁дэд үүд ▁нь ▁тэднийг ▁орлон о . ▁Тэдний ▁хийх ▁ажлын ▁чиг ▁үүр ­ гийг ▁хуульч лаад ▁өгчихсөн ▁нь ▁бий . ▁Тодруул ­ бал [SEP]
INFO:tensorflow:input_ids: 4 10 12 1632 14015 1327 3777 16508 69 295

In [18]:
print(CONFIG_FILE)

./mongolian-bert/model-32k/bert_config.json


In [19]:
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
run_config = tf.contrib.tpu.RunConfig(
    cluster   = tpu_cluster_resolver,
    model_dir = OUTPUT_DIR,
    save_checkpoints_steps = SAVE_CHECKPOINTS_STEPS,
    tpu_config = tf.contrib.tpu.TPUConfig(
        iterations_per_loop         = ITERATIONS_PER_LOOP,
        num_shards                  = NUM_TPU_CORES,
        per_host_input_for_training = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

model_fn = run_classifier.model_fn_builder(
    bert_config      = modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels       = len(label_list),
    init_checkpoint  = INIT_CHECKPOINT,
    learning_rate    = LEARNING_RATE,
    num_train_steps  = num_train_steps,
    num_warmup_steps = num_warmup_steps,
    use_tpu          = True,
    use_one_hot_embeddings = True)

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu            = True,
    model_fn           = model_fn,
    config             = run_config,
    train_batch_size   = TRAIN_BATCH_SIZE,
    eval_batch_size    = EVAL_BATCH_SIZE,
    predict_batch_size = PREDICT_BATCH_SIZE)

INFO:tensorflow:Using config: {'_model_dir': 'gs://mongolian-bert-models/eduge', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.16.5.146:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd6bae07f28>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.16.5.146:8470', '_evaluation_master': 'grpc://10.16.5.146:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=1, num_cores_per_repl

In [0]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = run_classifier.input_fn_builder(
    features       = train_features,
    seq_length     = MAX_SEQ_LENGTH,
    is_training    = True,
    drop_remainder = True)

In [21]:
# Listen to some trap music while it's training.
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/thQ0sCm5MVA" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>')

In [0]:
from datetime import datetime

should_train = False
if should_train:
  print("Beginning Training!")
  current_time = datetime.now()
  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  print("Training took time ", datetime.now() - current_time)

In [0]:
test_input_fn = run_classifier.input_fn_builder(
    features       = test_features,
    seq_length     = MAX_SEQ_LENGTH,
    is_training    = False,
    drop_remainder = True)

In [24]:
estimator.evaluate(input_fn = test_input_fn, steps=1)

Instructions for updating:
Colocations handled automatically by placer.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (8, 128)
INFO:tensorflow:  name = input_mask, shape = (8, 128)
INFO:tensorflow:  name = label_ids, shape = (8,)
INFO:tensorflow:  name = segment_ids, shape = (8, 128)
Instructions for updating:
Use keras.layers.dense instead.
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (32000, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/sel

{'eval_accuracy': 0.875,
 'eval_loss': 0.11637277,
 'global_step': 5674,
 'loss': 0.11637277}

In [0]:
def getPrediction(in_sentences):
  labels = ['урлаг соёл', 'эдийн засаг', 'эрүүл мэнд', 'хууль', 'улс төр', 'спорт', 'технологи', 'боловсрол', 'байгал орчин']
  input_examples   = [run_classifier.InputExample(guid=None, text_a = x, text_b = None, label = "урлаг соёл") for x in in_sentences]
  input_features   = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=True)
  predictions      = estimator.predict(predict_input_fn)
  return predictions

In [0]:
pred_sentences = [
  "Монгол улсын уул уурхайн гарц энэ жилээс эхлэн нэмэгдэж дэлхийн эдийн засаг сэргэсний азаар эдийн засагт таатай сайхан нөхцөл байдал үүсээд байна.",
  "Хиймэл оюун болоод дуу хоолой таних систем ухаалаг туслагчийн үр нөлөөгөөр ухаалаг утас хэрэглэгчдийн тоо эрс нэмэгджээ.",
  "Гурвыг харьцах гурав заалны сагсан бөмбөгийн спортод манай тамирчид одтой байна.",
  "Ханиад томуунаас урьдчилан сэргийлэхийн тулд биеэ чийрэгжүүлэх шаардлагатай."    
]

In [0]:
import numpy as np

In [46]:
predictions = getPrediction(pred_sentences)
for (sent, pred) in zip(pred_sentences, predictions):
  print("==============SENTENCE===============")
  print(sent)
  probabilities = pred["probabilities"]
  softmax       = np.exp(probabilities)/sum(np.exp(probabilities))
  label_index   = np.argmax(softmax)
  #print(probabilities)
  #print(softmax)
  #print(label_index)
  print(label_list[label_index])

INFO:tensorflow:Writing example 0 of 4
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] ▁Монгол ▁улсын ▁уул ▁уурхайн ▁гарц ▁энэ ▁жилээс ▁эхлэн ▁нэмэгдэж ▁дэлхийн ▁эдийн ▁засаг ▁сэргэ сний ▁азаар ▁эдийн ▁засагт ▁таатай ▁сайхан ▁нөхцөл ▁байдал ▁үүсээд ▁байна . [SEP]
INFO:tensorflow:input_ids: 4 29 44 580 682 1911 22 2643 831 1219 272 88 335 11703 2339 21142 88 1550 1300 278 368 205 4346 11 7 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 