In [1]:
from pathlib import Path

# Project root: two directory levels above the current working directory.
ROOT_PATH = Path.cwd().parent.parent

# Dataset files, written relative to ROOT_PATH (the shell cells below join
# them as ROOT_PATH/<path>). Each pair is (train, test).
RAW_TRAIN_DATA_PATH, RAW_TEST_DATA_PATH = (
    "dataset/raw_data/train.txt",
    "dataset/raw_data/test.txt",
)
TRAIN_DATA_PATH, TEST_DATA_PATH = (
    "dataset/ner_data/train.data",
    "dataset/ner_data/test.data",
)
TRAIN_GRAINED_DATA_PATH, TEST_GRAINED_DATA_PATH = (
    "dataset/ner_data/train_grained.data",
    "dataset/ner_data/test_grained.data",
)

# Names of the available models; MODEL[MODEL_SELECT] picks the active one.
MODEL = ["CRF", "BILSTM_CRF", "BERT_CRF", "BERT_BILSTM_CRF"]

# Index into MODEL (3 -> "BERT_BILSTM_CRF").
MODEL_SELECT = 3

# Set the PYTHONPATH environment variable to the project root (ROOT_PATH).
%set_env PYTHONPATH={ROOT_PATH}

env: PYTHONPATH=d:\PROJECTS\Python\Deidentification-of-medical-data


In [2]:
# Run data_generator.py in "split" output mode, passing the *_grained.data
# locations as the train/test data paths.
# NOTE(review): the --RAW_*_DATA_PATH flags are given the ner_data/*.data
# paths (TRAIN_DATA_PATH / TEST_DATA_PATH), not the RAW_* constants; confirm
# this is the intended input for the split step.

!python data_generator.py \
    --RAW_TRAIN_DATA_PATH={ROOT_PATH}/{TRAIN_DATA_PATH} \
    --RAW_TEST_DATA_PATH={ROOT_PATH}/{TEST_DATA_PATH} \
    --TRAIN_DATA_PATH={ROOT_PATH}/{TRAIN_GRAINED_DATA_PATH} \
    --TEST_DATA_PATH={ROOT_PATH}/{TEST_GRAINED_DATA_PATH} \
    --OUTPUT_TYPE=split


Split train data generated.
Split test data generated.


In [3]:
# Run data_preprocessor.py on the grained train/test files and the raw test
# file; the model data directory is model/<selected model name>/data/ under
# the project root.

!python data_preprocessor.py \
    --TRAIN_DATA_PATH={ROOT_PATH}/{TRAIN_GRAINED_DATA_PATH} \
    --TEST_DATA_PATH={ROOT_PATH}/{TEST_GRAINED_DATA_PATH} \
    --RAW_TEST_DATA_PATH={ROOT_PATH}/{RAW_TEST_DATA_PATH} \
    --MODEL_DATA_PATH={ROOT_PATH}/model/{MODEL[MODEL_SELECT]}/data/


Train text data & labels array saved
Test text data and mapping array saved


In [4]:
# Tokenize and train the selected model with ner_trainer.py, using the
# dataset pickled by data_preprocessor. Data and checkpoint directories live
# under model/<selected model name>/ in the project root.
# NOTE(review): the flag is spelled HIIDEN_NUMS (double "I"); it is kept
# exactly as-is here. Confirm it matches the flag name that ner_trainer.py
# actually defines.

!python ner_trainer.py \
    --MODEL={MODEL[MODEL_SELECT]} \
    --TRAIN_DATA_PATH={ROOT_PATH}/{TRAIN_GRAINED_DATA_PATH} \
    --MODEL_DATA_PATH={ROOT_PATH}/model/{MODEL[MODEL_SELECT]}/data/ \
    --CHECKPOINT_PATH={ROOT_PATH}/model/{MODEL[MODEL_SELECT]}/checkpoint/ \
    --CHECKPOINT_KEEP=3 \
    --SENTENCE_MAX_LENGTH=32 \
    --BATCH_SIZE=64 \
    --EMBEDDING_SIZE=300 \
    --HIIDEN_NUMS=512 \
    --EPOCHS=1 \
    --LEARNING_RATE=1e-3


2020-12-28 16:13:15.864527: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2020-12-28 16:13:26.910097: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2020-12-28 16:13:26.917688: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library nvcuda.dll
2020-12-28 16:13:26.988559: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:06:00.0 name: GeForce RTX 2060 computeCapability: 7.5
coreClock: 1.68GHz coreCount: 30 deviceMemorySize: 6.00GiB deviceMemoryBandwidth: 312.97GiB/s
2020-12-28 16:13:26.989275: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2020-12-28 16:13:27.626970: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2020-12-28

In [None]:
# Run the prediction process and export the results, using the model generated from the training checkpoints

!python ner_predictor.py \
            --
